In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

In [2]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Change sub-product of "I do not know" to null
df.loc[df['SUB_PRODUCT'] == 'I do not know', 'SUB_PRODUCT'] = np.nan

In [4]:
# Remove product IDs for which there have been no recent complaints
df['date_norm'] = df['DATE'].apply(lambda x: int(x[6:10] + x[0:2] + x[3:5]))
df = df[df['date_norm'] > 20180000]

In [5]:
# Remove complaints which have null values in either main or sub products
df = df[(df['MAIN_PRODUCT'].notnull()) & (df['SUB_PRODUCT'].notnull())]

In [6]:
# Remove product IDs which have fewer than 1000 complaints
df['COMPLAINT_COUNTS'] = df.groupby('SUB_PRODUCT')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 1000]

In [7]:
print(len(df), 'total rows')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['MAIN_PRODUCT'].unique()), 'unique main-products')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['SUB_PRODUCT'].unique()), 'unique sub-products')

118302 total rows
9 unique main-products
17 unique sub-products


In [10]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
#     print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
#     print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print('   ', len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
    54735 Credit reporting
    1030 Other personal consumer report
    1 Conventional home mortgage

Debt collection
    4964 Medical debt
    8401 Other debt
    6799 Credit card debt
    1123 Payday loan debt
    1095 Auto debt

Student loan
    3956 Federal student loan servicing
    2197 Private student loan

Credit card or prepaid card
    2403 Store credit card
    10359 General-purpose credit card or charge card

Mortgage
    6947 Conventional home mortgage
    2288 FHA mortgage

Checking or savings account
    6745 Checking account

Money transfer, virtual currency, or money service
    1129 Domestic (US) money transfer

Vehicle loan or lease
    3024 Loan

Payday loan, title loan, or personal loan
    1106 Installment loan



In [11]:
# Remove that possibly mis-classified "Convential home mortgage" in "Credit reporting..." main-product
df = (df[((df['MAIN_PRODUCT'] != 'Credit reporting, credit repair services, or other personal consumer reports') |
     (df['SUB_PRODUCT'] != 'Conventional home mortgage'))])

In [12]:
len(df)

118301

In [13]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
#     print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
#     print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print('   ', len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
    54735 Credit reporting
    1030 Other personal consumer report

Debt collection
    4964 Medical debt
    8401 Other debt
    6799 Credit card debt
    1123 Payday loan debt
    1095 Auto debt

Student loan
    3956 Federal student loan servicing
    2197 Private student loan

Credit card or prepaid card
    2403 Store credit card
    10359 General-purpose credit card or charge card

Mortgage
    6947 Conventional home mortgage
    2288 FHA mortgage

Checking or savings account
    6745 Checking account

Money transfer, virtual currency, or money service
    1129 Domestic (US) money transfer

Vehicle loan or lease
    3024 Loan

Payday loan, title loan, or personal loan
    1106 Installment loan



In [14]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [15]:
len(train)

106470

In [16]:
len(test)

11831

In [17]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
110606,Creditor : XXXX XXXX Credit Bureau : Equifax ...,"Credit reporting, credit repair services, or o...",Credit reporting
118824,I received a SBA sponsored loan from Regions B...,"Payday loan, title loan, or personal loan",Installment loan
15974,Inquiry XXXX I received a promotional offer to...,Checking or savings account,Checking account
10573,When creditors pull my credit from Equifax ( o...,"Credit reporting, credit repair services, or o...",Credit reporting
54104,HSBC advertised that I would receive {$150.00}...,Credit card or prepaid card,General-purpose credit card or charge card


In [18]:
train_text = train['COMPLAINT_TEXT'].tolist()

In [None]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [None]:
train_text = [doc.lower().replace(r'\n', '',) for doc in train_text]

In [None]:
train_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in train_text]

In [None]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [None]:
train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [None]:
number_of_topics = 30

lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=number_of_topics,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [None]:
lda_train.print_topics(number_of_topics, num_words=15)[:10]

In [None]:
tfidf_text_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             lowercase=True,
                             norm='l2',
                             max_df=.9,
                             min_df=.1)
tfidf_text = tfidf_text_vectorizer.fit_transform(train['COMPLAINT_TEXT'])
text_cols = tfidf_text_vectorizer.get_feature_names()
tfidf_text = pd.DataFrame(tfidf_text.todense(),
                          columns=[text_cols])

In [None]:
top_topics = [[entry[1] for entry in lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)] for i in range(len(train_text))]
train_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

In [None]:
train_df = pd.concat([train.reset_index(drop=True), train_vecs, tfidf_text], axis=1)

In [None]:
train_df.head()

## Send test data through final main-product model

In [29]:
# Apply SMOTE to all classes to have the same number of data points
smt = SMOTE()
X_train, y_train = smt.fit_sample(train_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1), train_df['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

random_forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [30]:
test_text = test['COMPLAINT_TEXT'].tolist()
test_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in test_text]
test_text = [doc.lower().replace(r'\n', '',) for doc in test_text]
test_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in test_text]

In [31]:
tfidf_text_test = tfidf_text_vectorizer.transform(test['COMPLAINT_TEXT'])
tfidf_text_test = pd.DataFrame(tfidf_text_test.todense(),
                               columns=[text_cols])

In [32]:
def get_bigram(texts):
    """
    For the test data we only need the bigram data built on train data,
    as we'll use the train id2word mappings. This is a requirement due to 
    the shapes Gensim functions expect in the test-vector transformation below.
    With both these in hand, we can make the test corpus.
    """
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_text)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

top_topics = [[entry[1] for entry in lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)] for i in range(len(test_text))]
test_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

test_df = pd.concat([test.reset_index(drop=True), test_vecs, tfidf_text_test], axis=1)

In [33]:
X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['MAIN_PRODUCT'])

X_test_scale = scaler.transform(X_test)

In [34]:
y_pred = random_forest.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.671 +/- 0.251


In [35]:
feat = random_forest.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])

Topic 0 0.06349135381860481
Topic 1 0.04446204479209733
Topic 2 0.0038677444933119796
Topic 3 0.004084370873210003
Topic 4 0.041428115121806414
Topic 5 0.005960204363099697
Topic 6 0.02993647180911589
Topic 7 0.007412270575778321
Topic 8 0.006871983311316441
Topic 9 0.024182480601425225
Topic 10 0.005565830695858488
Topic 11 0.008672906119939986
Topic 12 0.004561762223484695
Topic 13 0.007434285442441586
Topic 14 0.01095607964899817
Topic 15 0.003906914921145605
Topic 16 0.031368869376899526
Topic 17 0.013385961761272897
Topic 18 0.0077889823786315366
Topic 19 0.0037384395929705447


## Send test data through final sub-product model

In [71]:
X_train = train_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1)
y_train = train_df['SUB_PRODUCT']

main_product = train_df['MAIN_PRODUCT']
labels = LabelEncoder()
z = labels.fit_transform(main_product)

X_train = np.concatenate((X_train, z.reshape(len(z), 1)), axis=1)

# Apply SMOTE to all classes to have the same number of data points
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)

# Scale Data
scaler_2 = StandardScaler()
X_train_scale = scaler_2.fit_transform(X_train)

random_forest_2 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)



X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['SUB_PRODUCT'])

main_product = random_forest.predict(X_test)
z = labels.transform(main_product)
X_test = np.concatenate((X_test, z.reshape(len(z), 1)), axis=1)

X_test_scale = scaler_2.transform(X_test)

y_pred = random_forest_2.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.256 +/- 0.207


  'precision', 'predicted', average, warn_for)


In [72]:
feat = random_forest_2.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])
print('Main-product prediction', feat[-1])

Topic 0 0.033247422669671005
Topic 1 0.024860140095903375
Topic 2 0.004242565963317391
Topic 3 0.004526497225359549
Topic 4 0.022885324016831653
Topic 5 0.0059284363197458214
Topic 6 0.018833586285517442
Topic 7 0.006290588015848359
Topic 8 0.005976243270436246
Topic 9 0.018765337590924625
Topic 10 0.006907409470613672
Topic 11 0.009211821451109502
Topic 12 0.004662545837562532
Topic 13 0.009516267064659007
Topic 14 0.006190658979237472
Topic 15 0.005371819137947893
Topic 16 0.018730482340298442
Topic 17 0.010487395600459182
Topic 18 0.007662127998519065
Topic 19 0.003950319143307471
Main-product prediction 0.13067493181555678


In [73]:
from collections import defaultdict

In [74]:
d1 = defaultdict(int)
for prod in test['MAIN_PRODUCT']:
    d1[prod] += 1
d1

defaultdict(int,
            {'Credit reporting, credit repair services, or other personal consumer reports': 5557,
             'Debt collection': 2249,
             'Mortgage': 944,
             'Credit card or prepaid card': 1311,
             'Student loan': 608,
             'Payday loan, title loan, or personal loan': 94,
             'Vehicle loan or lease': 315,
             'Checking or savings account': 643,
             'Money transfer, virtual currency, or money service': 110})

In [75]:
d2 = defaultdict(int)
for pred in random_forest.predict(np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))):
    d2[pred] += 1
d2

defaultdict(int,
            {'Vehicle loan or lease': 10272,
             'Debt collection': 758,
             'Credit card or prepaid card': 549,
             'Student loan': 149,
             'Mortgage': 103})

In [76]:
d3 = defaultdict(int)
for prod in test['SUB_PRODUCT']:
    d3[prod] += 1
d3

defaultdict(int,
            {'Credit reporting': 5458,
             'Credit card debt': 648,
             'FHA mortgage': 222,
             'General-purpose credit card or charge card': 1069,
             'Federal student loan servicing': 384,
             'Medical debt': 522,
             'Conventional home mortgage': 722,
             'Other debt': 861,
             'Installment loan': 94,
             'Loan': 315,
             'Checking account': 643,
             'Private student loan': 224,
             'Domestic (US) money transfer': 110,
             'Other personal consumer report': 99,
             'Payday loan debt': 114,
             'Store credit card': 242,
             'Auto debt': 104})

In [77]:
d4 = defaultdict(int)
for pred in random_forest_2.predict(X_test):
    d4[pred] += 1
d4

defaultdict(int,
            {'Loan': 11273,
             'Credit card debt': 549,
             'Private student loan': 9})