In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

In [2]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Change sub-product of "I do not know" to null
df.loc[df['SUB_PRODUCT'] == 'I do not know', 'SUB_PRODUCT'] = np.nan

In [4]:
# Remove product IDs for which there have been no recent complaints
df['date_norm'] = df['DATE'].apply(lambda x: int(x[6:10] + x[0:2] + x[3:5]))
df = df[df['date_norm'] > 20180000]

In [5]:
# Remove complaints which have null values in either main or sub products
df = df[(df['MAIN_PRODUCT'].notnull()) & (df['SUB_PRODUCT'].notnull())]

In [6]:
# Remove product IDs which have fewer than 1000 complaints
df['COMPLAINT_COUNTS'] = df.groupby('SUB_PRODUCT')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 1000]

In [7]:
print(len(df), 'total rows')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['MAIN_PRODUCT'].unique()), 'unique main-products')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['SUB_PRODUCT'].unique()), 'unique sub-products')

118302 total rows
9 unique main-products
17 unique sub-products


In [8]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
    print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
    print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print(len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
3
['Credit reporting' 'Other personal consumer report'
 'Conventional home mortgage']
54735 Credit reporting
1030 Other personal consumer report
1 Conventional home mortgage

Debt collection
5
['Medical debt' 'Other debt' 'Credit card debt' 'Payday loan debt'
 'Auto debt']
4964 Medical debt
8401 Other debt
6799 Credit card debt
1123 Payday loan debt
1095 Auto debt

Student loan
2
['Federal student loan servicing' 'Private student loan']
3956 Federal student loan servicing
2197 Private student loan

Credit card or prepaid card
2
['Store credit card' 'General-purpose credit card or charge card']
2403 Store credit card
10359 General-purpose credit card or charge card

Mortgage
2
['Conventional home mortgage' 'FHA mortgage']
6947 Conventional home mortgage
2288 FHA mortgage

Checking or savings account
1
['Checking account']
6745 Checking account

Money transfer, virtual currency, or money service
1
['Domestic (US

In [9]:
# Remove that possibly mis-classified "Convential home mortgage" in "Credit reporting..." main-product
df = (df[((df['MAIN_PRODUCT'] != 'Credit reporting, credit repair services, or other personal consumer reports') |
     (df['SUB_PRODUCT'] != 'Conventional home mortgage'))])

In [162]:
def rebalance(df, column, sample_size):
    new_df = pd.DataFrame(columns=df.columns)
    for group in df[column].unique():
        sample = sample_size
        data = len(df[df[column] == group])
        if data <= sample:
            new_df = new_df.append(df[df[column] == group])
            sample -= data
        new_df = new_df.append(resample(df[df[column] == group], n_samples=sample))
    return new_df.sample(frac=1).reset_index(drop=True)

In [163]:
len(df)

201656

In [164]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
    print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
    print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print(len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
1
['Credit reporting']
89994 Credit reporting

Student loan
1
['Federal student loan servicing']
12138 Federal student loan servicing

Debt collection
3
['Other debt' 'Credit card debt' 'Other (i.e. phone, health club, etc.)']
13385 Other debt
10021 Credit card debt
12393 Other (i.e. phone, health club, etc.)

Mortgage
2
['Conventional home mortgage' 'Conventional fixed mortgage']
12098 Conventional home mortgage
14562 Conventional fixed mortgage

Checking or savings account
1
['Checking account']
10709 Checking account

Credit card or prepaid card
1
['General-purpose credit card or charge card']
16243 General-purpose credit card or charge card

Bank account or service
1
['Checking account']
10113 Checking account



In [10]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [11]:
len(train)

106470

In [12]:
len(test)

11831

In [13]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
98771,Because my husband passed away onXX/XX/XXXX an...,Credit card or prepaid card,General-purpose credit card or charge card
94914,XXXX XXXX XXXX XXXX stated that I owe Student ...,"Credit reporting, credit repair services, or o...",Credit reporting
15523,I applied online and I was approved for {$970....,"Payday loan, title loan, or personal loan",Installment loan
130459,There are many mistakes appear in my report wi...,"Credit reporting, credit repair services, or o...",Credit reporting
27717,"Back in XXXX XXXX, I was contacted by a Law Fi...",Debt collection,Credit card debt


In [169]:
train_text = train['COMPLAINT_TEXT'].tolist()

In [170]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [171]:
train_text = [doc.lower().replace(r'\n', '',) for doc in train_text]

In [172]:
train_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in train_text]

In [173]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [174]:
train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [175]:
number_of_topics = 20

lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=number_of_topics,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [176]:
lda_train.print_topics(number_of_topics, num_words=15)[:10]

[(0,
  '0.043*"experian" + 0.036*"verify" + 0.026*"dispute" + 0.025*"delete" + 0.021*"item" + 0.020*"fcra" + 0.020*"please" + 0.019*"provide" + 0.019*"information" + 0.016*"remove" + 0.015*"request" + 0.015*"section" + 0.014*"file" + 0.014*"law" + 0.013*"reporting_act"'),
 (1,
  '0.390*"bank" + 0.222*"chase" + 0.086*"us" + 0.010*"usa" + 0.010*"customer" + 0.007*"freedom" + 0.006*"jp_morgan" + 0.006*"enhanced_recovery" + 0.005*"banks" + 0.005*"banking" + 0.005*"united" + 0.005*"banker" + 0.004*"department" + 0.004*"claim" + 0.004*"branch"'),
 (2,
  '0.153*"payment" + 0.048*"late" + 0.042*"pay" + 0.038*"make" + 0.027*"month" + 0.018*"amount" + 0.018*"due" + 0.017*"day" + 0.016*"balance" + 0.015*"fee" + 0.013*"statement" + 0.013*"time" + 0.012*"monthly" + 0.011*"interest" + 0.011*"show"'),
 (3,
  '0.089*"letter" + 0.081*"send" + 0.062*"receive" + 0.040*"request" + 0.026*"state" + 0.024*"mail" + 0.021*"copy" + 0.021*"document" + 0.017*"response" + 0.016*"provide" + 0.015*"information" + 0.

In [177]:
tfidf_text_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             lowercase=True,
                             norm='l2',
                             max_df=.9,
                             min_df=.1)
tfidf_text = tfidf_text_vectorizer.fit_transform(train['COMPLAINT_TEXT'])
text_cols = tfidf_text_vectorizer.get_feature_names()
tfidf_text = pd.DataFrame(tfidf_text.todense(),
                          columns=[text_cols])

In [178]:
top_topics = [[entry[1] for entry in lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)] for i in range(len(train_text))]
train_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

In [179]:
train_df = pd.concat([train.reset_index(drop=True), train_vecs, tfidf_text], axis=1)

In [180]:
train_df.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,"(xx xx,)","(xx xxxx,)","(xxxx,)","(xxxx and,)","(xxxx the,)","(xxxx to,)","(xxxx xxxx,)","(years,)","(you,)","(your,)"
0,It 's now been more than six months since we a...,Credit card or prepaid card,General-purpose credit card or charge card,0.001191,0.001191,0.237565,0.001191,0.092457,0.001191,0.120338,...,0.0,0.0,0.03715,0.0,0.0,0.09743,0.0,0.0,0.0,0.0
1,P Morgan Chase allow criminals to steal {$2400...,Checking or savings account,Checking account,0.000162,0.058841,0.000162,0.181336,0.022928,0.000162,0.176606,...,0.236106,0.279961,0.407657,0.065442,0.051346,0.032397,0.166251,0.0,0.0,0.0
2,"Hi, I made a mortgage payment twice accidently...",Mortgage,Conventional home mortgage,0.001042,0.001042,0.106817,0.001042,0.001042,0.001042,0.414505,...,0.171865,0.203788,0.137879,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,I faxed a letter to XXXX last XXXX of XXXX to...,"Credit reporting, credit repair services, or o...",Credit reporting,0.212922,0.000944,0.000944,0.427657,0.031909,0.159266,0.000944,...,0.094112,0.055796,0.481319,0.049996,0.0,0.074253,0.326604,0.0,0.0,0.0
4,XX/XX/2017 REPORTING AN INQUIRY ON MY REPORT,"Credit reporting, credit repair services, or o...",Credit reporting,0.025,0.025,0.025,0.025,0.025,0.025,0.025,...,0.307839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [181]:
# Bootstrap / downsample all classes to have the same number of data points
rebalanced = rebalance(X_train, 'MAIN_PRODUCT', 10000)

## Send test data through final main-product model

In [182]:
X_train = np.array(rebalanced.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_train = np.array(rebalanced['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

random_forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [183]:
test_text = test['COMPLAINT_TEXT'].tolist()
test_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in test_text]
test_text = [doc.lower().replace(r'\n', '',) for doc in test_text]
test_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in test_text]

In [184]:
tfidf_text_test = tfidf_text_vectorizer.transform(test['COMPLAINT_TEXT'])
tfidf_text_test = pd.DataFrame(tfidf_text_test.todense(),
                               columns=[text_cols])

In [185]:
def get_bigram(texts):
    """
    For the test data we only need the bigram data built on train data,
    as we'll use the train id2word mappings. This is a requirement due to 
    the shapes Gensim functions expect in the test-vector transformation below.
    With both these in hand, we can make the test corpus.
    """
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_text)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

top_topics = [[entry[1] for entry in lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)] for i in range(len(test_text))]
test_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

test_df = pd.concat([test.reset_index(drop=True), test_vecs, tfidf_text_test], axis=1)

In [186]:
X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['MAIN_PRODUCT'])

X_test_scale = scaler.transform(X_test)

In [187]:
y_pred = random_forest.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.752 +/- 0.110


In [188]:
feat = random_forest.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])

Topic 0 0.012464334495036167
Topic 1 0.006985890733496471
Topic 2 0.011738427690271022
Topic 3 0.00513387219436277
Topic 4 0.008193774258503176
Topic 5 0.020707538649739815
Topic 6 0.05052860388473234
Topic 7 0.0051188778882185855
Topic 8 0.022637753824300475
Topic 9 0.031071676819971375
Topic 10 0.010547749866741462
Topic 11 0.006217632441264317
Topic 12 0.0051393136654502925
Topic 13 0.012529654077762707
Topic 14 0.0664739293002951
Topic 15 0.007197774151005829
Topic 16 0.021878988142118375
Topic 17 0.06868823748886127
Topic 18 0.006968779816198093
Topic 19 0.005997506049956088


## Send test data through final sub-product model

In [189]:
# Bootstrap / downsample all classes to have the same number of data points
rebalanced = rebalance(train_df, 'SUB_PRODUCT', 10000)

X_train = np.array(rebalanced.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_train = np.array(rebalanced['SUB_PRODUCT'])

main_product = rebalanced['MAIN_PRODUCT']
labels = LabelEncoder()
z = labels.fit_transform(main_product)
X_train = np.concatenate((X_train, z.reshape(len(z), 1)), axis=1)

# Scale Data
scaler_2 = StandardScaler()
X_train_scale = scaler_2.fit_transform(X_train)

random_forest_2 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)



X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['SUB_PRODUCT'])

main_product = random_forest.predict(X_test)
z = labels.transform(main_product)
X_test = np.concatenate((X_test, z.reshape(len(z), 1)), axis=1)

X_test_scale = scaler_2.transform(X_test)

y_pred = random_forest_2.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.501 +/- 0.202


In [190]:
feat = random_forest_2.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])
print('Main-product prediction', feat[-1])

Topic 0 0.009675593424632185
Topic 1 0.00610634626388659
Topic 2 0.008771740969758296
Topic 3 0.005612843386573118
Topic 4 0.006966416104105308
Topic 5 0.014392631912279822
Topic 6 0.030159035435433634
Topic 7 0.0053248368113883275
Topic 8 0.01971917907964939
Topic 9 0.024829578612542254
Topic 10 0.008806374327569817
Topic 11 0.006461892379303025
Topic 12 0.005420422841277358
Topic 13 0.010914258504141475
Topic 14 0.03872405578001432
Topic 15 0.007230260210116026
Topic 16 0.021593228522847084
Topic 17 0.04720126190228999
Topic 18 0.00624641394554231
Topic 19 0.007685876150186542
Main-product prediction 0.16799294880169588


In [191]:
from collections import defaultdict

In [192]:
d1 = defaultdict(int)
for prod in test['MAIN_PRODUCT']:
    d1[prod] += 1
d1

defaultdict(int,
            {'Credit reporting, credit repair services, or other personal consumer reports': 8988,
             'Credit card or prepaid card': 1658,
             'Bank account or service': 998,
             'Debt collection': 3588,
             'Checking or savings account': 1100,
             'Mortgage': 2614,
             'Student loan': 1220})

In [196]:
d2 = defaultdict(int)
for pred in random_forest.predict(np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))):
    d2[pred] += 1
d2

defaultdict(int,
            {'Credit card or prepaid card': 15448,
             'Credit reporting, credit repair services, or other personal consumer reports': 750,
             'Checking or savings account': 1988,
             'Debt collection': 339,
             'Mortgage': 1597,
             'Student loan': 44})

In [None]:
d3 = defaultdict(int)
for prod in test['SUB_PRODUCT']:
    d3[prod] += 1
d3

In [None]:
d4 = defaultdict(int)
for pred in random_forest_2.predict(X_test):
    d4[pred] += 1
d4