In [170]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [171]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

  interactivity=interactivity, compiler=compiler, result=result)


In [172]:
# Change sub-product of "I do not know" to null
df.loc[df['SUB_PRODUCT'] == 'I do not know', 'SUB_PRODUCT'] = np.nan

In [173]:
# # Remove product IDs for which there have been no recent complaints
# df['date_norm'] = df['DATE'].apply(lambda x: int(x[6:10] + x[0:2] + x[3:5]))
# df = df[df['date_norm'] > 20180000]

In [174]:
# Remove complaints which have null values in either main or sub products
df = df[(df['MAIN_PRODUCT'].notnull()) & (df['SUB_PRODUCT'].notnull())]

In [175]:
# Remove product IDs which have fewer than 100 complaints
df['COMPLAINT_COUNTS'] = df.groupby('SUB_PRODUCT')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 2000]

In [176]:
print(len(df), 'total rows')
print(len(df[df['COMPLAINT_COUNTS'] > 2000]['MAIN_PRODUCT'].unique()), 'unique main-products')
print(len(df[df['COMPLAINT_COUNTS'] > 2000]['SUB_PRODUCT'].unique()), 'unique sub-products')

112819 total rows
7 unique main-products
12 unique sub-products


In [177]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
    print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
    print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print(len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
2
['Credit reporting' 'Conventional home mortgage']
54735 Credit reporting
1 Conventional home mortgage

Debt collection
3
['Medical debt' 'Other debt' 'Credit card debt']
4964 Medical debt
8401 Other debt
6799 Credit card debt

Student loan
2
['Federal student loan servicing' 'Private student loan']
3956 Federal student loan servicing
2197 Private student loan

Credit card or prepaid card
2
['Store credit card' 'General-purpose credit card or charge card']
2403 Store credit card
10359 General-purpose credit card or charge card

Mortgage
2
['Conventional home mortgage' 'FHA mortgage']
6947 Conventional home mortgage
2288 FHA mortgage

Checking or savings account
1
['Checking account']
6745 Checking account

Vehicle loan or lease
1
['Loan']
3024 Loan



In [178]:
# Remove that possibly mis-classified "Convential home mortgage" in "Credit reporting..." main-product
df = (df[((df['MAIN_PRODUCT'] != 'Credit reporting, credit repair services, or other personal consumer reports') |
     (df['SUB_PRODUCT'] != 'Conventional home mortgage'))])

In [179]:
def rebalance(df, column, sample_size):
    new_df = pd.DataFrame(columns=df.columns)
    for group in df[column].unique():
        sample = sample_size
        data = len(df[df[column] == group])
        if data <= sample:
            new_df = new_df.append(df[df[column] == group])
            sample -= data
        new_df = new_df.append(resample(df[df[column] == group], n_samples=sample))
    return new_df.sample(frac=1).reset_index(drop=True)

In [None]:
# Bootstrap / downsample all classes to have the same number of data points
df = rebalance(df, 'MAIN_PRODUCT', 10000)

In [181]:
len(df)

70000

In [182]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
    print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
    print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print(len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Checking or savings account
1
['Checking account']
10000 Checking account

Vehicle loan or lease
1
['Loan']
10000 Loan

Mortgage
2
['FHA mortgage' 'Conventional home mortgage']
2474 FHA mortgage
7526 Conventional home mortgage

Credit card or prepaid card
2
['Store credit card' 'General-purpose credit card or charge card']
1943 Store credit card
8057 General-purpose credit card or charge card

Debt collection
3
['Credit card debt' 'Other debt' 'Medical debt']
3412 Credit card debt
4134 Other debt
2454 Medical debt

Student loan
2
['Federal student loan servicing' 'Private student loan']
6394 Federal student loan servicing
3606 Private student loan

Credit reporting, credit repair services, or other personal consumer reports
1
['Credit reporting']
10000 Credit reporting



In [183]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [184]:
len(train)

63000

In [185]:
len(test)

7000

In [186]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
8125,bait and switch company. They presented me wit...,Student loan,Private student loan
5941,We accidentally paid XX/XX/XXXX mortgage payme...,Mortgage,FHA mortgage
62115,I have missed my XXXX mortgage payment. I have...,Mortgage,FHA mortgage
24155,"PHH cashed a check for the wrong amount, it wa...",Mortgage,Conventional home mortgage
36417,I have three hard inquires one is from XXXX XX...,"Credit reporting, credit repair services, or o...",Credit reporting


In [187]:
train['LENGTH'] = train['COMPLAINT_TEXT'].apply(len)

In [188]:
train_text = train['COMPLAINT_TEXT'].tolist()
train_len = train['LENGTH'].tolist()

In [189]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [190]:
train_text = [doc.lower().replace(r'\n', '',) for doc in train_text]

In [191]:
train_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in train_text]

In [192]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [193]:
train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [194]:
number_of_topics = 20

lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=number_of_topics,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [195]:
lda_train.print_topics(number_of_topics, num_words=15)[:10]

[(0,
  '0.094*"check" + 0.044*"money" + 0.039*"send" + 0.038*"refund" + 0.029*"day" + 0.028*"back" + 0.026*"company" + 0.025*"take" + 0.024*"amount" + 0.020*"say" + 0.016*"bank" + 0.016*"return" + 0.015*"issue" + 0.014*"month" + 0.014*"cancel"'),
 (1,
  '0.017*"court" + 0.017*"law" + 0.014*"file" + 0.014*"state" + 0.012*"act" + 0.010*"attorney" + 0.010*"legal" + 0.010*"right" + 0.009*"case" + 0.009*"action" + 0.008*"consumer" + 0.008*"complaint" + 0.008*"violation" + 0.008*"include" + 0.007*"office"'),
 (2,
  '0.077*"report" + 0.051*"information" + 0.031*"remove" + 0.027*"dispute" + 0.024*"consumer" + 0.024*"inquiry" + 0.021*"file" + 0.020*"verify" + 0.017*"equifax" + 0.015*"experian" + 0.014*"item" + 0.014*"request" + 0.013*"delete" + 0.012*"please" + 0.012*"transunion"'),
 (3,
  '0.112*"chase" + 0.072*"number" + 0.064*"address" + 0.058*"name" + 0.025*"us" + 0.024*"information" + 0.022*"bank" + 0.018*"social_security" + 0.018*"phone" + 0.015*"mail" + 0.014*"open" + 0.013*"use" + 0.013

In [196]:
tfidf_text_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             lowercase=True,
                             norm='l2',
                             max_df=.9,
                             min_df=.1)
tfidf_text = tfidf_text_vectorizer.fit_transform(train['COMPLAINT_TEXT'])
text_cols = tfidf_text_vectorizer.get_feature_names()
tfidf_text = pd.DataFrame(tfidf_text.todense(),
                          columns=[text_cols])

In [214]:
train_vecs = []
for i in range(len(train_text)):
    top_topics = lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(number_of_topics)]
    topic_vec.extend(tfidf_text.iloc[i])  # TF-IDF vectors
    train_vecs.append(topic_vec)

In [None]:
# Bootstrap / downsample all classes to have the same number of data points
df = rebalance(df, 'MAIN_PRODUCT', 10000)

## Send test data through final main-product model

In [229]:
X_train = np.array(train_vecs)
y_train = np.array(train['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

random_forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=100,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [230]:
test['LENGTH'] = test['COMPLAINT_TEXT'].apply(len)
test_text = test['COMPLAINT_TEXT'].tolist()
test_len = test['LENGTH'].tolist()
test_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in test_text]
test_text = [doc.lower().replace(r'\n', '',) for doc in test_text]
test_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in test_text]

In [231]:
tfidf_text_test = tfidf_text_vectorizer.transform(test['COMPLAINT_TEXT'])
tfidf_text_test = pd.DataFrame(tfidf_text_test.todense(),
                               columns=[text_cols])

In [232]:
def get_bigram(texts):
    """
    For the test data we only need the bigram data built on train data,
    as we'll use the train id2word mappings. This is a requirement due to 
    the shapes Gensim functions expect in the test-vector transformation below.
    With both these in hand, we can make the test corpus.
    """
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_text)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

test_vecs = []
for i in range(len(test_text)):
    top_topics = lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(number_of_topics)]
    topic_vec.extend(tfidf_text_test.iloc[i])
    test_vecs.append(topic_vec)

In [233]:
X_test = np.array(test_vecs)
y_test = np.array(test['MAIN_PRODUCT'])

X_test_scale = scaler.transform(X_test)

In [234]:
y_pred = random_forest.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.823 +/- 0.068


In [235]:
feat = random_forest.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])
# print('Comment length', feat[-1])

Topic 0 0.0009462366793901502
Topic 1 0.004309610118597874
Topic 2 0.04614970639769628
Topic 3 0.0018641663899472297
Topic 4 0.030565393463520496
Topic 5 0.0013262231637497942
Topic 6 0.09839080030478634
Topic 7 0.001272344125329751
Topic 8 0.010889068443024225
Topic 9 0.011832365934081952
Topic 10 0.007646975270470209
Topic 11 0.005887722545905707
Topic 12 0.0616129033738358
Topic 13 0.08977795077409001
Topic 14 0.0013445573244966836
Topic 15 0.06738484568168268
Topic 16 0.0013321409349822285
Topic 17 0.08444323350243414
Topic 18 0.013147649239968093
Topic 19 0.0009240718039698565


## Send test data through final sub-product model

In [236]:
X_train = np.array(train_vecs)
y_train = np.array(train['SUB_PRODUCT'])

main_product = train['MAIN_PRODUCT']
labels = LabelEncoder()
z = labels.fit_transform(main_product)
X_train = np.concatenate((X_train, z.reshape(len(z), 1)), axis=1)

# Scale Data
scaler_2 = StandardScaler()
X_train_scale = scaler_2.fit_transform(X_train)

random_forest_2 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=100,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)



X_test = np.array(test_vecs)
y_test = np.array(test['SUB_PRODUCT'])

main_product = random_forest.predict(X_test)
z = labels.transform(main_product)
X_test = np.concatenate((X_test, z.reshape(len(z), 1)), axis=1)

X_test_scale = scaler_2.transform(X_test)

y_pred = random_forest_2.predict(X_test_scale)
rf_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Random Forest Test f1: {np.mean(rf_f1_test):.3f} +/- {np.std(rf_f1_test):.3f}')

Random Forest Test f1: 0.576 +/- 0.153


In [237]:
feat = random_forest_2.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])
# print('Comment length', feat[-2])
print('Main-product prediction', feat[-1])

Topic 0 0.0014033597613339977
Topic 1 0.005421063864969904
Topic 2 0.026689969814241443
Topic 3 0.002411994616475524
Topic 4 0.02380467011628789
Topic 5 0.0017234509822747716
Topic 6 0.07068095209720922
Topic 7 0.0018647835775037003
Topic 8 0.0075337435385669956
Topic 9 0.012436149823865098
Topic 10 0.0073302795907105225
Topic 11 0.006140538387011782
Topic 12 0.039253916005833896
Topic 13 0.0625501672187005
Topic 14 0.002079433326161963
Topic 15 0.05896719801931236
Topic 16 0.004856527095735507
Topic 17 0.05144747427917818
Topic 18 0.011966488230677533
Topic 19 0.0017719028533311858
Main-product prediction 0.2234067479526026


In [238]:
from collections import defaultdict

In [239]:
d1 = defaultdict(int)
for prod in test['MAIN_PRODUCT']:
    d1[prod] += 1
d1

defaultdict(int,
            {'Credit card or prepaid card': 996,
             'Debt collection': 1056,
             'Student loan': 1001,
             'Vehicle loan or lease': 1015,
             'Checking or savings account': 996,
             'Credit reporting, credit repair services, or other personal consumer reports': 959,
             'Mortgage': 977})

In [240]:
d2 = defaultdict(int)
for pred in random_forest.predict(test_vecs):
    d2[pred] += 1
d2

defaultdict(int,
            {'Credit card or prepaid card': 724,
             'Mortgage': 5165,
             'Vehicle loan or lease': 853,
             'Student loan': 109,
             'Debt collection': 39,
             'Credit reporting, credit repair services, or other personal consumer reports': 110})

In [241]:
d3 = defaultdict(int)
for prod in test['SUB_PRODUCT']:
    d3[prod] += 1
d3

defaultdict(int,
            {'General-purpose credit card or charge card': 820,
             'Medical debt': 249,
             'Private student loan': 365,
             'Loan': 1015,
             'Checking account': 996,
             'Federal student loan servicing': 636,
             'Store credit card': 176,
             'Credit reporting': 959,
             'Other debt': 468,
             'Credit card debt': 339,
             'Conventional home mortgage': 730,
             'FHA mortgage': 247})

In [242]:
d4 = defaultdict(int)
for pred in random_forest_2.predict(X_test):
    d4[pred] += 1
d4

defaultdict(int,
            {'FHA mortgage': 628,
             'Store credit card': 91,
             'Loan': 5881,
             'Conventional home mortgage': 351,
             'Credit card debt': 15,
             'General-purpose credit card or charge card': 30,
             'Credit reporting': 2,
             'Private student loan': 2})