In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import multilabel_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA



In [2]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Change sub-product of "I do not know" to null
df.loc[df['SUB_PRODUCT'] == 'I do not know', 'SUB_PRODUCT'] = np.nan

In [4]:
# Remove product IDs for which there have been no recent complaints
df['date_norm'] = df['DATE'].apply(lambda x: int(x[6:10] + x[0:2] + x[3:5]))
df = df[df['date_norm'] > 20180000]

In [5]:
# Remove complaints which have null values in either main or sub products
df = df[(df['MAIN_PRODUCT'].notnull()) & (df['SUB_PRODUCT'].notnull())]

In [6]:
# Remove product IDs which have fewer than 1000 complaints
df['COMPLAINT_COUNTS'] = df.groupby('SUB_PRODUCT')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 1000]

In [7]:
print(len(df), 'total rows')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['MAIN_PRODUCT'].unique()), 'unique main-products')
print(len(df[df['COMPLAINT_COUNTS'] > 1000]['SUB_PRODUCT'].unique()), 'unique sub-products')

118302 total rows
9 unique main-products
17 unique sub-products


In [8]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
#     print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
#     print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print('   ', len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
    54735 Credit reporting
    1030 Other personal consumer report
    1 Conventional home mortgage

Debt collection
    4964 Medical debt
    8401 Other debt
    6799 Credit card debt
    1123 Payday loan debt
    1095 Auto debt

Student loan
    3956 Federal student loan servicing
    2197 Private student loan

Credit card or prepaid card
    2403 Store credit card
    10359 General-purpose credit card or charge card

Mortgage
    6947 Conventional home mortgage
    2288 FHA mortgage

Checking or savings account
    6745 Checking account

Money transfer, virtual currency, or money service
    1129 Domestic (US) money transfer

Vehicle loan or lease
    3024 Loan

Payday loan, title loan, or personal loan
    1106 Installment loan



In [9]:
# Remove that possibly mis-classified "Convential home mortgage" in "Credit reporting..." main-product
df = (df[((df['MAIN_PRODUCT'] != 'Credit reporting, credit repair services, or other personal consumer reports') |
     (df['SUB_PRODUCT'] != 'Conventional home mortgage'))])

In [10]:
len(df)

118301

In [11]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
#     print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
#     print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print('   ', len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Credit reporting, credit repair services, or other personal consumer reports
    54735 Credit reporting
    1030 Other personal consumer report

Debt collection
    4964 Medical debt
    8401 Other debt
    6799 Credit card debt
    1123 Payday loan debt
    1095 Auto debt

Student loan
    3956 Federal student loan servicing
    2197 Private student loan

Credit card or prepaid card
    2403 Store credit card
    10359 General-purpose credit card or charge card

Mortgage
    6947 Conventional home mortgage
    2288 FHA mortgage

Checking or savings account
    6745 Checking account

Money transfer, virtual currency, or money service
    1129 Domestic (US) money transfer

Vehicle loan or lease
    3024 Loan

Payday loan, title loan, or personal loan
    1106 Installment loan



In [12]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [13]:
len(train)

106470

In [14]:
len(test)

11831

In [15]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
49582,I have tried on several occasions to dispute a...,Debt collection,Credit card debt
40517,This complaint is in regards to overdraft fees...,Checking or savings account,Checking account
25060,My old car was repossessed after I became unem...,"Credit reporting, credit repair services, or o...",Credit reporting
14067,someone else used my information to obtained c...,"Credit reporting, credit repair services, or o...",Credit reporting
18491,Back in XX/XX/XXXX per Ace Cash Express I took...,"Payday loan, title loan, or personal loan",Installment loan


In [16]:
train_text = train['COMPLAINT_TEXT'].tolist()

In [17]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [18]:
train_text = [doc.lower().replace(r'\n', '',) for doc in train_text]

In [19]:
train_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in train_text]

In [20]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [21]:
train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [22]:
number_of_topics = 30

lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=number_of_topics,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [23]:
lda_train.print_topics(number_of_topics, num_words=15)[:10]

[(0,
  '0.115*"number" + 0.109*"address" + 0.099*"name" + 0.049*"information" + 0.028*"phone" + 0.024*"social_security" + 0.023*"use" + 0.020*"never" + 0.019*"live" + 0.017*"personal" + 0.012*"contact" + 0.012*"list" + 0.012*"person" + 0.010*"paypal" + 0.010*"also"'),
 (1,
  '0.198*"card" + 0.058*"chase" + 0.039*"close" + 0.026*"open" + 0.023*"offer" + 0.022*"use" + 0.017*"apply" + 0.017*"limit" + 0.016*"new" + 0.015*"receive" + 0.014*"american_express" + 0.012*"purchase" + 0.012*"point" + 0.011*"balance" + 0.010*"month"'),
 (2,
  '0.064*"fee" + 0.053*"charge" + 0.050*"balance" + 0.044*"pay" + 0.036*"amount" + 0.035*"interest" + 0.030*"payment" + 0.024*"statement" + 0.023*"month" + 0.014*"due" + 0.011*"monthly" + 0.011*"make" + 0.011*"apply" + 0.010*"total" + 0.010*"would"'),
 (3,
  '0.103*"insurance" + 0.062*"company" + 0.060*"claim" + 0.027*"policy" + 0.026*"services" + 0.018*"provide" + 0.018*"florida" + 0.018*"send" + 0.017*"request" + 0.017*"pay" + 0.016*"office" + 0.014*"cover" +

In [24]:
tfidf_text_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             lowercase=True,
                             norm='l2',
                             max_df=.9,
                             min_df=.1)
tfidf_text = tfidf_text_vectorizer.fit_transform(train['COMPLAINT_TEXT'])
text_cols = tfidf_text_vectorizer.get_feature_names()
tfidf_text = pd.DataFrame(tfidf_text.todense(),
                          columns=[text_cols])

#### Build a model with TF-IDF and Cosine Similarity in order to establish a baseline

In [25]:
top_topics = [[entry[1] for entry in lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)] for i in range(len(train_text))]
train_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

In [26]:
train_df = pd.concat([train.reset_index(drop=True), train_vecs, tfidf_text], axis=1)

In [27]:
train_df.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,"(xx xx,)","(xx xxxx,)","(xxxx,)","(xxxx and,)","(xxxx the,)","(xxxx to,)","(xxxx xxxx,)","(years,)","(you,)","(your,)"
0,I have tried on several occasions to dispute a...,Debt collection,Credit card debt,0.000306,0.000306,0.000306,0.000306,0.008327,0.206162,0.000306,...,0.135618,0.035166,0.160793,0.0,0.0,0.0,0.051132,0.0,0.043348,0.0
1,This complaint is in regards to overdraft fees...,Checking or savings account,Checking account,0.000333,0.000333,0.334402,0.000333,0.328978,0.000333,0.000333,...,0.0,0.0,0.071116,0.0,0.0,0.0,0.030153,0.0,0.0,0.0
2,My old car was repossessed after I became unem...,"Credit reporting, credit repair services, or o...",Credit reporting,0.001236,0.001236,0.001236,0.001236,0.001236,0.001236,0.001236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,someone else used my information to obtained c...,"Credit reporting, credit repair services, or o...",Credit reporting,0.806661,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Back in XX/XX/XXXX per Ace Cash Express I took...,"Payday loan, title loan, or personal loan",Installment loan,0.035917,0.00063,0.00063,0.00063,0.00063,0.00063,0.00063,...,0.14289,0.185259,0.105884,0.06132,0.0,0.0,0.0,0.089206,0.0,0.0


## Send test data through final main-product model

In [28]:
# Apply SMOTE to all classes to have the same number of data points
smt = SMOTE()
X_train, y_train = smt.fit_sample(train_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1), train_df['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

# Apply PCA
pca = PCA(n_components=40)
X_train_scale = pca.fit_transform(X_train_scale)

random_forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [29]:
test_text = test['COMPLAINT_TEXT'].tolist()
test_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in test_text]
test_text = [doc.lower().replace(r'\n', '',) for doc in test_text]
test_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in test_text]

In [30]:
tfidf_text_test = tfidf_text_vectorizer.transform(test['COMPLAINT_TEXT'])
tfidf_text_test = pd.DataFrame(tfidf_text_test.todense(),
                               columns=[text_cols])

In [31]:
def get_bigram(texts):
    """
    For the test data we only need the bigram data built on train data,
    as we'll use the train id2word mappings. This is a requirement due to 
    the shapes Gensim functions expect in the test-vector transformation below.
    With both these in hand, we can make the test corpus.
    """
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_text)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

top_topics = [[entry[1] for entry in lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)] for i in range(len(test_text))]
test_vecs = pd.DataFrame(columns=[f'topic_{i}' for i in range(number_of_topics)], data=top_topics)

test_df = pd.concat([test.reset_index(drop=True), test_vecs, tfidf_text_test], axis=1)

In [32]:
X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['MAIN_PRODUCT'])

X_test_scale = scaler.transform(X_test)
X_test_scale = pca.transform(X_test_scale)

In [33]:
y_pred = random_forest.predict(X_test_scale)
y_true = y_test

print('Accuracy Score:', accuracy_score(y_true, y_pred))
print('Precision Score:', precision_score(y_true, y_pred, average='macro'))
print('Recall Score:', recall_score(y_true, y_pred, average='macro'))
print('F1 Score:', f1_score(y_true, y_pred, average='macro'))
print()
conf_mat = multilabel_confusion_matrix(y_true, y_pred).ravel()
np.array([[sum([item[0][0] for item in conf_mat]),
           sum([item[0][1] for item in conf_mat])],
          [sum([item[1][0] for item in conf_mat]),
           sum([item[1][1] for item in conf_mat])]])
print('True Negative:', tn)
print('False Positive:', fp)
print('False Negative:', fn)
print('True Positive:', tp)

Accuracy Score: 0.7706026540444595
Precision Score: 0.5900168731911648
Recall Score: 0.5874451146753336
F1 Score: 0.5811180389068358



ValueError: too many values to unpack (expected 4)

In [39]:
multilabel_confusion_matrix(y_true, y_pred)

array([[[10846,   305],
        [  145,   535]],

       [[10117,   385],
        [  395,   934]],

       [[ 5705,   569],
        [  779,  4778]],

       [[ 9073,   521],
        [  568,  1669]],

       [[11704,    33],
        [   56,    38]],

       [[10613,   315],
        [  266,   637]],

       [[11699,    25],
        [   99,     8]],

       [[10903,   321],
        [  191,   416]],

       [[11274,   240],
        [  215,   102]]])

In [48]:
np.array([[sum([item[0][0] for item in multilabel_confusion_matrix(y_true, y_pred)]),
           sum([item[0][1] for item in multilabel_confusion_matrix(y_true, y_pred)])],
          [sum([item[1][0] for item in multilabel_confusion_matrix(y_true, y_pred)]),
           sum([item[1][1] for item in multilabel_confusion_matrix(y_true, y_pred)])]])

array([[91934,  2714],
       [ 2714,  9117]])

In [None]:
# feat = random_forest.feature_importances_
# for i in range(number_of_topics):
#     print('Topic', i, feat[i])

## Send test data through final sub-product model

In [None]:
X_train = train_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1)
y_train = train_df['SUB_PRODUCT']

main_product = train_df['MAIN_PRODUCT']
labels = LabelEncoder()
z = labels.fit_transform(main_product)

X_train = np.concatenate((X_train, z.reshape(len(z), 1)), axis=1)

# Apply SMOTE to all classes to have the same number of data points
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)

# Scale Data
scaler_2 = StandardScaler()
X_train_scale = scaler_2.fit_transform(X_train)

# Apply PCA
pca_2 = PCA(n_components=40)
X_train_scale = pca_2.fit_transform(X_train_scale)

random_forest_2 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)



X_test = np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))
y_test = np.array(test_df['SUB_PRODUCT'])

main_product = random_forest.predict(X_test)
z = labels.transform(main_product)
X_test = np.concatenate((X_test, z.reshape(len(z), 1)), axis=1)

X_test_scale = scaler_2.transform(X_test)
X_test_scale = pca_2.transform(X_test_scale)

y_pred = random_forest_2.predict(X_test_scale)
y_true = y_test

print('Accuracy Score:', accuracy_score(y_true, y_pred))
print('Precision Score:', precision_score(y_true, y_pred))
print('Recall Score:', recall_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred))
print()
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('True Negative:', tn)
print('False Positive:', fp)
print('False Negative:', fn)
print('True Positive:', tp)

In [None]:
feat = random_forest_2.feature_importances_
for i in range(number_of_topics):
    print('Topic', i, feat[i])
print('Main-product prediction', feat[-1])

In [None]:
from collections import defaultdict

In [None]:
d1 = defaultdict(int)
for prod in test['MAIN_PRODUCT']:
    d1[prod] += 1
d1

In [None]:
d2 = defaultdict(int)
for pred in random_forest.predict(np.array(test_df.drop(['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT'], axis=1))):
    d2[pred] += 1
d2

In [None]:
d3 = defaultdict(int)
for prod in test['SUB_PRODUCT']:
    d3[prod] += 1
d3

In [None]:
d4 = defaultdict(int)
for pred in random_forest_2.predict(X_test):
    d4[pred] += 1
d4