In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [28]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

In [29]:
# Change sub-product of "I do not know" to null
df.loc[df['SUB_PRODUCT'] == 'I do not know', 'SUB_PRODUCT'] = np.nan

In [30]:
# Remove product IDs for which there have been no recent complaints
df['date_norm'] = df['DATE'].apply(lambda x: int(x[6:10] + x[0:2] + x[3:5]))
df = df[df['date_norm'] > 20180000]

In [31]:
# Remove complaints which have null values in either main or sub products
df = df[(df['MAIN_PRODUCT'].notnull()) & (df['SUB_PRODUCT'].notnull())]

In [32]:
# Remove product IDs which have fewer than 100 complaints
df['COMPLAINT_COUNTS'] = df.groupby('SUB_PRODUCT')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 2000]

In [33]:
len(df)

112819

In [34]:
def rebalance(df, column, sample_size):
    new_df = pd.DataFrame(columns=df.columns)
    for group in df[column].unique():
        sample = sample_size
        data = len(df[df[column] == group])
        if data <= sample:
            new_df = new_df.append(df[df[column] == group])
            sample -= data
        new_df = new_df.append(resample(df[df[column] == group], n_samples=sample))
    return new_df.sample(frac=1).reset_index(drop=True)

In [35]:
# Bootstrap / downsample all classes to have 1000 data points
df = rebalance(df, 'MAIN_PRODUCT', 5000)

In [36]:
len(df)

35000

In [37]:
for main_product in df['MAIN_PRODUCT'].unique():
    print(main_product)
    print(len(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique()))
#     print(df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique())
    for sub_product in df[df['MAIN_PRODUCT'] == main_product]['SUB_PRODUCT'].unique():
        print(len(df[(df['MAIN_PRODUCT'] == main_product) & (df['SUB_PRODUCT'] == sub_product)]), sub_product)
    print()

Debt collection
3
1696 Credit card debt
2043 Other debt
1261 Medical debt

Vehicle loan or lease
1
5000 Loan

Credit card or prepaid card
2
4030 General-purpose credit card or charge card
970 Store credit card

Checking or savings account
1
5000 Checking account

Student loan
2
1746 Private student loan
3254 Federal student loan servicing

Credit reporting, credit repair services, or other personal consumer reports
1
5000 Credit reporting

Mortgage
2
3763 Conventional home mortgage
1237 FHA mortgage



In [38]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [39]:
len(train)

31500

In [40]:
len(test)

3500

In [41]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
32001,On XX/XX/XXXX my vehicle was repossessed from ...,Vehicle loan or lease,Loan
1839,I contacted XXXX XXXX XXXX to have my extended...,Vehicle loan or lease,Loan
25494,I was calling interested XXXX phone payment s...,Credit card or prepaid card,Store credit card
34770,My name is XXXX XXXX. In 2017 my loan funding ...,Student loan,Federal student loan servicing
23002,Several months ago we were approved for a loan...,Mortgage,Conventional home mortgage


In [42]:
train['LENGTH'] = train['COMPLAINT_TEXT'].apply(len)

In [43]:
train_text = train['COMPLAINT_TEXT'].tolist()
train_len = train['LENGTH'].tolist()

In [44]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [45]:
train_text = [doc.lower().replace(r'\n', '',) for doc in train_text]

In [46]:
train_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in train_text]

In [47]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [48]:
train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [49]:
number_of_topics = 20

lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=number_of_topics,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [50]:
lda_train.print_topics(number_of_topics, num_words=15)[:10]

[(0,
  '0.017*"property" + 0.015*"law" + 0.013*"act" + 0.012*"case" + 0.012*"note" + 0.011*"court" + 0.009*"criminal" + 0.009*"document" + 0.008*"lie" + 0.008*"legal" + 0.008*"record" + 0.008*"never" + 0.008*"illegal" + 0.007*"state" + 0.007*"fee"'),
 (1,
  '0.103*"loan" + 0.029*"navient" + 0.020*"student_loan" + 0.019*"year" + 0.012*"forbearance" + 0.011*"month" + 0.011*"program" + 0.011*"school" + 0.011*"amount" + 0.009*"income" + 0.008*"monthly" + 0.008*"interest" + 0.008*"apply" + 0.008*"qualify" + 0.008*"application"'),
 (2,
  '0.016*"financial" + 0.012*"damage" + 0.012*"agreement" + 0.012*"wife" + 0.010*"continue" + 0.009*"cause" + 0.008*"lease" + 0.007*"order" + 0.007*"attempt" + 0.006*"fay" + 0.006*"key" + 0.006*"action" + 0.006*"family" + 0.006*"father" + 0.006*"accept"'),
 (3,
  '0.079*"car" + 0.043*"loan" + 0.040*"vehicle" + 0.018*"finance" + 0.018*"purchase" + 0.012*"dealership" + 0.011*"title" + 0.010*"contract" + 0.010*"auto" + 0.009*"santander" + 0.009*"take" + 0.009*"de

In [51]:
train_vecs = []
for i in range(len(train_text)):
    top_topics = lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(number_of_topics)]
    topic_vec.append(train_len[i])  # length of complaint
    train_vecs.append(topic_vec)

# Grid search logistic regression hyper-parameters

In [52]:
X_train = np.array(train_vecs)
y_train = np.array(train['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

cv_gridsearch_f1 = []
parameters = {'penalty': ['l1', 'l2'],
              'C': [.001, .01, .1, 1, 10, 100],
              'class_weight': ['balanced', None]
             }
clf = GridSearchCV(LogisticRegression(solver='saga', multi_class='auto', max_iter=10000, n_jobs=-1), parameters, cv=10)
clf.fit(X_train_scale, y_train)
print(clf.best_params_)

{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}


## Send test data through final main-product model

In [53]:
X_train = np.array(train_vecs)
y_train = np.array(train['MAIN_PRODUCT'])

# Scale Data
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

logistic_regression = LogisticRegression(
    penalty='l1',
    C=1,
    solver='saga',
    multi_class='auto',
    max_iter=10000,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [54]:
test['LENGTH'] = test['COMPLAINT_TEXT'].apply(len)
test_text = test['COMPLAINT_TEXT'].tolist()
test_len = test['LENGTH'].tolist()
test_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in test_text]
test_text = [doc.lower().replace(r'\n', '',) for doc in test_text]
test_text = [re.sub(r"[^a-zA-Z$ -]+", '', doc) for doc in test_text]

In [55]:
def get_bigram(texts):
    """
    For the test data we only need the bigram data built on train data,
    as we'll use the train id2word mappings. This is a requirement due to 
    the shapes Gensim functions expect in the test-vector transformation below.
    With both these in hand, we can make the test corpus.
    """
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_text)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

test_vecs = []
for i in range(len(test_text)):
    top_topics = lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(number_of_topics)]
    topic_vec.append(test_len[i])
    test_vecs.append(topic_vec)

In [56]:
X_test = np.array(test_vecs)
y_test = np.array(test['MAIN_PRODUCT'])

X_test_scale = scaler.transform(X_test)

In [57]:
y_pred = logistic_regression.predict(X_test_scale)
lr_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Logistic Regression Test f1: {np.mean(lr_f1_test):.3f} +/- {np.std(lr_f1_test):.3f}')

Logistic Regression Test f1: 0.789 +/- 0.078


## Send test data through final sub-product model

In [58]:
X_train = np.array(train_vecs)
y_train = np.array(train['SUB_PRODUCT'])

main_product = train['MAIN_PRODUCT']
labels = LabelEncoder()
z = labels.fit_transform(main_product)
X_train = np.concatenate((X_train, z.reshape(len(z), 1)), axis=1)

# Scale Data
scaler_2 = StandardScaler()
X_train_scale = scaler_2.fit_transform(X_train)

logistic_regression_2 = LogisticRegression(
    penalty='l1',
    C=1,
    solver='saga',
    multi_class='auto',
    max_iter=10000,
    class_weight='balanced',
    n_jobs=-1
).fit(X_train_scale, y_train)

In [66]:
X_test = np.array(test_vecs)
y_test = np.array(test['SUB_PRODUCT'])

main_product = logistic_regression.predict(X_test)
main_product = test['MAIN_PRODUCT']
z = labels.transform(main_product)
X_test = np.concatenate((X_test, z.reshape(len(z), 1)), axis=1)

X_test_scale = scaler_2.transform(X_test)

y_pred = logistic_regression_2.predict(X_test_scale)
lr_f1_test = f1_score(y_test, y_pred, average=None)
print(f'Logistic Regression Test f1: {np.mean(lr_f1_test):.3f} +/- {np.std(lr_f1_test):.3f}')

Logistic Regression Test f1: 0.646 +/- 0.229


In [60]:
from collections import defaultdict

In [61]:
d1 = defaultdict(int)
for prod in test['MAIN_PRODUCT']:
    d1[prod] += 1
d1

defaultdict(int,
            {'Credit card or prepaid card': 481,
             'Student loan': 515,
             'Credit reporting, credit repair services, or other personal consumer reports': 535,
             'Vehicle loan or lease': 504,
             'Checking or savings account': 516,
             'Mortgage': 487,
             'Debt collection': 462})

In [62]:
d2 = defaultdict(int)
for pred in logistic_regression.predict(test_vecs):
    d2[pred] += 1
d2

defaultdict(int, {'Mortgage': 3494, 'Vehicle loan or lease': 6})

In [63]:
d3 = defaultdict(int)
for prod in test['SUB_PRODUCT']:
    d3[prod] += 1
d3

defaultdict(int,
            {'General-purpose credit card or charge card': 377,
             'Private student loan': 185,
             'Credit reporting': 535,
             'Loan': 504,
             'Checking account': 516,
             'Conventional home mortgage': 373,
             'Federal student loan servicing': 330,
             'Other debt': 191,
             'FHA mortgage': 114,
             'Medical debt': 138,
             'Store credit card': 104,
             'Credit card debt': 133})

In [64]:
d4 = defaultdict(int)
for pred in logistic_regression_2.predict(X_test):
    d4[pred] += 1
d4

defaultdict(int,
            {'FHA mortgage': 1438,
             'Loan': 1866,
             'Store credit card': 182,
             'Private student loan': 5,
             'Other debt': 9})