In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.metrics import f1_score

In [2]:
# Load the table `compaints_users` for the complaint text and product ID, and the `products` table to find the product and sub-product
df = pd.read_csv('data/complaints_users.csv')
df2 = pd.read_csv('data/products.csv')
df = df.merge(df2, how='left', on='PRODUCT_ID')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Remove product IDs which have fewer than 100 complaints
df['COMPLAINT_COUNTS'] = df.groupby('PRODUCT_ID')['COMPLAINT_ID'].transform('count')
df = df[df['COMPLAINT_COUNTS'] > 100]

In [4]:
df.head()

Unnamed: 0,COMPLAINT_ID,COMPLAINT_TEXT,WAS_USER_DISPUTED,DATE,PRODUCT_ID,ISSUE_ID,MAIN_PRODUCT,SUB_PRODUCT,COMPLAINT_COUNTS
0,3184195,XXXX and Transunion are reporting incorrectly...,,03/19/2019,26,253,"Credit reporting, credit repair services, or o...",Credit reporting,89994
1,3184692,XXXX and Transunion are reporting incorrectly ...,,03/19/2019,26,253,"Credit reporting, credit repair services, or o...",Credit reporting,89994
2,3183613,"XXXX, XXXX, and Experian need to remove the co...",,03/18/2019,26,165,"Credit reporting, credit repair services, or o...",Credit reporting,89994
3,3182430,"3 company with inconsistencies, violations and...",,03/17/2019,26,253,"Credit reporting, credit repair services, or o...",Credit reporting,89994
4,3182218,I have a personal loan from Patriot finance. T...,,03/17/2019,26,165,"Credit reporting, credit repair services, or o...",Credit reporting,89994


In [5]:
train, test = train_test_split(df[['COMPLAINT_TEXT', 'MAIN_PRODUCT', 'SUB_PRODUCT']], test_size=.1)

In [6]:
len(train)

343774

In [7]:
len(test)

38198

In [8]:
train.head()

Unnamed: 0,COMPLAINT_TEXT,MAIN_PRODUCT,SUB_PRODUCT
377388,I have been trying to get a loan Modification ...,Mortgage,Conventional fixed mortgage
145064,I have XXXX hard inquiries on my Transunion cr...,"Credit reporting, credit repair services, or o...",Credit reporting
113361,"As of today, I received my updated credit scor...","Credit reporting, credit repair services, or o...",Credit reporting
307551,I received a email from XXXX account services ...,Debt collection,I do not know
154973,On XXXX XXXX I submitted a request to Transuni...,"Credit reporting, credit repair services, or o...",Credit reporting


In [9]:
train['LENGTH'] = train['COMPLAINT_TEXT'].apply(len)

In [10]:
train_text = train['COMPLAINT_TEXT'].tolist()
train_len = train['LENGTH'].tolist()

In [11]:
nlp = spacy.load('en')
train_text = [' '.join([token.lemma_ for token in nlp(doc)]) for doc in train_text]

In [12]:
train_text[:5]

["-PRON- have be try to get a loan Modification from Caliber Home Loans since XX / XX / XXXX . -PRON- have repeatedly fill out application per there request . -PRON- just send a denial letter state -PRON- be ineligible for any short term loan or interest only loan . -PRON- be suppose to be be evaluate for a repayment plan . now -PRON- have to start all over with an appeal process and -PRON- still wo n't accept -PRON- payment ! ! \n there be discrepancy on the statement -PRON- have n't address . -PRON- say -PRON- accept a loan modification in XX / XX / XXXX - -PRON- have never sign nor see a modification document from -PRON- . -PRON- have payment reversal list which -PRON- have no idea what those be for and -PRON- send { $ 600.00 } to -PRON- ex wife who be n't suppose to be on the loan anymore .",
 '-PRON- have XXXX hard inquiry on -PRON- Transunion credit report that -PRON- do not authorize . first from XXXX on XX / XX / XXXX . the second from XXXX on XX / XX / XXXX . third from XXXX o

In [13]:
train_text = [doc.lower().replace('\n', '',) for doc in train_text]

In [14]:
train_text = [re.sub("[^a-zA-Z$ ]+", '', doc) for doc in train_text]

In [15]:
train_text[:5]

['pron have be try to get a loan modification from caliber home loans since xx  xx  xxxx  pron have repeatedly fill out application per there request  pron just send a denial letter state pron be ineligible for any short term loan or interest only loan  pron be suppose to be be evaluate for a repayment plan  now pron have to start all over with an appeal process and pron still wo nt accept pron payment    there be discrepancy on the statement pron have nt address  pron say pron accept a loan modification in xx  xx  xxxx  pron have never sign nor see a modification document from pron  pron have payment reversal list which pron have no idea what those be for and pron send  $   to pron ex wife who be nt suppose to be on the loan anymore ',
 'pron have xxxx hard inquiry on pron transunion credit report that pron do not authorize  first from xxxx on xx  xx  xxxx  the second from xxxx on xx  xx  xxxx  third from xxxx on xx  xx  xxxx  the fourth be also from xxxx on the date of xx  xx  xxxx  

In [16]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
def get_corpus(texts):
    words = list(sent_to_words(texts))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

train_corpus, train_id2word, bigram_train = get_corpus(train_text)

In [17]:
lda_train = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=20,
                           id2word=train_id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
lda_train.save('lda_train.model')

In [18]:
lda_train.print_topics(20,num_words=15)[:10]

[(0,
  '0.061*"consumer" + 0.057*"information" + 0.028*"law" + 0.022*"section" + 0.021*"fair" + 0.019*"fcra" + 0.018*"right" + 0.018*"violation" + 0.016*"must" + 0.016*"provide" + 0.016*"require" + 0.015*"act" + 0.015*"reporting_agency" + 0.015*"request" + 0.014*"reporting_act"'),
 (1,
  '0.161*"loan" + 0.039*"payment" + 0.022*"student" + 0.021*"pay" + 0.021*"navient" + 0.015*"year" + 0.015*"make" + 0.014*"would" + 0.013*"interest_rate" + 0.013*"interest" + 0.011*"month" + 0.011*"amount" + 0.010*"time" + 0.009*"apply" + 0.009*"rate"'),
 (2,
  '0.100*"inquiry" + 0.067*"remove" + 0.032*"company" + 0.031*"authorize" + 0.027*"unauthorized" + 0.026*"without" + 0.021*"pull" + 0.018*"inquire" + 0.018*"contact" + 0.017*"hard_inquiry" + 0.014*"please" + 0.014*"request" + 0.013*"transunion" + 0.013*"apply" + 0.012*"permission"'),
 (3,
  '0.056*"bankruptcy" + 0.056*"court" + 0.055*"file" + 0.034*"attorney" + 0.029*"state" + 0.027*"case" + 0.017*"law" + 0.014*"chapter" + 0.013*"discharge" + 0.013*

In [19]:
train_vecs = []
for i in range(len(train_text)):
    top_topics = lda_train.get_document_topics(train_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(20)]
    topic_vec.append(train_len[i])  # length of complaint
    train_vecs.append(topic_vec)

In [20]:
X = np.array(train_vecs)
y = np.array(train['MAIN_PRODUCT'])

kf = KFold(5, shuffle=True, random_state=42)
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True,
        multi_class='auto'
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average=None))
    
    # Logistic Regression SGD
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average=None))
    
    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average=None))

print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
print(f'Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}')
print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Logistic Regression Val f1: 0.347 +- 0.228
Logisitic Regression SGD Val f1: 0.346 +- 0.250
SVM Huber Val f1: 0.053 +- 0.126


  'precision', 'predicted', average, warn_for)
