# Training Models

This file trains models to predict whether a complaint is closed or closed with some sort of relief. The features used to predict this include details about the complaint (e.g., the product) and features derived from the narrative of the complaint. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import nltk
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import (learning_curve, StratifiedShuffleSplit, cross_val_score, ShuffleSplit,
                                     cross_val_predict, RandomizedSearchCV)


Next, we need to get the data, which is housed in a PostgreSQL database.

In [None]:
# Set up access to the PostgreSQL database and load the complaints table
# into a DataFrame.
import os
from getpass import getpass

db_name = 'complaint1'
username = 'postgres'
host = 'localhost'
port = '5432'
# The password is never stored in the notebook: it is read from the
# PGPASSWORD environment variable, with an interactive prompt as fallback.
# (Previously `password` was undefined here, so connecting raised NameError.)
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

# Use the host/port variables defined above instead of repeating literals.
con = psycopg2.connect(database=db_name,
    host=host,
    port=port,
    user=username,
    password=password)

sql_query = """
SELECT * FROM complaint1;
"""
# The connection is only needed for this single query, so close it as soon
# as the result has been read, even if the query fails.
try:
    complaints_df = pd.read_sql_query(sql_query, con)
finally:
    con.close()

In [None]:
# Preview the first rows of the table that was just loaded.
complaints_df.head()

In [None]:
# (rows, columns) of the table as loaded, before any rows are filtered out.
complaints_df.shape

In [None]:
# Text pre-processing can leave a narrative missing; keep only the rows that
# still have one, then display the new (rows, columns) for comparison.
has_narrative = complaints_df['narrative'].notna()
complaints_df = complaints_df.loc[has_narrative]
complaints_df.shape

Everything looks good. We had one complaint that was empty after the text pre-processing, so it was removed from consideration. We will now move forward with data preparation.

We now need to select only the features in complaints_df that are relevant to our prediction task. We'll start by obtaining information about the complaint, the narrative submitted by the consumer, and meta-features about the complaints.

In [None]:
# Names of the meta-feature columns, grouped for readability.
# NOTE(review): the upper-case names look like part-of-speech tag columns and
# the last three like length statistics -- confirm against the pre-processing
# notebook that produced them.
pos_tag_cols = ['ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
                'PART', 'PRON', 'PROPN', 'PUNCT', 'SPACE', 'SYM', 'VERB', 'X']
length_cols = ['avg_words_sent', 'num_sent', 'num_word']
meta_feat = ['sentiment'] + pos_tag_cols + length_cols

# Pull exactly these columns, in this order, out of the full data set.
X = complaints_df.loc[:, meta_feat]

In [None]:
# Build the standardized meta-feature matrix.
#
# Rows are deliberately NOT dropped here. X must stay row-aligned with
# complaints_df, because the response (taken from complaints_df), the one-hot
# matrices and the n-gram matrix are later stacked next to / scored against
# these rows. Filtering rows out of X alone would make the row counts
# disagree and break the later hstack / cross-validation calls. Missing values
# are therefore only filled with zero so that scikit-learn will accept them.
X_cleaned = X.fillna(0)

# Standardize the features (zero mean, unit variance) for the models later.
scaler = StandardScaler()
X_std = scaler.fit_transform(X_cleaned)

# Save the fitted scaler so it can be used in the Flask App later.
joblib.dump(scaler, 'trained_scaler.pkl')

A future consideration is to also include state and product information in the predictive models. We'll save this information now so we can come back to it later.

In [None]:
# Get the other descriptors of each complaint (product and state).
desc_feat = ['prod', 'state']

X2 = complaints_df[desc_feat]

# Keep every row so that X2_cleaned stays row-aligned with complaints_df (and
# therefore with the meta-features and the response it is later combined
# with); dropping rows here alone would make the row counts disagree.
# Missing values are filled with zero instead.
X2_cleaned = X2.fillna(0)

In [None]:
# One-hot (dummy) encode the `prod` column in two steps:
# labels -> integer codes -> sparse indicator columns.
from numpy import argmax, array
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: map every product label to an integer code, shaped as a single
# column because OneHotEncoder expects 2-D input.
prod_vals = array(X2_cleaned['prod'])
label_encoder = LabelEncoder()
integer_encoded_prod = label_encoder.fit_transform(prod_vals).reshape(-1, 1)

# Step 2: expand the integer codes into a sparse indicator matrix.
# `onehot_encoder` is reused (and re-fitted) by the state encoding cell.
onehot_encoder = OneHotEncoder(sparse=True)
onehot_encoded_prod = onehot_encoder.fit_transform(integer_encoded_prod)

# (rows, number of indicator columns)
onehot_encoded_prod.shape

In [None]:
# Repeat the label -> integer -> one-hot encoding for the `state` column.
# Missing states were filled with the integer 0 earlier, so cast the column
# to str first to give every value the same type.
X2_cleaned['state'] = X2_cleaned['state'].astype(str)
state_vals = array(X2_cleaned['state'])

# A fresh LabelEncoder replaces the one fitted on products.
label_encoder = LabelEncoder()
integer_encoded_state = label_encoder.fit_transform(state_vals).reshape(-1, 1)

# NOTE: this re-fits the same `onehot_encoder` object, so from here on it
# holds the state categories, not the product ones.
onehot_encoded_state = onehot_encoder.fit_transform(integer_encoded_state)

Now we need to generate the bag-of-words matrix that will be used in concert with the meta-features to train the classifiers. We'll use unigrams and bigrams with the tf-idf transformation to sort out important, rare terms from those that are not predictive. 

In [None]:
#now, get the narratives out so that we can generate a bag-of-words representation for these texts
#narratives=complaints_df['narrative']

#generate matrix
#vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=250)
#X_ngrams = vectorizer.fit_transform(narratives)
#joblib.dump(X_ngrams, 'x_ngrams.pkl')
#save vectorizer so it can be used in later cases
#joblib.dump(vectorizer, 'tfidf_unibi_250.pkl')

In [None]:
# Load the tf-idf n-gram matrix from disk instead of recomputing it.
# The save call is left commented out; presumably it was run once to create
# "ngrams.npz" -- that file must already exist for the load below to work.
#sparse.save_npz("ngrams.npz", X_ngrams)
X_ngrams = sparse.load_npz("ngrams.npz")

Now we need to merge everything together: the meta-features, the descriptions of the complaint, and the n-gram features.

In [None]:
# Assemble the candidate feature matrices by stacking column blocks side by
# side. All blocks must have the same number of rows.
X_std_sparse = sparse.csr_matrix(X_std)  # sparse copy of the scaled meta-features
descriptor_blocks = [onehot_encoded_prod, onehot_encoded_state]

# meta-features + product/state indicators (no text features)
X_nongrams = sparse.hstack([X_std_sparse] + descriptor_blocks)
# meta-features + n-grams (no product/state indicators)
X_full2 = sparse.hstack([X_std_sparse, X_ngrams])
# everything: meta-features + product/state indicators + n-grams
X_full = sparse.hstack([X_std_sparse] + descriptor_blocks + [X_ngrams])

In [None]:
# (rows, total columns) of the matrix that combines every feature block.
X_full.shape

One last step, after getting the feature matrix merged, is to encode the response variable to be used in later models. We'll do that next.

In [None]:
# The response variable: the `response` column of complaints_df (a Series).
y=complaints_df['response']
# Uncomment to inspect the class balance (y is a Series, so no column lookup):
#y.value_counts()

In [None]:
# Turn the response labels into integer codes for scikit-learn.
# LabelEncoder numbers the classes in sorted order, and the 'precision'
# scorer used later treats code 1 as the positive class.
# NOTE(review): check le.classes_ to confirm the "relief" outcome is code 1.
le = LabelEncoder()
response_values = y.values.ravel()
y_enc = le.fit_transform(response_values)

# Building Models

We'll first explore models using only the meta-features: descriptions of the complaints, the product, and the state of the person filing the complaint. We'll use stochastic gradient descent because the data are quite sparse (especially when we use the meta-features in concert with the n-grams). 

To gauge performance of the classifiers, we will use 10-fold cross validation and focus on precision (i.e., limiting the proportion of false positives identified by the classifier). Also, the data are quite imbalanced: most people do not receive relief when filing their complaints. Therefore, we'll also use stratified sampling so that each split preserves the class proportions.

In [None]:
# Recommended number of iterations for SGD from the scikit-learn
# documentation: roughly 10**6 training samples should be seen in total,
# i.e. ceil(10**6 / n_samples) passes over the data.
# np.ceil returns a float, but SGDClassifier's max_iter must be an integer,
# so cast explicitly.
SGD_iterations = int(np.ceil(10 ** 6 / len(X_std)))
SGD_iterations

In [None]:
# Baseline model: logistic regression fitted by SGD (loss='log') with the
# elastic net penalty (in case some predictors are correlated), trained on the
# features without n-grams. max_iter is the recommended iteration count
# computed earlier; random_state is fixed at 28 for reproducibility.
baseline_clf = SGDClassifier(loss='log', penalty='elasticnet',
                             max_iter=SGD_iterations, random_state=28)

# Ten stratified random 80/20 splits, so each split keeps the class
# proportions of the imbalanced response.
baseline_cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=28)

# Score each split on precision; n_jobs=1 evaluates the splits one at a time.
scores = cross_val_score(estimator=baseline_clf, X=X_nongrams, y=y_enc,
                         cv=baseline_cv, scoring='precision', n_jobs=1)

# Average and standard deviation of precision across the splits.
print('Precision: {} +/- {}'.format(scores.mean(), scores.std()))

Now, look at the confusion matrix to see what sort of errors the classifier is making. 

In [None]:
# Get out-of-fold predictions from the same baseline configuration so a
# confusion matrix can be built over the whole data set.
# NOTE: cv=10 is an integer, so scikit-learn uses (stratified) 10-fold here
# rather than the shuffle split used for scoring; n_jobs=-1 uses all cores.
cm_clf = SGDClassifier(loss='log', penalty='elasticnet',
                       max_iter=SGD_iterations, random_state=28)
y_pred = cross_val_predict(estimator=cm_clf, X=X_nongrams, y=y_enc,
                           cv=10, n_jobs=-1)

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = metrics.confusion_matrix(y_true=y_enc, y_pred=y_pred)
print(cm)

Overall, looks like the classifier performance is pretty weak with respect to precision. There are very few true positives, and a lot of false negatives (not surprising given the significant imbalance). On the other hand, there is a non-trivial amount of false positives (1627), so we will explore if the n-grams improve performance. 

In [None]:
# Same elastic-net logistic regression as the baseline, now evaluated on
# X_full2 (meta-features + n-grams, without the product/state indicators).
# The classifier keeps random_state=28; the splitter is seeded with 41.
ngram_clf = SGDClassifier(loss='log', penalty='elasticnet',
                          max_iter=SGD_iterations, random_state=28)
ngram_cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=41)
scores_full = cross_val_score(estimator=ngram_clf, X=X_full2, y=y_enc,
                              cv=ngram_cv, scoring='precision', n_jobs=1)

# Average and standard deviation of precision across the splits.
print('Precision: {} +/- {}'.format(scores_full.mean(), scores_full.std()))

In [None]:
# Per-split precision values behind the mean / standard deviation printed above.
scores_full

As before, let's have a look at the confusion matrix to see where the classifier is making errors. 

In [None]:
# Out-of-fold predictions for the n-gram model (random_state=41), using
# integer cv=10 (stratified 10-fold for a classifier) on a single core.
ngram_cm_clf = SGDClassifier(loss='log', penalty='elasticnet',
                             max_iter=SGD_iterations, random_state=41)
y_pred = cross_val_predict(estimator=ngram_cm_clf, X=X_full2, y=y_enc,
                           cv=10, n_jobs=1)

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = metrics.confusion_matrix(y_true=y_enc, y_pred=y_pred)
print(cm)

The confusion matrix here shows better performance than when we only used the meta-features. We see here that 2060 complaints are correctly identified as receiving relief, but 61086 complaints that received relief are missed (false negatives). SVMs have been shown to work well for text classification, so let's see how an SVM with a linear kernel performs. Due to the rather significant increase in performance when using the n-grams in the elastic net model, we'll move forward with those.

In [None]:
# Linear SVM trained by SGD (hinge loss with an L2 penalty), evaluated on the
# same features (X_full2) and the same stratified shuffle splits as the
# elastic-net model, so the precision values are directly comparable.
svm_clf = SGDClassifier(loss='hinge', penalty='l2',
                        max_iter=SGD_iterations, random_state=41)
svm_cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=41)
scores_full_svm = cross_val_score(estimator=svm_clf, X=X_full2, y=y_enc,
                                  cv=svm_cv, scoring='precision', n_jobs=1)

# Average and standard deviation of precision across the splits.
print('Precision: {} +/- {}'.format(scores_full_svm.mean(), scores_full_svm.std()))

Results are pretty bad, much worse than when we used regularized regression. With more time, we could consider other classifiers, such as neural networks, but for now, we'll focus on the elastic-net logistic regression model because (a) it is easy to understand and (b) results could improve when we tune the hyperparameters. 

# Training the Hyperparameters

We first need to tune the hyperparameters to determine whether this improves the predictive power of the model. We'll use nested cross-validation to learn the optimized hyperparameters. Because we are using the elastic net, we need to determine the mixing parameter (l1_ratio) and the optimal penalty parameter (alpha, but traditionally referred to as lambda).

In [None]:
# Candidate values for the two elastic-net hyperparameters:
#   alpha    - penalty strength, 10 values log-spaced from 1e-6 to 1e-1
#   l1_ratio - L1/L2 mixing weight, 10 values evenly spaced from 0.05 to 0.95
param_distn = {
    'alpha': np.logspace(-6, -1, 10),
    'l1_ratio': np.linspace(0.05, 0.95, 10),
}

In [None]:
# Set up a randomized search over the hyperparameters for efficiency: only a
# sample of the alpha / l1_ratio combinations is tried (n_iter is left at its
# default). Stratified shuffle splits are used as before due to the imbalance.
# random_state is set on the search itself so that the sampled candidates are
# reproducible, like every other seeded step in this notebook.
inner_cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=41)
grid_search = RandomizedSearchCV(
    estimator=SGDClassifier(loss='log', penalty='elasticnet', max_iter=SGD_iterations,
                            random_state=41),
    param_distributions=param_distn,
    cv=inner_cv,
    scoring='precision',
    n_jobs=1,
    random_state=41)

# Perform the nested cross-validation: for each outer split the whole search
# (inner loop) is run on the training part and the tuned model is scored on
# the held-out part, so the scores are not biased by the tuning itself.
outer_cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=41)
scores = cross_val_score(estimator=grid_search, X=X_full2, y=y_enc,
                         cv=outer_cv, scoring='precision')

In [None]:
#lets look at the scores to see if performance improved
#scores

# Fitting the Final Model and Saving It

We'll use logistic regression with the elastic net to classify complaints as receiving relief or not receiving relief. To do so, we'll fit the models using the entire data set and the optimal hyperparameters computed previously.

In [None]:
# Run the randomized hyperparameter search on the entire data set to
# identify the optimal alpha and l1_ratio.
grid_search.fit(X_full2, y_enc)

# The search was created with the default refit=True, so fit() has already
# retrained a copy of its estimator (same loss, penalty, max_iter and
# random_state=41) with the best alpha / l1_ratio on all of X_full2.
# Reuse that fitted model instead of constructing and fitting an identical
# classifier a second time.
clf_full = grid_search.best_estimator_


Save it in a pickle so that we can use it in the Flask App.

In [None]:
# Persist the fitted classifier to 'trained_classifier.pkl' for the Flask App.
joblib.dump(clf_full, 'trained_classifier.pkl')