In [1]:
#Standard packages
import pandas as pd
import numpy as np
import matplotlib

#Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
# Allow plots in Notebook
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split

# Load the complaints data and hold out part of it for testing.
# The CSV location can be overridden with the CASE_STUDY_DATA environment
# variable; the fallback is the path this notebook was originally written with.
import os

DATA_PATH = os.environ.get("CASE_STUDY_DATA", "D:/Documents/Data/case_study_data_sm.csv")
TEST_SIZE = 0.3     # fraction of rows reserved for the test split
SPLIT_SEED = 1234   # fixed seed so the split is the same on every run

dta = pd.read_csv(DATA_PATH)
# Keep only the label and the free-text column, as a frame detached from dta.
corps = dta[['product_group', 'text']].copy()
X = corps.text
y = corps.product_group
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
# Give every product group an integer id and build lookup tables in both
# directions.
# .copy() makes df an independent frame, so adding a column below does not
# write into a slice of corps (avoids SettingWithCopyWarning).
df = corps[pd.notnull(corps['product_group'])].copy()
df['category_id'] = df['product_group'].factorize()[0]  # ids follow order of first appearance
category_id_df = df[['product_group', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)  # product_group -> category_id
id_to_category = dict(category_id_df[['category_id', 'product_group']].values)  # category_id -> product_group
df.head()

Unnamed: 0,product_group,text,category_id
0,loan,Two private loans have with them very discharg...,0
1,loan,attach a letter dated explaining dropped the ...,0
2,credit_reporting,Please see attached Complaint Number against c...,1
3,loan,feel as though 've been subjected to predator...,0
4,loan,a veteran living on social security and cosi...,0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF representation of the training documents: English stop words removed,
# uni- to tri-grams, terms kept only if they occur in at least 5 documents,
# sublinear term-frequency scaling and L2-normalised rows.
#DO NOT CHANGE#
tfidf= TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words='english')
features = tfidf.fit_transform(X_train) #CORRECT TRANSFORM
labels = y_train #CORRECT LABEL

In [None]:
# Bag-of-words encoding. The vocabulary is learned from the training split and
# then applied unchanged to both splits, so train and test matrices share the
# same columns.
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(X_train)  # fit() returns the vectorizer itself

# A second default CountVectorizer fitted on X_train would learn the identical
# vocabulary, so reuse the fitted object instead of tokenising the data again.
bow_transformer_train = bow_transformer

# NOTE(review): vocabulary learned from the TEST split. Kept only so the name
# stays defined; it must not be used to build features for a model.
bow_transformer_test = CountVectorizer().fit(X_test)

text_bow_train = bow_transformer.transform(X_train)  # ONLY TRAINING DATA
text_bow_test = bow_transformer.transform(X_test)    # ONLY TEST DATA

In [None]:
from io import StringIO

# Rebuild the product-group lookup tables, this time storing the integer code
# in a 'complaint_id' column.
# .copy() makes df an independent frame, so adding a column below does not
# write into a slice of corps (avoids SettingWithCopyWarning).
df = corps[pd.notnull(corps['product_group'])].copy()
df['complaint_id'] = df['product_group'].factorize()[0]  # ids follow order of first appearance
category_id_df = df[['product_group', 'complaint_id']].drop_duplicates().sort_values('complaint_id')
category_to_id = dict(category_id_df.values)  # product_group -> complaint_id
id_to_category = dict(category_id_df[['complaint_id', 'product_group']].values)  # complaint_id -> product_group

## Classifier Comparison

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn import metrics

### Refresh Train and Test BOWs

In [None]:
# Encode the training documents as bag-of-words count vectors, turning the
# raw text into a numeric matrix.
text_bow_train = bow_transformer.transform(X_train)  # training rows only

In [None]:
# Encode the test documents as bag-of-words count vectors, turning the raw
# text into a numeric matrix.
text_bow_test = bow_transformer.transform(X_test)  # test rows only

### Define Classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# One seed shared by every estimator that shuffles or samples internally, so
# repeated runs of the comparison produce the same scores.
MODEL_SEED = 0

# Candidate classifiers to compare; commented-out entries are disabled.
models = [
    #RandomForestClassifier(n_estimators=200, max_depth=3, random_state=MODEL_SEED),
    LinearSVC(random_state=MODEL_SEED),
    MultinomialNB(),  # takes no random_state
    SGDClassifier(random_state=MODEL_SEED),
    RidgeClassifier(tol=1e-2, solver="sag", random_state=MODEL_SEED),  # 'sag' is a stochastic solver
    Perceptron(max_iter=25, random_state=MODEL_SEED),
    #KNeighborsClassifier(n_neighbors=10),
    LogisticRegression(random_state=MODEL_SEED),
]

### Execute the Classifiers

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

CV = 5  # number of cross-validation folds

# Cross-validate every candidate and collect one (model, fold, accuracy)
# record per fold; the DataFrame is built once from the finished list.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the fold accuracies per model, with the individual folds
# overlaid as points.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='model_name', y='accuracy', data=cv_df, ax=ax)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, hue='model_name',
              size=12, jitter=True, edgecolor="gray", linewidth=2, ax=ax)
ax.set_title('Classifier Accuracy Comparison', fontsize=14)
ax.set_ylabel('Accuracy', fontsize=14)
ax.set_xlabel('Classifier Name', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=14)
plt.show()

### Analysis of Model Performance

In [None]:
# Mean cross-validated accuracy of each classifier across its folds.
cv_df.groupby('model_name')['accuracy'].mean()

### BOW Analysis with a Support Vector Machine (SVM)

In [None]:
# Linear SVM with default settings, trained on the bag-of-words training matrix.
svm_clf = LinearSVC()
svm_clf.fit(X=text_bow_train, y=y_train)

In [None]:
# Mean accuracy on the same rows the model was fitted on (training accuracy).
svm_score = svm_clf.score(X=text_bow_train, y=y_train)
svm_score

In [None]:
# Predicted labels and mean accuracy on the held-out test rows.
y_pred = svm_clf.predict(X=text_bow_test)
y_pred_score = svm_clf.score(X=text_bow_test, y=y_test)
y_pred_score

### Confusion Matrices

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix orders its rows and columns by sorted label value unless a
# label order is passed in. Take the order from the fitted classifier and use
# that one sequence for both the matrix and the tick labels, so every row and
# column is annotated with the class it actually represents.
class_names = svm_clf.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.tick_params(labelsize=12)  # set after heatmap() has created the tick labels
ax.set_title('Product Group Complaints Confusion Matrix', fontsize=15)
ax.set_ylabel('Actual', fontsize=12)
ax.set_xlabel('Predicted', fontsize=12)
plt.show()

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Accuracy on the test split, shown as a percentage.
result = svm_clf.score(text_bow_test, y_test)
print("Accuracy: %.3f%%" % (result*100.0))

# Macro-averaged F1, precision and recall for the test predictions.
y_pred = svm_clf.predict(text_bow_test)
macro_metrics = (
    ("F1 Score: ", f1_score),
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
)
for caption, metric_fn in macro_metrics:
    print(caption, metric_fn(y_test, y_pred, average="macro"))


### Classification Report (Precision, Recall and F1–Score)

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics for the test-split predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))