In [1]:
#Standard packages
import pandas as pd
import numpy as np
import matplotlib

# Scikit Learn
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB

#Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

#Plotting
from matplotlib import pyplot as plt
# Allow plots in Notebook
%matplotlib inline

In [2]:
# Import the transcript file and assign variable names to the columns
df = pd.read_csv("D:/Documents/Data/case_study_data_tiny.csv")
y = df['product_group']
X = df['text']

In [6]:
df_sub = df
df_sub = df_sub[pd.notnull(df['product_group'])]
df_sub.columns = ['complaint_id','product_group','text']
df_sub['category_id'] = df_sub['product_group'].factorize()[0]
category_id_df = df_sub[['product_group', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product_group']].values)

from sklearn import metrics
vectorizer = CountVectorizer()
y_test_counts = vectorizer.fit_transform(y_test)
features = vectorizer.get_feature_names()
labels = df_sub['category_id']

In [9]:
# Get train and test sets (sampled from transcript file)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
#X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.33, random_state=0)


## BOW SETUP

In [10]:
# Create BOW in one step
# Set up the BOW Transformer
vectorizer = CountVectorizer()
bow_transformer_train=CountVectorizer().fit(X_train)
bow_transformer_test=CountVectorizer().fit(X_test)

# Generate the BOW
bow_transformer=vectorizer.fit(X_train)
text_bow_train=bow_transformer.transform(X_train)#ONLY TRAINING DATA
text_bow_test=bow_transformer.transform(X_test)#ONLY TEST DATA

## Classifier Generator

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC

models = [
    #RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    SGDClassifier(),
    RidgeClassifier(tol=1e-2, solver="sag"),
    Perceptron(max_iter=25),
    KNeighborsClassifier(n_neighbors=10),
    LogisticRegression(random_state=0),
]

  from numpy.core.umath_tests import inner1d


## Classifier Comparison Plot

In [13]:
import seaborn as sns
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
features = vectorizer.get_feature_names()
labels = y_train #CORRECT LABEL
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
plt.figure(figsize=[12,6])
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, hue='model_name',
              size=12, jitter=True, edgecolor="gray", linewidth=2)
plt.title('Classifier Accuracy Comparison', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.xlabel('Classifier Name', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [37009, 34999]

In [None]:
# Model comparison scores
cv_df.groupby('model_name').accuracy.mean()

In [None]:
# Instantiate the best model
svm_clf = LinearSVC()
svm_clf.fit(text_bow_train, y_train)

In [None]:
# CREATE THE CONFUSION MATRIX HEATMAP
# mnb_class = MultinomialNB()
import seaborn as sns
# Make predictions with the test set
y_pred = svm_clf.predict(text_bow_test)
# Extract the features and name them target_names
features = vectorizer.get_feature_names()
print(metrics.classification_report(y_test, predicted, target_names=features))
# Load the confusion matrix library and create a confusion matrix 
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
# Create a heatmap chart of the confusion matrix using the seaborn library
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=features, yticklabels=features)
# Label the axes and disply the heatmap
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()