### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import nltk.data
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

### Define useful functions

In [12]:
def text_process(review_sentence):
    return [word.lower() for word in review_sentence.split()]

### Import dataset

In [50]:
df_negative_sentences_tagged = pd.read_csv('./datasets/df_negative_sentences_part1_tagged.csv',lineterminator='\n')

### Drop NaN for review_topic

In [51]:
df_negative_sentences_tagged_subset = df_negative_sentences_tagged[df_negative_sentences_tagged['review_topic'].notna()]

In [52]:
df_negative_sentences_tagged_subset.groupby('review_topic').describe()

Unnamed: 0_level_0,review_score_badge,review_score_badge,review_score_badge,review_score_badge,review_score_badge,review_score_badge,review_score_badge,review_score_badge,review_label,review_label,review_label,review_label,review_label,review_label,review_label,review_label
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
review_topic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
AH,10.0,8.07,1.36955,6.7,6.7,8.15,8.8,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B,20.0,7.97,1.388714,5.0,7.1,7.5,9.05,10.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BF,22.0,8.204545,1.248419,5.4,7.5,8.0,9.0,10.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F,11.0,8.309091,0.897167,7.1,7.95,8.3,8.55,10.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H,29.0,8.134483,1.682531,2.5,7.5,8.0,9.2,10.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L,8.0,8.2,1.35119,5.4,7.5,8.8,9.0,9.6,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N,23.0,7.752174,1.518932,5.0,6.7,7.5,9.1,10.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O,39.0,9.115385,1.406851,2.5,8.55,10.0,10.0,10.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P,4.0,9.2,0.864099,8.0,8.9,9.4,9.7,10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R,42.0,7.904762,1.703075,2.5,7.1,8.15,9.15,10.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling

## Create model pipeline: Vectorization (BoW), TF-IDF, Naive Bayes Classifier

In [60]:
model_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    #('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB())  # train on TF-IDF vectors w/ Naive Bayes classifier
])

## Train-validation split

In [61]:
df_features = df_negative_sentences_tagged_subset.drop('review_topic', axis=1)

In [62]:
df_train, df_valid, label_train, label_valid = train_test_split(df_features, df_negative_sentences_tagged_subset['review_topic'], test_size=0.2)

## Train model

In [63]:
model_pipeline.fit(df_train['review_sentence'],label_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f09db9c5040>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

## Validate model

In [64]:
predictions = model_pipeline.predict(df_valid['review_sentence'])

## Print classification report

In [65]:
print(classification_report(label_valid,predictions))

              precision    recall  f1-score   support

          AH       0.00      0.00      0.00         2
           B       0.00      0.00      0.00         2
          BF       0.00      0.00      0.00         5
           F       0.00      0.00      0.00         2
           H       0.00      0.00      0.00         2
           L       0.00      0.00      0.00         1
           N       0.33      0.25      0.29         4
           O       0.71      0.56      0.63         9
           P       0.00      0.00      0.00         2
           R       0.41      0.85      0.55        13
          SM       0.00      0.00      0.00         1
          ST       0.00      0.00      0.00         2

    accuracy                           0.38        45
   macro avg       0.12      0.14      0.12        45
weighted avg       0.29      0.38      0.31        45



  _warn_prf(average, modifier, msg_start, len(result))


## Print confusion matrix

In [66]:
valid_cm = confusion_matrix(label_valid,predictions)

In [67]:
valid_cm

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  4,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  5,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  0,  0, 11,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0]])