In [6]:
import string
import re

from matplotlib import pyplot as plt
from matplotlib import patches as mpatches
import seaborn as sns

class DataVisualizer:
	"""Turns evaluation results into PNG charts saved under ``plots/``.

	Every method draws one figure, writes it to disk and closes it; none of
	them returns a value.
	"""

	def plotConfusionMatrix(self, cm, labels, clf_name):
		"""Save a confusion matrix as an annotated heatmap.

		cm -- matrix handed to seaborn's heatmap as-is.
		labels -- tick labels applied to both axes.
		clf_name -- classifier name shown in the title; lower-cased and with
			spaces replaced by underscores it also forms the file name.
		"""
		figure, axes = plt.subplots()
		sns.heatmap(cm, annot = True, ax = axes, fmt = 'g')

		axes.set_xlabel('Predicted labels')
		axes.set_ylabel('Actual labels')
		axes.set_title('Confusion Matrix of {} Classifier'.format(clf_name))
		axes.xaxis.set_ticklabels(labels)
		axes.yaxis.set_ticklabels(labels, rotation = 0)
		plt.tight_layout()

		file_stem = clf_name.lower().replace(' ', '_')
		figure.savefig('plots/cm_{}.png'.format(file_stem))
		plt.close()

	def plotClassificationReport(self, cr, labels, clf_name):
		"""Save the numeric part of a textual classification report as a heatmap.

		cr -- report text. The first two lines and the last one are skipped;
			of the rest only rows whose first field is 'negative', 'positive'
			or 'weighted avg' are kept, and their last four fields are read
			as floats.
		labels -- class names; 'weighted avg' is appended for the y axis.
		clf_name -- classifier name used in the title and the file name.
		"""
		kept_rows = ('negative', 'positive', 'weighted avg')
		cr_mat = []

		for raw_line in cr.split('\n')[2 : -1]:
			stripped = raw_line.strip()
			if not stripped:
				continue
			# Columns of the report are separated by runs of two or more blanks.
			cells = re.split(r'\s{2,}', stripped)
			if cells[0] in kept_rows:
				cr_mat.append([float(cells[pos]) for pos in (-4, -3, -2, -1)])

		metric_ticks = ['precision', 'recall', 'f1-score', 'support']
		class_ticks = labels + ['weighted avg']

		figure, axes = plt.subplots()
		sns.heatmap(cr_mat, annot = True, ax = axes, fmt = 'g')

		axes.set_xlabel('Metrics')
		axes.set_ylabel('Classes')
		axes.set_title('Classification Report of {} Classifier'.format(clf_name))
		axes.xaxis.set_ticklabels(metric_ticks)
		axes.yaxis.set_ticklabels(class_ticks, rotation = 0)
		plt.tight_layout()

		file_stem = clf_name.lower().replace(' ', '_')
		figure.savefig('plots/cr_{}.png'.format(file_stem))
		plt.close()

	def plotClassifierPerformanceComparison(self, metrics_df, clf_names, strategy):
		"""Save a grouped bar chart of metric values per classifier.

		metrics_df -- long-format data with 'Metrics', 'value' and 'Classifier'
			columns (these are the names passed to seaborn's barplot).
		clf_names -- accepted for interface symmetry; not used here.
		strategy -- label appended to the title; the value 'K-Fold' selects
			the k-fold output file, anything else the default one.
		"""
		figure, axes = plt.subplots()
		sns.barplot(x = 'Metrics', y = 'value', data = metrics_df, ax = axes, hue = 'Classifier')

		axes.set_xlabel('Evaluation Metrics')
		axes.set_ylabel("Classifier's performance")
		axes.set_title("Overall Comparison of Classifier's Performance (" + strategy + ')')

		# Re-apply the current axes rectangle before placing the legend.
		box = axes.get_position()
		axes.set_position([box.x0, box.y0, box.width, box.height])
		plt.legend(bbox_to_anchor = (1, 0.5), loc = 'best')
		plt.tight_layout()

		target = 'plots/classifiers_vs_metrics_kfold.png' if strategy == 'K-Fold' else 'plots/classifiers_vs_metrics.png'
		figure.savefig(target)
		plt.close()

	def plotClassifiersVsFeatures(self, data, clf_names, colors):
		"""Save one point plot per classifier of accuracy against feature count.

		data -- one mapping per classifier with 'x' and 'y' entries.
		clf_names -- legend labels, parallel to ``data``.
		colors -- plot colours, parallel to ``data``.
		"""
		figure, axes = plt.subplots()
		legend_handles = []
		for series, colour, label in zip(data, colors, clf_names):
			sns.pointplot(x = 'x', y = 'y', data = series, ax = axes, color = colour)
			# The legend is assembled by hand from colour patches.
			legend_handles.append(mpatches.Patch(color = colour, label = label))

		axes.legend(handles = legend_handles, bbox_to_anchor = (1, 0.5), loc = 'best')
		axes.set_xlabel('K-Best Features')
		axes.set_ylabel('Classification Accuracy Scores')
		axes.set_title("Comparison of Classifier's Performance over Selected Features")
		figure.savefig('plots/classifiers_vs_features.png')
		plt.close()

In [7]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import sent_tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag

class NltkPreprocessor:
	"""Normalises raw text into a single space-joined string of lemmas.

	The text is split into sentences, tokenised with ``wordpunct_tokenize``,
	POS-tagged, optionally lower-cased/stripped, filtered for stop words and
	pure-punctuation tokens, and finally lemmatised with WordNet.
	"""

	def __init__(self, stopwords = None, punct = None, lower = True, strip = True):
		"""Configure the preprocessor.

		stopwords -- iterable of tokens to drop; None selects NLTK's English
			stop-word list.
		punct -- iterable of punctuation characters; None selects
			``string.punctuation``.
		lower -- lower-case every token when true.
		strip -- strip surrounding whitespace, underscores and digits from
			every token when true.
		"""
		self.lower = lower
		self.strip = strip
		# Honour caller-supplied collections; both arguments used to be
		# accepted and then silently ignored in favour of the defaults.
		self.stopwords = set(sw.words('english')) if stopwords is None else set(stopwords)
		self.punct = set(string.punctuation) if punct is None else set(punct)
		self.lemmatizer = WordNetLemmatizer()

	def tokenize(self, document):
		"""Return the cleaned, lemmatised tokens of ``document`` joined by spaces."""
		tokenized_doc = []
		for sent in sent_tokenize(document):
			for token, tag in pos_tag(wordpunct_tokenize(sent)):
				if self.lower:
					token = token.lower()
				if self.strip:
					token = token.strip().strip('_0123456789')

				# Tokens emptied by stripping (e.g. plain numbers) carry no signal.
				if not token or token in self.stopwords:
					continue

				if all(char in self.punct for char in token):
					continue

				tokenized_doc.append(self.lemmatize(token, tag))

		return ' '.join(tokenized_doc)

	def lemmatize(self, token, tag):
		"""Lemmatise ``token`` using the WordNet POS derived from ``tag``.

		Only the first character of the tag is inspected (N, V, R, J);
		anything else, including an empty tag, falls back to noun.
		"""
		wordnet_pos = {
			'N': wn.NOUN,
			'V': wn.VERB,
			'R': wn.ADV,
			'J': wn.ADJ
		}.get(tag[:1], wn.NOUN)

		return self.lemmatizer.lemmatize(token, wordnet_pos)

[nltk_data] Downloading package stopwords to C:\Users\Rammohan
[nltk_data]     rao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from time import time
import ast
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp



from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

class SentimentAnalyzer:
	"""Trains and evaluates several text classifiers on product reviews.

	Attributes:
		clf -- list of (short name, unfitted estimator) pairs.
		clf_names -- display names, in the same order as ``clf``.
	"""

	def __init__(self):
		self.clf = [
			('MNB', MultinomialNB(alpha = 1.0, fit_prior = False)),
			('LR', LogisticRegression(C = 5.0, penalty = 'l2', solver = 'liblinear', max_iter = 100, dual = True)),
			('SVM', LinearSVC(C = 0.55, penalty = 'l2', max_iter = 1000, dual = True)),
			('RF', RandomForestClassifier(n_jobs = -1, n_estimators = 100, min_samples_split = 40, max_depth = 90, min_samples_leaf = 3))
		]
		self.clf_names = ['Multinomial NB', 'Logistic Regression', 'Linear SVC', 'Random Forest']

	@staticmethod
	def _dumpPickle(obj, path):
		"""Pickle ``obj`` to ``path``."""
		with open(path, 'wb') as handle:
			pickle.dump(obj, handle)

	@staticmethod
	def _loadPickle(path):
		"""Unpickle and return the object stored at ``path``."""
		# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
		with open(path, 'rb') as handle:
			return pickle.load(handle)

	def _featuresAndLabels(self, reviews_df_preprocessed):
		"""Return (X, y): the last column as features, the second-to-last as labels."""
		X = reviews_df_preprocessed.iloc[:, -1].values
		y = reviews_df_preprocessed.iloc[:, -2].values
		return X, y

	def getInitialData(self, data_file, do_pickle):
		"""Read reviews from ``data_file`` into a DataFrame.

		Every non-blank line is parsed with ``ast.literal_eval`` and becomes
		one row. When ``do_pickle`` is true the frame is also written to
		'pickled/product_reviews.pickle'.

		Returns the DataFrame, so the work is not lost when ``do_pickle`` is
		false (callers that ignore the return value are unaffected).
		"""
		print('Fetching initial data...')
		t = time()

		df = {}
		with open(data_file, 'r') as file_handler:
			# Iterate lazily and skip blank lines, which literal_eval cannot parse.
			for review in file_handler:
				if review.strip():
					df[len(df)] = ast.literal_eval(review)

		reviews_df = pd.DataFrame.from_dict(df, orient = 'index')
		if do_pickle:
			reviews_df.to_pickle('pickled/product_reviews.pickle')

		print('Fetching data completed!')
		print('Fetching time: ', round(time()-t, 3), 's\n')

		return reviews_df

	def preprocessData(self, reviews_df, do_pickle):
		"""Label and clean the raw reviews.

		Drops 'reviewSummary', removes 3-star reviews, adds a 'sentiment'
		column (1 for ratings >= 4, else 0) and a 'cleaned' column holding
		``NltkPreprocessor.tokenize`` of 'reviewText', computed in a process
		pool. When ``do_pickle`` is true the result is written to
		'pickled/product_reviews_preprocessed.pickle'.

		The caller's frame is left unmodified; the new frame is returned.
		"""
		print('Preprocessing data...')
		t = time()

		# drop() without inplace yields a new frame, so the argument is not mutated.
		reviews_df = reviews_df.drop(columns = ['reviewSummary'])
		reviews_df['reviewRating'] = reviews_df.reviewRating.astype('int')

		reviews_df = reviews_df[reviews_df.reviewRating != 3] # Ignoring 3-star reviews -> neutral

		reviews_df = reviews_df.assign(sentiment = np.where(reviews_df['reviewRating'] >= 4, 1, 0)) # 1 -> Positive, 0 -> Negative

		nltk_preprocessor = NltkPreprocessor()

		with mp.Pool() as pool:
			reviews_df = reviews_df.assign(cleaned = pool.map(nltk_preprocessor.tokenize, reviews_df['reviewText'])) # Parallel processing

		if do_pickle:
			reviews_df.to_pickle('pickled/product_reviews_preprocessed.pickle')

		print('Preprocessing data completed!')
		print('Preprocessing time: ', round(time()-t, 3), 's\n')

		return reviews_df

	def trainTestSplit(self, reviews_df_preprocessed):
		"""Split into 80% train / 20% test (shuffled, random_state 42).

		Returns X_train, X_test, y_train, y_test.
		"""
		print('Splitting data using Train-Test split...')
		t = time()

		X, y = self._featuresAndLabels(reviews_df_preprocessed)
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, shuffle = True)

		print('Splitting data completed!')
		print('Splitting time: ', round(time()-t, 3), 's\n')

		return X_train, X_test, y_train, y_test

	def kFoldSplit(self, reviews_df_preprocessed):
		"""Prepare a shuffled 5-fold split (random_state 42).

		Returns (train_test_indices, X, y) where ``train_test_indices`` is the
		object returned by ``KFold.split`` and yields (train_idx, test_idx).
		"""
		print('Splitting data using K-Fold Cross Validation...')
		t = time()

		X, y = self._featuresAndLabels(reviews_df_preprocessed)
		kf = KFold(n_splits = 5, random_state = 42, shuffle = True)
		train_test_indices = kf.split(X, y)

		print('Splitting data completed!')
		print('Splitting time: ', round(time()-t, 3), 's\n')

		return train_test_indices, X, y

	def trainData(self, X_train, y_train, classifier, num_features = 1000000):
		"""Fit one TF-IDF -> chi2 SelectKBest -> classifier pipeline per entry.

		classifier -- iterable of (name, estimator) pairs.
		num_features -- ``k`` passed to SelectKBest.

		Returns (pipeline, model): the list of Pipeline objects and a list of
		(name, value returned by ``Pipeline.fit``) pairs. The vectorizer and
		selector instances are shared by all returned pipelines.
		"""
		pipeline = []
		model = []

		base_steps = [
			('vect', TfidfVectorizer(ngram_range = (1,2), use_idf = True, sublinear_tf = True, lowercase = False, stop_words = None, preprocessor = None)),
			('select_best', SelectKBest(score_func = chi2, k = num_features))
		]

		for name, clf in classifier:
			# Give each Pipeline its own steps list. Appending to and popping from
			# one shared list aliases every pipeline to the same object.
			pl = Pipeline(base_steps + [('clf', clf)])
			pipeline.append(pl)

			print('Training data... Classifier ' + str(name))
			t = time()

			model.append((name, pl.fit(X_train, y_train)))

			print('Training data completed!')
			print('Training time: ', round(time()-t, 3), 's\n')

		return pipeline, model

	def predictData(self, X_test, model):
		"""Return a list of (name, predictions) for every (name, fitted model) pair."""
		prediction = []

		for name, m in model:
			print('Predicting Test data... Classifier ' + str(name))
			t = time()

			prediction.append((name, m.predict(X_test)))

			print('Prediction completed!')
			print('Prediction time: ', round(time()-t, 3), 's\n')

		return prediction

	def evaluate(self, y_test, prediction):
		"""Score every classifier's predictions against ``y_test``.

		Returns seven parallel lists, one entry per classifier in the order of
		``prediction``: accuracy, precision, recall, F1, ROC AUC, confusion
		matrix and the textual classification report (6 digits, classes named
		'negative' and 'positive').
		"""
		clf_accuracy, clf_precision, clf_recall, clf_f1 = [], [], [], []
		clf_roc_auc, clf_cm, clf_cr = [], [], []

		for name, pred in prediction:
			print('Evaluating results... Classifier ' + str(name))
			t = time()

			clf_accuracy.append(accuracy_score(y_test, pred))
			clf_precision.append(precision_score(y_test, pred))
			clf_recall.append(recall_score(y_test, pred))
			clf_f1.append(f1_score(y_test, pred))
			clf_roc_auc.append(roc_auc_score(y_test, pred))
			clf_cm.append(confusion_matrix(y_test, pred))
			clf_cr.append(classification_report(y_test, pred, target_names = ['negative', 'positive'], digits = 6))

			print('Results evaluated!')
			print('Evaluation time: ', round(time()-t, 3), 's\n')

		return clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc, clf_cm, clf_cr

	def holdoutStrategy(self, reviews_df_preprocessed, do_pickle, do_train_data):
		"""Evaluate all classifiers on a single hold-out split.

		do_train_data -- when true, split the data and fit the models now;
			otherwise load the test split and the fitted models from the files
			under 'pickled/'.
		do_pickle -- when true, persist the freshly trained artefacts (only if
			training happened) and the resulting metrics under 'pickled/'.

		Prints the per-classifier reports and returns the metrics DataFrame.
		"""
		print('\nHoldout Strategy...\n')

		if do_train_data:
			X_train, X_test, y_train, y_test = self.trainTestSplit(reviews_df_preprocessed)
			pipeline, model = self.trainData(X_train, y_train, self.clf)

			# Persist only what was just produced. Pickling without training used
			# to fail on undefined names.
			if do_pickle:
				self._dumpPickle(X_train, 'pickled/features_train.pickle')
				self._dumpPickle(X_test, 'pickled/features_test.pickle')
				self._dumpPickle(y_train, 'pickled/labels_train.pickle')
				self._dumpPickle(y_test, 'pickled/labels_test.pickle')
				self._dumpPickle(pipeline, 'pickled/pipeline_holdout.pickle')
				self._dumpPickle(model, 'pickled/model_holdout.pickle')
		else:
			# Nothing was trained in this run: fall back to the stored artefacts.
			X_test = self._loadPickle('pickled/features_test.pickle')
			y_test = self._loadPickle('pickled/labels_test.pickle')
			model = self._loadPickle('pickled/model_holdout.pickle')

		prediction = self.predictData(X_test, model)
		clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc, clf_cm, clf_cr = self.evaluate(y_test, prediction)

		if do_pickle:
			self._dumpPickle(clf_cm, 'pickled/metrics_cm_holdout.pickle')
			self._dumpPickle(clf_cr, 'pickled/metrics_cr_holdout.pickle')

		metrics_list = {
			'Classifier': self.clf_names,
			'Accuracy': clf_accuracy,
			'Precision': clf_precision,
			'Recall': clf_recall,
			'F1-score': clf_f1,
			'ROC AUC': clf_roc_auc
		}

		metrics_df = pd.DataFrame.from_dict(metrics_list)

		for i in range(0, len(self.clf)):
			if i == 0:
				print('======================================================\n')
			print('Evaluation metrics of Classifier ' + self.clf_names[i] + ':')
			print('Confusion Matrix: \n{}\n'.format(clf_cm[i]))
			print('Classification Report: \n{}'.format(clf_cr[i]))
			print('======================================================\n')

		print('Comparison of different metrics for the various Classifiers used:\n')
		print(metrics_df)

		if do_pickle:
			self._dumpPickle(metrics_df, 'pickled/metrics_dataframe.pickle')

		return metrics_df

	def crossValidationStrategy(self, reviews_df_preprocessed, do_pickle):
		"""Evaluate all classifiers with k-fold cross validation.

		For every fold the classifiers are trained, used for prediction and
		scored. The per-fold scores are averaged per classifier and the
		confusion matrices are summed over the folds.

		Prints the averaged metrics, optionally pickles the comparison table
		to 'pickled/metrics_dataframe_kfold.pickle' and returns that table.
		"""
		print('\nK-Fold Cross Validation Strategy...\n')

		train_test_indices, X, y = self.kFoldSplit(reviews_df_preprocessed)

		n_clf = len(self.clf)
		metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']
		# fold_scores[metric][j] collects classifier j's score on every fold.
		fold_scores = {metric: [[] for _ in range(n_clf)] for metric in metric_names}
		cm = [np.zeros((2,2), dtype = 'int32') for _ in range(n_clf)]

		for train_idx, test_idx in train_test_indices:
			X_train, y_train = X[train_idx], y[train_idx]
			X_test, y_test = X[test_idx], y[test_idx]

			_, model = self.trainData(X_train, y_train, self.clf)
			prediction = self.predictData(X_test, model)
			clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc, clf_cm, _ = self.evaluate(y_test, prediction)

			per_metric = [clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc]
			for metric, values in zip(metric_names, per_metric):
				for j in range(n_clf):
					fold_scores[metric][j].append(values[j])
			for j in range(n_clf):
				cm[j] += clf_cm[j]

		# Average over the folds, one mean per classifier and metric.
		mean_scores = {
			metric: [float(np.mean(scores)) for scores in fold_scores[metric]]
			for metric in metric_names
		}

		for i in range(0, n_clf):
			if i == 0:
				print('======================================================\n')
			print('Evaluation metrics of Classifier ' + self.clf_names[i] + ':')
			for metric in metric_names:
				print('{}: {}'.format(metric, mean_scores[metric][i]))
			print('Confusion Matrix: \n{}\n'.format(cm[i]))
			print('======================================================\n')

		# The table must report the fold averages, not the scores of the last fold.
		metrics_list = {'Classifier': self.clf_names}
		for metric in metric_names:
			metrics_list[metric] = mean_scores[metric]

		metrics_df = pd.DataFrame.from_dict(metrics_list)

		print('Comparison of different metrics for the various Classifiers used:\n')
		print(metrics_df)

		if do_pickle:
			self._dumpPickle(metrics_df, 'pickled/metrics_dataframe_kfold.pickle')

		return metrics_df

In [10]:


import pickle

class Utility:
	"""Helper analyses built on top of SentimentAnalyzer.

	Attributes:
		sentiment -- a fresh SentimentAnalyzer instance.
		clf -- that analyzer's list of (name, estimator) pairs.
	"""

	def __init__(self):
		self.sentiment = SentimentAnalyzer()
		self.clf = self.sentiment.clf

	def classifiersVsFeatures(self):
		"""Measure hold-out accuracy of every classifier for several feature counts.

		Loads the pickled hold-out split from 'pickled/', retrains all
		classifiers for each value of k and records their accuracy.

		Returns one dict per classifier with 'x' (the list of feature counts)
		and 'y' (the matching accuracies).
		"""
		# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
		loaded = {}
		for stem in ('features_train', 'features_test', 'labels_train', 'labels_test'):
			with open('pickled/{}.pickle'.format(stem), 'rb') as handle:
				loaded[stem] = pickle.load(handle)
		X_train, X_test = loaded['features_train'], loaded['features_test']
		y_train, y_test = loaded['labels_train'], loaded['labels_test']

		num_features = [10000, 50000, 100000, 500000, 1000000]

		acc = [[] for _ in self.clf]

		for k in num_features:
			_, model = self.sentiment.trainData(X_train, y_train, self.clf, k)
			prediction = self.sentiment.predictData(X_test, model)
			clf_metrics = self.sentiment.evaluate(y_test, prediction)

			# The first element of the evaluate() result holds the accuracies.
			for j, accuracy in enumerate(clf_metrics[0][:len(self.clf)]):
				print(accuracy)
				acc[j].append(accuracy) # Append the accuracy of the classifier for each k

		return [{'x': num_features, 'y': scores} for scores in acc]

	def showTopFeatures(self, pipeline, n = 20):
		"""Format the n most positive and n most negative features of a fitted pipeline.

		pipeline -- fitted pipeline exposing ``named_steps`` with a 'vect'
			vectorizer, a 'clf' step that has ``coef_`` and, optionally, a
			'select_best' selector providing ``get_support()``.
		n -- number of features shown on each side.

		Returns a newline-joined string; every line pairs one positive with
		one negative (coefficient, feature) entry.
		"""
		steps = pipeline.named_steps
		vectorizer = steps['vect']
		clf = steps['clf']

		# get_feature_names() was removed in scikit-learn 1.2; use its
		# replacement when present and fall back for older versions.
		if hasattr(vectorizer, 'get_feature_names_out'):
			feature_names = list(vectorizer.get_feature_names_out())
		else:
			feature_names = list(vectorizer.get_feature_names())

		# The classifier is fitted on the columns kept by the selection step
		# only, so the vocabulary must be filtered with the same mask to stay
		# aligned with coef_.
		if 'select_best' in steps:
			mask = steps['select_best'].get_support()
			feature_names = [name for name, keep in zip(feature_names, mask) if keep]

		coefs = sorted(zip(clf.coef_[0], feature_names), reverse = True)
		topn = zip(coefs[:n], coefs[: -(n+1): -1])

		top_features = []
		for (coef_p, feature_p), (coef_n, feature_n) in topn:
			top_features.append('{:0.4f}{: >25}    {:0.4f}{: >25}'.format(coef_p, feature_p, coef_n, feature_n))

		return '\n'.join(top_features)

In [11]:


from pathlib import Path

import pickle
import pandas as pd


def analyzeVisualize(sentiment):
	"""Render every plot from the pickled hold-out and k-fold results.

	Reads the hold-out pipelines, confusion matrices, classification reports
	and both metric tables from 'pickled/', draws the per-classifier and
	comparison charts, re-runs the accuracy-vs-feature-count experiment and
	prints the 30 strongest features of the third pipeline.

	sentiment -- object whose ``clf_names`` supplies the classifier labels.
	"""
	def unpickle(path):
		# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
		with open(path, 'rb') as handle:
			return pickle.load(handle)

	def toLongFormat(frame):
		# Prefix the metric columns with 'value_' so wide_to_long can melt them.
		# The frame is modified in place.
		metric_columns = ('Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC')
		frame.rename(columns = {column: 'value_' + column for column in metric_columns}, inplace = True)
		frame['id'] = frame.index
		long_frame = pd.wide_to_long(frame, stubnames = 'value', i = 'id', j = 'id_m', sep = '_', suffix = r'[a-zA-Z0-9_\- ]+')
		long_frame['Metrics'] = long_frame.index.get_level_values('id_m')
		return long_frame

	pipeline = unpickle('pickled/pipeline_holdout.pickle')
	clf_cm = unpickle('pickled/metrics_cm_holdout.pickle')
	clf_cr = unpickle('pickled/metrics_cr_holdout.pickle')
	metrics_df = unpickle('pickled/metrics_dataframe.pickle')
	metrics_df_kfold = unpickle('pickled/metrics_dataframe_kfold.pickle')

	clf_svc = pipeline[2]
	clf_names = sentiment.clf_names
	labels = ['negative', 'positive']

	visualizer = DataVisualizer()

	for index, name in enumerate(clf_names[:min(len(clf_cm), len(clf_cr))]):
		visualizer.plotConfusionMatrix(clf_cm[index], labels, name)
		visualizer.plotClassificationReport(clf_cr[index], labels, name)

	for frame, strategy in ((metrics_df, 'Holdout'), (metrics_df_kfold, 'K-Fold')):
		visualizer.plotClassifierPerformanceComparison(toLongFormat(frame), clf_names, strategy)

	util = Utility()

	data = util.classifiersVsFeatures()
	colors = ['blue', 'yellow', 'red', 'green']
	visualizer.plotClassifiersVsFeatures(data, clf_names, colors)

	top_features = util.showTopFeatures(clf_svc, n = 30)
	print('The 30 most informative features for both positive and negative coefficients:\n')
	print(top_features)

if __name__ == "__main__":

	# Stage switches. With every switch off the cell only loads the pickled
	# hold-out models and classifies one review typed by the user.
	do_pickle = False
	do_train_data = False
	do_fetch_data = False
	do_preprocess_data = False
	do_cross_validation_strategy = False
	do_holdout_strategy = False
	do_analyze_visualize = False


	# Output folders used by the pickling and plotting code.
	Path('./pickled').mkdir(exist_ok = True)
	Path('./plots').mkdir(exist_ok = True)

	if do_fetch_data or do_preprocess_data or do_cross_validation_strategy or do_holdout_strategy or do_analyze_visualize:
		sentiment = SentimentAnalyzer()

	if do_fetch_data:
		sentiment.getInitialData('product_reviews.json', do_pickle)

	if do_preprocess_data:
		reviews_df = pd.read_pickle('pickled/product_reviews.pickle')
		sentiment.preprocessData(reviews_df, do_pickle)

	if do_cross_validation_strategy or do_holdout_strategy:
		reviews_df_preprocessed = pd.read_pickle('pickled/product_reviews_preprocessed.pickle')
		# Total number of missing cells in the preprocessed frame.
		print(reviews_df_preprocessed.isnull().values.sum())

	if do_cross_validation_strategy:
		sentiment.crossValidationStrategy(reviews_df_preprocessed, do_pickle)

	if do_holdout_strategy:
		sentiment.holdoutStrategy(reviews_df_preprocessed, do_pickle, do_train_data)

	if do_analyze_visualize:
		analyzeVisualize(sentiment)

	# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
	with open('pickled/model_holdout.pickle', 'rb') as model_holdout:
		model = pickle.load(model_holdout)

	# Third (name, fitted pipeline) pair; presumably the 'SVM' entry, given the
	# classifier order in SentimentAnalyzer -- verify against the pickle in use.
	model_svc = model[2][1]

	print('\nEnter your review:')
	user_review = input()
	# The models are fitted on NltkPreprocessor.tokenize output (the 'cleaned'
	# column built in preprocessData), so the raw input must go through the
	# same cleaning before it reaches the vectorizer.
	cleaned_review = NltkPreprocessor().tokenize(user_review)
	# predict() returns one label per input row; compare the single element.
	verdict = 'Positive' if model_svc.predict([cleaned_review])[0] == 1 else 'Negative'
	print('\nPredicted sentiment: '+ verdict)


Enter your review:
product is bad

Predicted sentiment: Negative
