# PGP AI - AI and Machine Learning Capstone Project

DESCRIPTION

Problem Statement

Amazon is an online shopping website that now caters to millions of people everywhere. Over 34,000 consumer reviews for Amazon brand products like Kindle, Fire TV Stick and more are provided. 
The dataset has attributes like brand, categories, primary categories, reviews.title, reviews.text, and the sentiment. Sentiment is a categorical variable with three levels: "Positive", "Negative", and "Neutral". For given unseen data, the sentiment needs to be predicted.
You are required to predict Sentiment or Satisfaction of a purchase based on multiple features and review text.

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
def load_data(path=""):
    """Read the CSV located at ``path`` and return it as a pandas DataFrame.

    ``path`` is handed to ``pd.read_csv`` unchanged.
    """
    frame = pd.read_csv(path)
    return frame

In [None]:
def merge_data(datasets=()):
    """Concatenate the given pandas objects row-wise with ``pd.concat``.

    Args:
        datasets: iterable of DataFrames/Series to stack. The default is an
            immutable empty tuple (instead of a shared mutable list), which
            ``pd.concat`` rejects with ``ValueError`` exactly as an empty
            list would.

    Returns:
        The concatenated object; the original index labels are kept, so
        they may repeat across the inputs.
    """
    return pd.concat(datasets)

In [None]:
def sample_data(data, row_count=5):
    """Return the leading ``row_count`` entries of ``data`` using slicing."""
    leading = slice(None, row_count)
    return data[leading]

In [None]:
def preprocess(data):
    """Clean raw review strings and return one space-joined string per record.

    Pipeline: lower-case and regex-tokenize each record, drop NLTK English
    stop words, drop tokens of length 1, then join the remaining tokens with
    single spaces. Progress messages are printed after each step and
    ``token_analysis`` is called on the token lists before returning.

    Args:
        data: iterable of review strings.

    Returns:
        list of cleaned strings, one per input record (empty string when no
        token survives).
    """
    # Lower-case before tokenizing: the pattern only starts a token on
    # [a-z], so a capitalised word such as "Great" would otherwise be
    # truncated to "reat".
    data = [regexp_tokenize(record.lower(), pattern=r'[a-z]\w+')
            for record in data]
    print("Completed Word Tokenization")

    # Build the stop-word set once instead of calling stopwords.words()
    # (and scanning the resulting list) for every single token.
    english_stopwords = set(stopwords.words('english'))
    data = [[word for word in text
             if word not in english_stopwords]
            for text in data]
    print("Completed Removal of nltk StopWords")

    data = [[word for word in text
             if len(word) > 1]
            for text in data]
    print("Completed Removal of Words of length = 1")

    data_cleaned = [" ".join(record) for record in data]

    print("Completed Joining of the cleaned Text into a record")

    print("Analysing Tokens:")
    token_analysis(data)

    return data_cleaned

In [None]:
def token_analysis(data):
    """Print the total token count and plot the ten most frequent tokens.

    Args:
        data: iterable of token sequences, one sequence per record.

    Prints ``Total Tokens: <n>`` and, when at least one token exists, shows
    a horizontal bar chart of the 10 most common tokens. Returns ``None``.
    """
    from collections import Counter
    from itertools import chain

    # Count tokens straight from the flattened stream; no intermediate list.
    counts_terms = Counter(chain.from_iterable(data))

    print("Total Tokens: {}".format(sum(counts_terms.values())))

    terms_df = pd.DataFrame(counts_terms.most_common(10),
                            columns=['term', 'count'])
    if terms_df.empty:
        # Nothing to plot for an empty corpus.
        return

    # Sorted ascending by count before drawing the horizontal bars.
    terms_df.sort_values(by='count',
                         ascending=True).plot(kind="barh",
                                              x='term',
                                              figsize=(12, 10),
                                              color='teal')
    plt.show()

In [None]:
def create_dataset(data, features, label, vectorizer):
    """Build a feature matrix and label list from two columns of ``data``.

    The ``features`` column is converted to a list and passed to
    ``vectorizer.fit_transform``; the ``label`` column is converted to a
    list. Returns the pair ``(X, y)``.
    """
    documents = data[features].tolist()
    matrix = vectorizer.fit_transform(documents)
    targets = data[label].tolist()
    return matrix, targets

In [None]:
def resample_dataset(X, y, sampler):
    """Resample ``X`` and ``y`` with the given sampler.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.
        sampler: object exposing ``fit_resample(X, y)`` (current
            imbalanced-learn API) or the legacy ``fit_sample(X, y)``.

    Returns:
        Whatever the sampler returns for ``(X, y)``.
    """
    # imbalanced-learn renamed fit_sample to fit_resample and later dropped
    # the old name; prefer the new method and fall back for old releases.
    resample = getattr(sampler, "fit_resample", None)
    if resample is None:
        resample = sampler.fit_sample
    return resample(X, y)

In [None]:
def create_train_test_datasets(X, y):
    """Split ``X`` and ``y`` into train and test parts.

    Uses ``train_test_split`` with a 30% test share and a fixed random
    state of 42, and returns its result unchanged.
    """
    from sklearn.model_selection import train_test_split

    split_options = {"test_size": 0.3, "random_state": 42}
    return train_test_split(X, y, **split_options)

In [None]:
def create_model(X, y, model):
    """Fit ``model`` on ``(X, y)`` and return the result of ``model.fit``."""
    fitted = model.fit(X, y)
    return fitted

In [None]:
def assess_model_score(model, X_test, y_test):
    """Return the weighted F1 score of ``model`` on ``(X_test, y_test)``.

    Predictions come from ``model.predict(X_test)`` and are compared to
    ``y_test`` with ``f1_score(..., average='weighted')``.
    """
    predictions = model.predict(X_test)
    return f1_score(y_test, predictions, average='weighted')

In [None]:
# Load the three CSV files from the datasets/ directory with load_data.
train_df = load_data("datasets/train_data.csv")
test_data_hidden_df = load_data("datasets/test_data_hidden.csv")
test_df = load_data("datasets/test_data.csv")

In [None]:
# Print the DataFrame.info() summary for each of the three loaded frames.
train_df.info()
test_data_hidden_df.info()
test_df.info()

In [None]:
# Stack train_df and test_data_hidden_df into a single frame via merge_data.
# NOTE(review): test_data_hidden_df's 'sentiment' column is also used further
# down as the validation labels, so the evaluation rows appear to be part of
# this training frame as well — confirm this overlap is intended.
merge_df = merge_data([train_df,test_data_hidden_df])

In [None]:
# Preview the first 10 rows of the merged frame.
sample_data(merge_df,10)

In [None]:
# DataFrame.describe() summary of the merged frame.
merge_df.describe()

In [None]:
# DataFrame.info() summary of the merged frame.
merge_df.info()

### Observations:
- Total Records: 5000
- Data Column for building model: reviews.text
- Label Column for identifying class: sentiment
- Unique Reviews: 4385
- Number of Output Classes: 3

In [None]:
# Print the per-class counts of the 'sentiment' column of the merged frame
# and draw the same counts as a bar chart.
print(merge_df['sentiment'].value_counts())
merge_df['sentiment'].value_counts().plot(kind='bar', title="Count(sentiment) for Training Data")
plt.show()

In [None]:
%%time
# Clean the raw 'reviews.text' strings with preprocess() and store the
# result in a new 'review_cleaned' column of the merged frame.
merge_df['review_cleaned'] = preprocess(merge_df['reviews.text'].tolist())

#### Creating a Training dataset

In [None]:
# Vectorize the cleaned training text with a TfidfVectorizer limited to
# 2000 features; the labels are taken from the 'sentiment' column.
X_train, y_train = create_dataset(merge_df, 'review_cleaned', 'sentiment', TfidfVectorizer(max_features=2000))

In [None]:
# Sanity check: shape of the training matrix and number of training labels.
print(X_train.shape)
print(len(y_train))

#### Creating a Test Dataset

In [None]:
# Work on a copy so test_df itself is not modified by the insert below.
validation_df = test_df.copy()
# Add the labels from test_data_hidden_df as a 'sentiment' column at
# column position 7.
# NOTE(review): assumes the rows of test_df and test_data_hidden_df line up
# by index and that test_df has at least 7 columns — confirm.
validation_df.insert(7, 'sentiment', test_data_hidden_df['sentiment'])

In [None]:
# DataFrame.info() summary of the validation frame after adding 'sentiment'.
validation_df.info()

In [None]:
%%time
# Clean the validation review text with preprocess(). Note the column name
# here is 'reviews_cleaned', whereas the merged training frame uses
# 'review_cleaned'.
validation_df['reviews_cleaned'] = preprocess(validation_df['reviews.text'].tolist())

In [None]:
# Print the per-class counts of 'sentiment' in the validation frame and
# draw the same counts as a bar chart.
print(validation_df['sentiment'].value_counts())
validation_df['sentiment'].value_counts().plot(kind='bar', title="Count(sentiment) for Test Data")
plt.show()

In [None]:
# Vectorize the cleaned validation text; labels come from 'sentiment'.
# NOTE(review): this passes a brand-new TfidfVectorizer, and create_dataset
# calls fit_transform on it, so the vocabulary is fitted on the test text
# separately from the one used for X_train. The columns of X_test therefore
# need not correspond to the columns of X_train — confirm, and consider
# reusing the vectorizer fitted on the training text with transform().
X_test, y_test = create_dataset(validation_df, 'reviews_cleaned', 'sentiment', TfidfVectorizer(max_features=2000))

In [None]:
# Sanity check: shape of the test matrix and number of test labels.
print(X_test.shape)
print(len(y_test))

In [None]:
# Baseline: fit a multinomial Naive Bayes classifier on the training matrix.
multi_nb = create_model(X_train, y_train, MultinomialNB())

In [None]:
# Classification report of the baseline on the test set, printed with
# 4 decimal digits.
print(classification_report(y_test, multi_nb.predict(X_test), digits=4))

#### Over Sampled Training Data

In [None]:
# Randomly over-sample the training set and report the resulting shape.
X_train_os, y_train_os = resample_dataset(X_train, y_train, RandomOverSampler())
print(X_train_os.shape)

# Print and plot the class counts after over-sampling.
print(pd.DataFrame(y_train_os)[0].value_counts())
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; use the
# new name when it is available and the legacy name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.DataFrame(y_train_os)[0].value_counts().plot(kind='bar', title='Over Sampled Data')
plt.show()

#### Over Sampled Test Data

In [None]:
# Randomly over-sample the test set and report the resulting shape.
X_test_os, y_test_os = resample_dataset(X_test, y_test, RandomOverSampler())
print(X_test_os.shape)

# Print and plot the class counts after over-sampling.
print(pd.DataFrame(y_test_os)[0].value_counts())
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; use the
# new name when it is available and the legacy name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.DataFrame(y_test_os)[0].value_counts().plot(kind='bar', title='Over Sampled Data')
plt.show()

#### Under Sampled Training Data

In [None]:
# Randomly under-sample the training set and report the resulting shape.
# (The labels variable is y_train; the previous spelling 'ytrain' was an
# undefined name and raised NameError.)
X_train_us, y_train_us = resample_dataset(X_train, y_train, RandomUnderSampler())
print(X_train_us.shape)

# Print and plot the class counts after under-sampling.
print(pd.DataFrame(y_train_us)[0].value_counts())
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; use the
# new name when it is available and the legacy name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.DataFrame(y_train_us)[0].value_counts().plot(kind='bar', title='Under Sampled Data')
plt.show()

#### Under Sampled Test Data

In [None]:
# Randomly under-sample the test set and report the resulting shape.
X_test_us, y_test_us = resample_dataset(X_test, y_test, RandomUnderSampler())
print(X_test_us.shape)

# Print and plot the class counts after under-sampling.
print(pd.DataFrame(y_test_us)[0].value_counts())
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; use the
# new name when it is available and the legacy name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.DataFrame(y_test_us)[0].value_counts().plot(kind='bar', title='Under Sampled Data')
plt.show()

In [None]:
# Classifiers to compare, keyed by the name used in the score labels.
models = {
            'multinomial_nb': MultinomialNB(),
            'random_forest': RandomForestClassifier(),
            'XGBClassifer': XGBClassifier()
         }

# Train/test variants to evaluate. Each value is ordered as
# [X_train, y_train, X_test, y_test]; create_models and assess_svm_kernels
# read these entries by position (indexes 0-3).
samples = {
            'unsampled': [X_train, y_train, X_test, y_test],
            'over_sampled': [X_train_os, y_train_os, X_test_os, y_test_os],
            'under_sampled': [X_train_us, y_train_us, X_test_us, y_test_us]
        }

In [None]:
def create_models(models, samples):
    """Fit every model on every sample set and collect the scores.

    ``models`` maps a model name to an estimator; ``samples`` maps a label
    to a sequence read by position as (train X, train y, test X, test y).
    Returns a dict keyed ``"<model_name>-<label>"`` holding the value
    returned by ``assess_model_score`` for that combination.
    """
    scores = {}
    for model_name, model in models.items():
        for label, sample in samples.items():
            fitted = create_model(sample[0], sample[1], model)
            result = assess_model_score(fitted, sample[2], sample[3])
            scores['-'.join((model_name, label))] = result
    return scores

In [None]:
%%time
# Train and score every model/sample combination; keep the results as a
# pandas Series indexed by "<model_name>-<label>".
scores = pd.Series(create_models(models, samples))

In [None]:
# Print the scores and draw them as a horizontal bar chart.
print(scores)
scores.plot(kind='barh', title='F1 Scores', figsize=(15,6))
plt.show()

In [None]:
# SVC parameter sets, one per kernel. assess_svm_kernels applies each entry
# with SVC().set_params(**params); the outer key is used in the score label.
# All four use C=1 and the 'ovo' decision function shape.
svm_kernels = {
    'linear': {
        'kernel':'linear', 
        'C':1, 
        'decision_function_shape':'ovo'
    },
    'rbf': {
        'kernel':'rbf', 
        'C':1,
        'gamma':1,
        'decision_function_shape':'ovo'
    },
    'poly': {
        'kernel':'poly', 
        'C':1,
        'degree':3,
        'decision_function_shape':'ovo'
    },
    'sigmoid': {
        'kernel':'sigmoid', 
        'C':1, 
        'decision_function_shape':'ovo'
    }
}
def assess_svm_kernels(samples):
    """Fit one SVC per kernel configuration on every sample set and score it.

    ``samples`` maps a label to a sequence read by position as
    (train X, train y, test X, test y). The kernel configurations come from
    the module-level ``svm_kernels`` dict. Returns a dict keyed
    ``"<kernel_name>-<label>"`` holding the value returned by
    ``assess_model_score``.
    """
    svm_scores = {}
    for label, sample in samples.items():
        for model_name, params in svm_kernels.items():
            classifier = SVC().set_params(**params)
            model = classifier.fit(sample[0], sample[1])
            result = assess_model_score(model, sample[2], sample[3])
            svm_scores['-'.join((model_name, label))] = result
    return svm_scores

In [None]:
%%time
# Score every SVM kernel on every sample set; keep the results as a pandas
# Series indexed by "<kernel_name>-<label>".
svm_scores = pd.Series(assess_svm_kernels(samples))

In [None]:
# Print the SVM scores and draw them as a horizontal bar chart.
print(svm_scores)
svm_scores.plot(kind='barh', title='F1 Scores for SVM', figsize=(15,6))
plt.show()