In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/reza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import string
import time
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from textblob import TextBlob

from joblib import dump , load

In [3]:
df = pd.read_csv('../data/amazon_reviews_small.csv', index_col=0)

In [4]:
df['review'] = df['review_headline'] + '. ' + df['review_body']

In [5]:
df['review_sentiments'] = 0

In [6]:
df.loc[df['star_rating'].isin([4, 5]), 'review_sentiments'] = 1

In [7]:
df = df.drop(columns=['review_headline', 'review_body', 'star_rating'])

In [8]:
df

Unnamed: 0,review,review_sentiments
0,Good book. This is a very good book. I recomme...,1
1,the Marenon Chronically Series. Loved all thre...,1
2,GOOD READ. Made me think about the fine line b...,1
3,excellant family devotion. I had been looking ...,1
4,"Great read. So entertaining, not a dull moment...",1
...,...,...
9995,Awesome. I haven't read a book like this in a ...,1
9996,Absolutely GREAT. I started reading it because...,1
9997,Sicko. These are two writers with some serious...,0
9998,Just too small. It was just too small for the ...,0


In [9]:
def stratified_sample_df(df, col, n_samples):
    n = min(n_samples, df[col].value_counts().min())
    df_ = df.groupby(col).apply(lambda x: x.sample(n))
    df_.index = df_.index.droplevel(0)
    return df_

In [10]:
df_small = stratified_sample_df(df, 'review_sentiments', 1000)

In [11]:
df_small['review_sentiments'].value_counts()

0    1000
1    1000
Name: review_sentiments, dtype: int64

In [12]:
lst_stopwords = stopwords.words('english')+['else',"ya"]

In [13]:
def text_cleaning(text, stem_flag=False, lem_flag=True, lst_stopwords=None):
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = text.replace('\\n',' ')
    text = text.replace('\n',' ')
    text = re.sub(r'[^\w\s]',' ',str(text).lower().strip())
    text = re.sub('[^a-z\s]', ' ', str(text).lower().strip())
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", str(text).lower().strip())
    text = re.sub(' 0 ',' ',str(text).lower().strip())
    text = re.sub(' 00 ',' ',str(text).lower().strip())
    text = re.sub(' 000 ',' ',str(text).lower().strip())
    
    lst_text = text.split()
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
    if stem_flag == True:
        ps = PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
    if lem_flag == True:
        lem = WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    lst_text = " ".join(lst_text)
    return lst_text

In [14]:
df_small['review_clean'] = df_small['review'].apply(lambda x: text_cleaning(x, 
                                                                            stem_flag=False, 
                                                                            lem_flag=True, 
                                                                            lst_stopwords=lst_stopwords))

In [15]:
X = df_small['review_clean']
y = df_small['review_sentiments']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df_small['review'], 
                                                    df_small['review_sentiments'], 
                                                    stratify=df_small['review_sentiments'], 
                                                    test_size=0.25, 
                                                    random_state=42)

In [17]:
vect = TfidfVectorizer(max_features=10000,ngram_range=(1,2))

In [18]:
XX_train = vect.fit_transform(X_train)

In [19]:
XX_test = vect.transform(X_test)

In [20]:
vocab = vect.vocabulary_

In [21]:
len(vocab.keys())

10000

In [None]:
xgb = XGBClassifier()
model_predictions(xgb,XX_train.toarray(),y_train, XX_test.toarray(), y_test)

In [22]:
# read in data
dtrain = xgb.DMatrix(XX_train, label=y_train)
dtest = xgb.DMatrix(XX_test, label=y_test)

In [23]:
# specify parameters via map
param = {'objective':'binary:hinge', 'eval_metric':'logloss', 'colsample_bytree':0.5, 
         'max_depth':5, 'nthread':16, 'subsample':0.5}

In [24]:
num_round = 100
xgb_small = xgb.train(param, dtrain, num_boost_round=num_round, verbose_eval=True)

In [25]:
# make prediction
preds = xgb_small.predict(dtest)

In [26]:
print("Classification Report \n", classification_report(y_test, preds))

Classification Report 
               precision    recall  f1-score   support

           0       0.68      0.74      0.71       250
           1       0.71      0.65      0.68       250

    accuracy                           0.69       500
   macro avg       0.70      0.69      0.69       500
weighted avg       0.70      0.69      0.69       500



In [27]:
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:  \n", cm)

Confusion Matrix:  
 [[185  65]
 [ 88 162]]
