In [1]:
# !pip install xgboost

In [2]:
# !pip install imblearn

In [3]:
# !pip install textblob

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import string
import time
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# from imblearn.over_sampling import SMOTE , SMOTEN

from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from textblob import TextBlob

from joblib import dump , load

In [6]:
df = pd.read_csv('../data/amazon_large_stratified.csv', index_col=0)

In [7]:
df

Unnamed: 0,review,review_sentiments
3837169,Disappointing treatment of a great album. This...,0
2882480,Good read. I have read all the books in this s...,1
4587996,"NOPE.. Just my opinion obviously, but every ti...",0
196423,"Passing gas was the highlight, and I HATED th...",0
94421,not believable **SPOILERS**. I really like his...,0
...,...,...
4996769,"Yuck!. Cardboard characters, stilted dialogue,...",0
1159844,Amazing book....... This was a rough story to ...,1
3565922,Great read. What a great story and to learn is...,1
5053743,tiresome tale. the purpose of the book seems t...,0


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
lst_stopwords = stopwords.words('english')+['else',"ya"]

In [10]:
def text_cleaning(text, stem_flag=False, lem_flag=True, lst_stopwords=None):
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = text.replace('\\n',' ')
    text = text.replace('\n',' ')
    text = re.sub(r'[^\w\s]',' ',str(text).lower().strip())
    text = re.sub('[^a-z\s]', ' ', str(text).lower().strip())
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", str(text).lower().strip())
    text = re.sub(' 0 ',' ',str(text).lower().strip())
    text = re.sub(' 00 ',' ',str(text).lower().strip())
    text = re.sub(' 000 ',' ',str(text).lower().strip())
    
    lst_text = text.split()
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
    if stem_flag == True:
        ps = PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
    if lem_flag == True:
        lem = WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    lst_text = " ".join(lst_text)
    return lst_text

In [11]:
df['review_clean'] = df['review'].apply(lambda x: text_cleaning(x, 
                                                                stem_flag=False, 
                                                                lem_flag=True, 
                                                                lst_stopwords=lst_stopwords))

In [12]:
X = df['review_clean']
y = df['review_sentiments']

In [13]:
X_train, X_temp, y_train, y_temp = train_test_split(X, 
                                                    y, 
                                                    stratify=y, 
                                                    test_size=0.5, 
                                                    random_state=42)

In [14]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, 
                                                y_temp, 
                                                stratify=y_temp, 
                                                test_size=250000, 
                                                random_state=42)

In [15]:
X_train = pd.concat([X_train, X_val], axis=0)

In [16]:
y_train = pd.concat([y_train, y_val], axis=0)

In [17]:
vect = TfidfVectorizer(max_features=10000,ngram_range=(1,2))

In [18]:
XX_train = vect.fit_transform(X_train)

In [19]:
XX_test = vect.transform(X_test)

In [20]:
vocab = vect.vocabulary_

In [21]:
len(vocab.keys())

10000

In [22]:
# read in data
dtrain = xgb.DMatrix(XX_train, label=y_train)
dtest = xgb.DMatrix(XX_test, label=y_test)

In [23]:
# specify parameters via map
param = {'objective':'binary:hinge', 'eval_metric':'logloss', 'colsample_bytree':0.5, 'max_depth':5, 'nthread':16, 'subsample':0.5}

In [24]:
num_round = 100
bst = xgb.train(param, dtrain, num_boost_round=num_round, verbose_eval=True)

In [25]:
# make prediction
preds = bst.predict(dtest)

In [26]:
print("Classification Report \n", classification_report(y_test, preds))

Classification Report 
               precision    recall  f1-score   support

           0       0.80      0.86      0.83    125000
           1       0.85      0.78      0.81    125000

    accuracy                           0.82    250000
   macro avg       0.82      0.82      0.82    250000
weighted avg       0.82      0.82      0.82    250000



In [27]:
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:  \n", cm)

Confusion Matrix:  
 [[107381  17619]
 [ 27392  97608]]


In [28]:
# xgb.cv(param, dtrain, num_boost_round=100, nfold=5)