# Mount google drive

In [None]:
from google.colab import drive

# Mount at an absolute path so the mount point always matches the
# "/content/drive/..." paths used to read and write the datasets, whatever
# the kernel's current working directory happens to be.
drive.mount('/content/drive')

# Import required libraries

In [None]:
! pip install nltk
! pip install wordcloud

In [None]:
# General packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# NLP packages
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from wordcloud import WordCloud

# Modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from pylab import rcParams
import warnings
warnings.filterwarnings("ignore")
rcParams['figure.figsize'] = 14, 6
plt.style.use('ggplot')
import re



# Read data from drive

In [None]:
# Load the raw product-review CSV from the Google Drive folder into a DataFrame.
amazon_reviews = pd.read_csv("/content/drive/MyDrive/Colab-Data Science/DataSets/amazonProductReviews.csv")

In [None]:
# Preview the first rows of the freshly loaded reviews.
amazon_reviews.head()

# Understanding the data

In [None]:
## Number of words per review.
# Splitting on any run of whitespace (instead of a single " ") avoids counting
# the empty strings produced by repeated spaces, and treating a missing comment
# as an empty string avoids an AttributeError on NaN values.
words_per_review = (
    amazon_reviews['review_comments']
    .fillna('')
    .str.split()
    .str.len()
)
words_per_review.hist(bins=50)
plt.xlabel('Review Length (words)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of the review-length distribution.
average_words = words_per_review.mean()
length_skewness = words_per_review.skew()
print('Average words:', average_words)
print('Skewness:', length_skewness)

In [None]:
# Extract rating points from review_ratings:
# the pattern skips any leading non-digits and captures the first run of digits,
# which is then stored as an integer column named "points".
r = r'^\D*(\d+)'
rating_text = amazon_reviews["review_ratings"]
amazon_reviews["points"] = rating_text.str.extract(r)
amazon_reviews = amazon_reviews.astype({"points": int})
amazon_reviews.head()

In [None]:
# Derive the Tag (sentiment) feature from the rating points:
#   1-2 points -> -1 (Negative), 3 points -> 0 (Neutral), 4-5 points -> 1 (Positive)
points = amazon_reviews['points']
is_negative = (points == 1) | (points == 2)
is_neutral = (points == 3)
is_positive = (points == 4) | (points == 5)

for rows, tag_value in ((is_negative, -1), (is_neutral, 0), (is_positive, 1)):
    amazon_reviews.loc[rows, 'Tag'] = tag_value


In [None]:
# Number of reviews per Tag value (-1 negative, 0 neutral, 1 positive)
amazon_reviews.value_counts("Tag")

In [None]:
# Share of reviews per Tag value, in percent of all rows in the frame
tag_counts = amazon_reviews['Tag'].value_counts()
total_reviews = len(amazon_reviews)
percent_val = 100 * tag_counts / total_reviews
percent_val

In [None]:
# Bar chart of the percentage of reviews per Tag value.
percent_val.plot(kind='bar')
plt.show()

In [None]:
# Text Visualization
# Join the reviews with a space: an empty separator would glue the last word of
# one review to the first word of the next and create words that never occur.
# Missing comments are dropped because str.join() only accepts strings.
word_cloud_text = ' '.join(amazon_reviews['review_comments'].dropna().astype(str))

wordcloud = WordCloud(max_font_size=100, # Maximum font size for the largest word
                      max_words=100, # The maximum number of words
                      background_color="white", # Background color for the word cloud image
                      scale = 10, # Scaling between computation and drawing
                      width=800, # Width of the canvas
                      height=400 # Height of the canvas
                     ).generate(word_cloud_text)

plt.figure()
plt.imshow(wordcloud,
           interpolation="bilinear") # to make the displayed image appear more smoothly
plt.axis("off")
plt.show()

# Preprocessing - converts to lower case and replaces punctuation and other special characters with spaces (letters and digits are kept)

In [None]:
# Show the first three rows, including the derived "points" and "Tag" columns.
amazon_reviews.head(3)

In [None]:
# Keep only the columns needed for modelling and save the result to Drive.
# The drop is not done in place and ignores already-missing columns, so the
# cell can be re-run without raising a KeyError.
amazon_reviews = amazon_reviews.drop(
    columns=['id', 'review_titles', 'review_ratings', 'points', 'reviewer'],
    errors='ignore',
)
# Absolute path (same root as the input CSV) instead of one relative to the
# current working directory.
amazon_reviews.to_csv('/content/drive/MyDrive/Colab-Data Science/DataSets/review_silverAnalysis.csv', index = True)

amazon_reviews.head(3)

In [None]:
# Count how many reviews carry each Tag value.
amazon_reviews.Tag.value_counts()

In [None]:
# Loading golden analysis data
#amazon_reviews = pd.read_csv("/content/drive/MyDrive/Colab-Data Science/DataSets/review_gold_Analysis.csv")
#amazon_reviews.Tag.value_counts()


In [None]:
# Store a lower-cased copy of the review text in a new column; 'review_comments' itself is not modified.
amazon_reviews['reviews_text_new'] = amazon_reviews['review_comments'].str.lower()


In [None]:
from nltk import word_tokenize
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch both to work on old and new versions.
nltk.download('punkt_tab')

In [None]:
# Tokenization: compare the vocabulary size before and after lower-casing.
# Both figures are counts of *unique* tokens (len of a set), so the labels say so.
token_lists = [word_tokenize(each) for each in amazon_reviews['review_comments']]
tokens = [item for sublist in token_lists for item in sublist]
print(" Unique tokens before lowercase: ",len(set(tokens)))

# For reviews converted to lower case
token_lists_lower = [word_tokenize(each) for each in amazon_reviews['reviews_text_new']]
tokens_lower = [item for sublist in token_lists_lower for item in sublist]
print(" Unique tokens after lowercase: ",len(set(tokens_lower)))

In [None]:
# Inspect the special characters present in the lower-cased reviews
def _non_alnum_chars(review):
    """Return the characters of `review` that are neither alphanumeric nor a space."""
    return [ch for ch in review if not (ch.isalnum() or ch == ' ')]

spl_chars = amazon_reviews['reviews_text_new'].apply(_non_alnum_chars)

## Flatten the per-review lists into a single list of special characters
special_char_list = []
for review_chars in spl_chars:
    special_char_list.extend(review_chars)

# Number of distinct special characters
len(set(special_char_list))

In [None]:
# Keep a copy of the lower-cased text before cleaning it.
review_backup = amazon_reviews['reviews_text_new'].copy()
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every special character in place.
amazon_reviews['reviews_text_new'] = amazon_reviews['reviews_text_new'].str.replace(
    r'[^A-Za-z0-9 ]+', ' ', regex=True
)

In [None]:
# Re-tokenize the cleaned text and report the size of the resulting vocabulary.
token_lists = list(map(word_tokenize, amazon_reviews['reviews_text_new']))
tokens = []
for review_tokens in token_lists:
    tokens.extend(review_tokens)
print("Number of unique tokens now: ",len(set(tokens)))

# Stopwords and high/low frequency words

In [None]:
# Fetch NLTK's stop-word corpus and load the English list.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Starts empty; this is the list later handed to CountVectorizer as `stop_words`.
noise_words = []
eng_stop_words = stopwords.words('english')
# Display the English stop-word list.
eng_stop_words

In [None]:
# Demonstrate stop-word filtering on the first cleaned review.
stop_words = set(eng_stop_words)
sentence = amazon_reviews['reviews_text_new'][0]
words = nltk.word_tokenize(sentence)

# Split the tokens into the two groups, preserving their order in the sentence.
stopword = [token for token in words if token in stop_words]
without_stop_words = [token for token in words if token not in stop_words]

print('-- Original Sentence --\n', sentence)
print('\n-- Stopwords in the sentence --\n', stopword)
print('\n-- Non-stopwords in the sentence --\n', without_stop_words)

In [None]:
# Remove Stopwords
def stopwords_removal(stop_words, sentence):
    """Tokenize `sentence` with NLTK and return the tokens that are not in `stop_words`."""
    kept_tokens = []
    for token in nltk.word_tokenize(sentence):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# One list of non-stop-word tokens per review.
amazon_reviews['reviews_text_nonstop'] = amazon_reviews['reviews_text_new'].apply(
    lambda row: stopwords_removal(stop_words, row)
)
amazon_reviews.head(5)

# Stemming & lemmatization

In [None]:
# Set up the stemmers and the lemmatizer used in the demonstration below.
from nltk.stem import PorterStemmer, LancasterStemmer # Common stemmers
from nltk.stem import WordNetLemmatizer # Common lemmatizer
# The WordNet corpus is downloaded for the lemmatizer and the `wordnet` POS constants.
nltk.download('wordnet')
from nltk.corpus import wordnet

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Compare stemming and lemmatization on three inflections of the same word.
print("Lancaster Stemmer")
for sample_word in ("trouble", "troubling", "troubled"):
    print(lancaster.stem(sample_word))

# The lemmatizer is given each word together with its part of speech.
print("WordNet Lemmatizer")
lemma_inputs = (
    ("trouble", wordnet.NOUN),
    ("troubling", wordnet.VERB),
    ("troubled", wordnet.VERB),
)
for sample_word, part_of_speech in lemma_inputs:
    print(lemmatizer.lemmatize(sample_word, part_of_speech))

# Model training with BOW

In [None]:
# The following code creates a word-document matrix.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X = vec.fit_transform(amazon_reviews['reviews_text_new'])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df = pd.DataFrame(X.toarray(), columns = feature_names)
df.head()

In [None]:
# Bag-of-words vectorizer: NLTK tokenization, unigrams only, and the
# `noise_words` list as the stop-word list.
bow_counts = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=noise_words,
    tokenizer=word_tokenize,
)

# Learn the vocabulary and build the count matrix for every cleaned review.
bow_data = bow_counts.fit_transform(amazon_reviews['reviews_text_new'])

In [None]:
# Display the repr of the bag-of-words matrix returned by fit_transform.
bow_data

In [None]:
# Hold out 20% of the bag-of-words rows for testing; the fixed random_state
# makes the split reproducible.
bow_split = train_test_split(
    bow_data,               # Features
    amazon_reviews['Tag'],  # Target variable
    test_size=0.2,
    random_state=0,
)
X_train_bow, X_test_bow, y_train_bow, y_test_bow = bow_split

In [None]:
# Fraction of the test rows that falls in each Tag class.
test_class_counts = y_test_bow.value_counts()
test_class_counts / y_test_bow.shape[0]

In [None]:
# Applying logistic regression to the bag-of-words features
lr_model_all = LogisticRegression() # Logistic regression
lr_model_all.fit(X_train_bow, y_train_bow) # Fitting a logistic regression model

from sklearn.metrics import confusion_matrix
predictions = lr_model_all.predict(X_test_bow)
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones. Passing them the other way round transposes the matrix.
confusion_matrix(y_test_bow, predictions)



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# All scorers take (y_true, y_pred). Accuracy is symmetric, but precision and
# recall are not: with the arguments swapped each one reports the other metric
# and the 'weighted' average uses the predicted class counts as weights.
print("Accuracy : ", accuracy_score(y_test_bow, predictions))
print("Precision : ", precision_score(y_test_bow, predictions, average = 'weighted'))
print("Recall : ", recall_score(y_test_bow, predictions, average = 'weighted'))

# Model training with TF-IDF

In [None]:
# Peek at the first two rows of the frame used for the TF-IDF model.
amazon_reviews.head(2)

In [None]:
from sklearn.model_selection import train_test_split

# Independent variable: cleaned review text; dependent variable: sentiment Tag.
iv = amazon_reviews.reviews_text_new
dv = amazon_reviews.Tag

# 80/20 split with a fixed seed so the partition is reproducible.
IV_train, IV_test, DV_train, DV_test = train_test_split(
    iv,
    dv,
    test_size=0.2,
    random_state=225,
)

# Report the size of every partition.
for label, part in (
    ('IV_train :', IV_train),
    ('IV_test  :', IV_test),
    ('DV_train :', DV_train),
    ('DV_test  :', DV_test),
):
    print(label, len(part))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with smoothed IDF weights and L2-normalised rows.
tvec = TfidfVectorizer(use_idf=True,norm='l2',smooth_idf=True)

from sklearn.linear_model import LogisticRegressionCV
# 10-fold cross-validated logistic regression scored on accuracy.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5, and with the newton-cg solver the default already fits a
# multinomial model when the target has more than two classes.
model = LogisticRegressionCV(cv=10,scoring='accuracy',random_state=0,n_jobs=-1,verbose=3,max_iter=300,solver='newton-cg')

# Alternative classifiers that can be swapped in for `model`:

# from sklearn import svm
# model = svm.SVC(decision_function_shape='ovo')

# from sklearn.ensemble import AdaBoostClassifier
# model = AdaBoostClassifier()

# from sklearn.naive_bayes import MultinomialNB
# model = MultinomialNB()

# from xgboost import  XGBClassifier
# model = XGBClassifier(eta=.01,alpha=50)

# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=20, n_jobs=-1)

from sklearn.pipeline import Pipeline

In [None]:
# `model` is rebound to the pipeline below, so a second run of this cell would
# otherwise wrap the previous Pipeline inside a new one. Unwrap it first so the
# cell stays re-runnable.
base_classifier = model.named_steps['classifier'] if isinstance(model, Pipeline) else model
model = Pipeline([('vectorizer',tvec),('classifier',base_classifier)])
model.fit(IV_train, DV_train)
from sklearn.metrics import confusion_matrix

predictions = model.predict(IV_test)
# (y_true, y_pred) order: rows are the true classes, columns the predicted ones.
confusion_matrix(DV_test, predictions)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# All scorers take (y_true, y_pred); with the arguments swapped, precision and
# recall would each report the other metric.
print("Accuracy : ", accuracy_score(DV_test, predictions))
print("Precision : ", precision_score(DV_test, predictions, average = 'weighted'))
print("Recall : ", recall_score(DV_test, predictions, average = 'weighted'))

In [None]:
#import pickle
#pickle.dump(model,open('/content/drive/MyDrive/Colab-Data Science/DataSets/model.pkl','wb'))

# Testing a custom review

In [None]:


example = ["The battery life is so bad"]
result = model.predict(example)

# Score the example once, then look each probability up by its class label
# rather than by a hard-coded position: the column order of predict_proba
# follows model.classes_, which only equals [-1, 0, 1] when all three classes
# were present in the training data.
class_probabilities = model.predict_proba(example)[0]
probability_of = {
    int(label): probability
    for label, probability in zip(model.classes_, class_probabilities)
}

print(class_probabilities)
print(" Negative(-1)={}".format(probability_of.get(-1, 0.0)))
print(" Neutral(0)={}".format(probability_of.get(0, 0.0)))
print(" Positive(1)={}".format(probability_of.get(1, 0.0)))

# predict() returns an array with one entry per input text; read that entry
# explicitly instead of relying on the truthiness of a one-element array.
predicted_tag = int(result[0])
if predicted_tag == 0:
  print("Neutral")
elif predicted_tag == 1:
  print("Positive")
else:
  print("Negative")