# Getting Started with Sentiment Analysis

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import metrics


import re, string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec, KeyedVectors


import pickle
from tqdm import tqdm
import math as math
# Display configuration shared by every cell below.
# Options are addressed by their full key: the bare pattern 'precision' is ambiguous on
# newer pandas releases (it also matches the Styler precision option) and raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 5)
# Floats are rendered through this formatter: width 20, thousands separator, 2 decimals.
pd.options.display.float_format = '{:20,.2f}'.format
# Print numpy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
print(nltk.__version__)
from gensim import __version__
print(__version__)

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Open the SQLite dump and read up to 50,000 reviews whose Score is not 3.
DB_PATH = '/kaggle/input/amazon-fine-food-reviews/database.sqlite'
REVIEWS_SQL = """ SELECT * FROM Reviews  WHERE Score != 3 LIMIT 50000"""

con = sqlite3.connect(DB_PATH)  # left open: later cells run further queries on it
filtered_data = pd.read_sql_query(REVIEWS_SQL, con)
filtered_data.head()

In [None]:
filtered_data.columns

In [None]:
# Report how many distinct values each identifying column holds, then the Score distribution.
unique_count_line = 'No of unique {} values : {}'
for column in ('ProductId', 'UserId', 'ProfileName', 'Score'):
    print(unique_count_line.format(column, filtered_data[column].nunique()))
# 'Score' is the last column reported, so its value counts print directly after its line.
print(filtered_data['Score'].value_counts())

### Looks like there is a Class Imbalance between #+ve vs #-ve reviews. Appropriate Oversampling or Undersampling strategy must be tried

In [None]:
#filtered_data['Score'].apply(lambda x:0 if x<3 else 1).head()
def _score_to_label(score):
    """Return 'negative' for a score below 3 and 'positive' for any other score."""
    if score < 3:
        return 'negative'
    return 'positive'

# Replace the numeric Score column with its text label, then show each label's share of rows.
filtered_data['Score'] = filtered_data['Score'].map(_score_to_label)
filtered_data['Score'].value_counts() / len(filtered_data)

In [None]:
# Groups of rows in Reviews that share the same user, profile name, timestamp and text,
# together with the size of each group (only groups with more than one row are returned).
query = """
SELECT UserId, ProductId, ProfileName, Time, Score, Text, COUNT(*) count_duplicate
FROM Reviews
GROUP BY UserId, ProfileName, Time, Text
HAVING COUNT(*)>1
"""
df_duplicates = (
    pd.read_sql_query(query, con)
    .sort_values(by='count_duplicate', ascending=False)  # largest groups first
)
print(df_duplicates.shape)
df_duplicates.head()

### Removing Duplicate reviews (Review gets Duplicated for each Product Attribute eg: each Color/Size of Shirt or each Flavour of Ice Cream etc)

In [None]:
# All reviews with Score != 3 written by one example user, ordered by product.
# The user id is passed as a bound parameter. A double-quoted value is parsed as an
# identifier by standard SQL, and SQLite only falls back to treating it as a string
# literal for legacy reasons, so a quoted literal inside the query text is fragile.
query = """
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId = ?
ORDER BY ProductID
"""
pd.read_sql_query(query, con, params=("AR5J8UI46CURR",))

In [None]:
# Order the rows by product, then keep only the first row of every
# (user, profile name, timestamp, text) group.
dedup_keys = ["UserId", "ProfileName", "Time", "Text"]
df_sorted = filtered_data.sort_values(by='ProductId')
df_deduplicated = df_sorted.drop_duplicates(subset=dedup_keys, keep='first')
print(df_deduplicated.shape)
# Show what is left for the example user inspected in the previous query.
df_deduplicated[df_deduplicated.UserId == "AR5J8UI46CURR"]

### What % of rows were duplicates 

In [None]:
# Share of rows removed by de-duplication, as a percentage rounded to 2 decimals.
rows_before = filtered_data['Id'].size
rows_after = df_deduplicated['Id'].size
np.round((1.0 - rows_after / rows_before) * 100, 2)

### Checking for anomalous rows where the helpfulness numerator (x) is greater than the helpfulness denominator (x+y), which is impossible

In [None]:
df_deduplicated[df_deduplicated.HelpfulnessNumerator > df_deduplicated.HelpfulnessDenominator].head()

In [None]:
# Keep only the rows whose helpfulness numerator does not exceed the denominator.
helpfulness_is_valid = df_deduplicated.HelpfulnessNumerator <= df_deduplicated.HelpfulnessDenominator
df_deduplicated = df_deduplicated[helpfulness_is_valid]

### Text data preprocessing

In [None]:
# Print a few raw reviews, picked by row position, to see what needs cleaning.
# The dict maps each sampled position to its review text, so it can be looked up by
# position afterwards (e.g. dict_randomreview[1500]).
sample_positions = (0, 1000, 1500, 4900)
review_texts = df_deduplicated['Text'].values
dict_randomreview = {position: review_texts[position] for position in sample_positions}
for review in dict_randomreview.values():
    print(review)
    print('='*50)

### Removing urls from text python
> https://stackoverflow.com/a/40823105/4084039

In [None]:
#dict_randomreview = {0:sent_0, 1000:sent_1000, 1500:sent_1500, 4900:sent_4900}
# Remove URLs ("http" followed by any run of non-whitespace) from each sampled review
# and store the result back under the same position.
for position in dict_randomreview:
    raw_review = df_deduplicated['Text'].values[position]
    without_urls = re.sub(r"http\S+", "", raw_review)
    dict_randomreview[position] = without_urls
    print(without_urls)
    print('='*50)

### Removing all HTML tags from each element
> https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element

In [None]:
from bs4 import BeautifulSoup

#dict_randomreview = {0:sent_0, 1000:sent_1000, 1500:sent_1500, 4900:sent_4900}
# For each sampled review: drop URLs, parse what remains as HTML, and keep only its text.
for position in dict_randomreview:
    raw_review = df_deduplicated['Text'].values[position]
    markup = re.sub(r"http\S+", "", raw_review)
    plain_text = BeautifulSoup(markup, 'lxml').get_text()
    dict_randomreview[position] = plain_text
    print(plain_text)
    print('='*50)

### Expanding Contractions
> https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490

In [None]:
# Lookup table mapping English contractions to their expanded form.
# Where a contraction is ambiguous, the value lists the alternatives separated by " / ".
# Only the commented-out variant of decontracted() below iterates over this table; the
# active decontracted() uses its own regular-expression rules instead.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re    
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The irregular forms "won't" and "can't" are rewritten first, then generic
    suffix rules are applied in a fixed order. Both the ASCII apostrophe (') and
    the typographic apostrophe (’) are recognised, and a capitalised
    "Won't"/"Can't" keeps its capital ("Will not"/"Can not") instead of being
    mangled into "Wo not"/"Ca not" by the generic "n't" rule.

    Known limitation: every "'s" becomes " is", so possessives such as
    "dog's" are expanded too.

    Args:
        phrase: the text to process (str).

    Returns:
        The text with contractions expanded (str).
    """
    apostrophe = "['’]"  # character class matching either apostrophe style

    # specific: irregular stems, handled before the generic "n't" rule can split them
    phrase = re.sub("([Ww])on" + apostrophe + "t", r"\1ill not", phrase)
    phrase = re.sub("([Cc])an" + apostrophe + "t", r"\1an not", phrase)

    # general: applied in this order -- "n't" must run before the bare "'t" rule
    suffix_rules = (
        ("n" + apostrophe + "t", " not"),
        (apostrophe + "re", " are"),
        (apostrophe + "s", " is"),
        (apostrophe + "d", " would"),
        (apostrophe + "ll", " will"),
        (apostrophe + "t", " not"),
        (apostrophe + "ve", " have"),
        (apostrophe + "m", " am"),
    )
    for pattern, replacement in suffix_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# def decontracted(phrase):
#     for key,value in contractions.items():
#         phrase = re.sub(key,value,phrase)
#         return phrase

In [None]:
for k,v in dict_randomreview.items():
    print(k,v)

In [None]:
#dict_randomreview = {0:sent_0, 1000:sent_1000, 1500:sent_1500, 4900:sent_4900}
sentence_1500 = decontracted(dict_randomreview[1500])
print(sentence_1500)
print("="*100)

### Removing words that contain numbers in them: 
> https://stackoverflow.com/a/18082370/4084039

In [None]:
#dict_randomreview = {0:sent_0, 1000:sent_1000, 1500:sent_1500, 4900:sent_4900}
# Expand contractions in the first sampled review, then delete every whitespace-delimited
# token that contains a digit. The pattern is a raw string so that \S and \d reach the
# regex engine verbatim instead of being read as (invalid) Python string escapes.
sentence_0 = decontracted(dict_randomreview[0])
sentence_0 = re.sub(r"\S*\d\S*", "", sentence_0).strip()
print(sentence_0)

### Removing all Special characters 
> https://stackoverflow.com/a/5843547/4084039

In [None]:
sentence_1500 = re.sub('[^A-Za-z0-9]+', ' ', decontracted(dict_randomreview[1500]))
print(sentence_1500)

### Removing stop words such as 'the', 'you', ... (the custom list below does not contain 'no', 'nor' or 'not', so negations are kept)
>  https://gist.github.com/sebleier/554280

In [None]:
from nltk.corpus import stopwords
print(stopwords.words("english"))

In [None]:
# Words dropped from every review during cleaning. The words 'no', 'nor' and 'not' are
# absent from this set, so negations survive the filtering.
stopwords_list = {
    # NOTE(review): 'br' presumably targets leftovers of <br /> markup -- confirm
    'br',
    # pronouns and determiners
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions and prepositions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    # adverbs, quantifiers and modals
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'should', "should've", 'now',
    # contraction fragments and negated auxiliaries
    's', 't', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ma',
    'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

### Applying all the above cleaning to the entire dataframe

In [None]:
from tqdm import tqdm
def _clean_review(text):
    """Return one review cleaned for vectorisation.

    Steps, in order: strip URLs, strip HTML markup, expand contractions, delete
    tokens containing a digit, replace every run of non-letters with a space,
    lower-case the tokens and drop those found in ``stopwords_list``.
    """
    text = re.sub(r"http\S+", "", text)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = decontracted(text)
    # raw string: \S and \d are regex classes here, not Python string escapes
    text = re.sub(r"\S*\d\S*", "", text).strip()
    text = re.sub('[^A-Za-z]+', ' ', text)
    # stop-word list based on https://gist.github.com/sebleier/554280
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(token for token in lowered_tokens if token not in stopwords_list)


# tqdm renders a progress bar while the reviews are cleaned one by one
preprocessed_reviews = [_clean_review(review) for review in tqdm(df_deduplicated['Text'].values)]

print(preprocessed_reviews[1500])

## Feature Engineering on Text data 

In [None]:
[w for w in dir(sklearn.feature_extraction.text) if not w.startswith('_')]

### (BoW) Bag of Words - simply put is ----------> df.words.value_counts()
> https://en.wikipedia.org/wiki/Bag-of-words_model <br/>
> https://stackabuse.com/python-for-nlp-creating-bag-of-words-model-from-scratch/ <br/>
> http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.453.5924&rep=rep1&type=pdf <br/>
> https://www.youtube.com/watch?v=IRKDrrzh4dE <br/>
> https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html <br/>
> <br/>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned reviews (CountVectorizer defaults, i.e. unigrams).
count_vect = CountVectorizer().fit(preprocessed_reviews)
# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out() when the
# installed release has it, and fall back to the old accessor on releases that predate it.
bow_feature_names = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
print("some random words/features : ", [str(name) for name in bow_feature_names()[:10]])
print('='*50)

# Encode every review as a sparse row of word counts over that vocabulary.
word_count = count_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(word_count))
print("the shape of out text BOW vectorizer ",word_count.get_shape())
print("the number of unique words ", word_count.get_shape()[1])

> Bag of Words is Unigram based(only 1 word) and hence discards Sequential information of the data. 

### Bi-grams and n-Grams simply put is ----------> Convert 2(or n) sequential words into one word (vector representation)
> https://kavita-ganesan.com/what-are-n-grams/ <br/>
> https://en.wikipedia.org/wiki/Bigram <br/>
> https://en.wikipedia.org/wiki/N-gram <br/>
> https://web.stanford.edu/~jurafsky/slp3/3.pdf <br/>
> http://l2r.cs.uiuc.edu/~danr/Teaching/CS598-05/Papers/Church-ngrams.pdf <br/>
> https://lagunita.stanford.edu/c4x/Engineering/CS-224N/asset/slp4.pdf <br/>
> https://people.cs.umass.edu/~mccallum/papers/tng-icdm07.pdf <br/>
> https://catalog.ldc.upenn.edu/LDC2006T13 <br/>
> https://www.youtube.com/watch?v=E_mN90TYnlg <br/>
> https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer <br/>
> <br/>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer()

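#### Note the arguments ngram_range, min_df, max_df
> ngram_range --> (min_n, max_n) range of n-gram sizes to extract: (1,1) unigrams only; (2,2) bigrams only; (3,3) trigrams only; (1,3) unigrams + bigrams + trigrams <br/>
> min_df --> ignore terms that appear in fewer documents than this threshold (an integer is an absolute document count, a float is a proportion of documents) <br/>
> max_df --> ignore terms that appear in more documents than this threshold (same integer/float convention) <br/>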

In [None]:
count_vect = CountVectorizer(ngram_range=(1,2), min_df=10, max_df=5000)
bigram_wordcounts = count_vect.fit_transform(preprocessed_reviews)
print("the type of count vectorizer ",type(bigram_wordcounts))
print("the shape of out text BOW vectorizer ",bigram_wordcounts.get_shape())
print("the number of unique words including both unigrams and bigrams ", bigram_wordcounts.get_shape()[1])

### tf-idf (Term frequency - inverse document frequency)

> https://en.wikipedia.org/wiki/Tf%E2%80%93idf <br/>
> https://www.kdnuggets.com/2018/08/wtf-tf-idf.html <br/>
> https://www.researchgate.net/publication/220387577_A_probabilistic_justification_for_using_tfidf_term_weighting_in_information_retrieval <br/>
> https://ccc.inaoep.mx/~villasen/index_archivos/cursoTL/articulos/Aizawa-tf-idfMeasures.pdf <br/>
> https://www.scss.tcd.ie/khurshid.ahmad/Research/Sentiments/tfidf_relevance.pdf <br/>
> https://www.semanticscholar.org/topic/Tf%E2%80%93idf/72426 <br/>
> http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.438.2284&rep=rep1&type=pdf <br/>
> http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.1424&rep=rep1&type=pdf <br/>
> http://ecsjournal.org/Archive/Volume42/Issue3/5.pdf <br/>
> https://www.youtube.com/watch?v=6HuKFh0BatQ <br/>
> https://www.youtube.com/watch?v=C25txE_dq90 <br/>
>  <br/>

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
for i in [TfidfTransformer, TfidfVectorizer]:
    print([w for w in dir(i) if not w.startswith('_')])
    print('='*50)

In [None]:
import inspect

# Show the constructor parameters (with defaults) of both TF-IDF classes.
# inspect.getargspec() was removed in Python 3.11; inspect.signature() is its replacement.
print(inspect.signature(TfidfVectorizer))
print('='*50)
print(inspect.signature(TfidfTransformer))

In [None]:
# Fit a TF-IDF vectorizer over unigrams and bigrams, with min_df=10 as document-frequency cut-off.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(preprocessed_reviews)
# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out() when the
# installed release has it, and fall back to the old accessor on releases that predate it.
tfidf_feature_names = getattr(tf_idf_vect, 'get_feature_names_out', None) or tf_idf_vect.get_feature_names
print("some sample features ",[str(name) for name in tfidf_feature_names()[0:10]])

In [None]:
tf_idf_forinput = tf_idf_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(tf_idf_forinput))
print("the shape of out text TFIDF vectorizer ",tf_idf_forinput.get_shape())
print("the number of unique words including both unigrams and bigrams ", tf_idf_forinput.get_shape()[1])

> tf-idf still doesn't take synonyms/ almost similar words into considerations eg: tasty = delicious, cheap = affordable

### Word2Vec

> https://en.wikipedia.org/wiki/Word2vec <br/>
> https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf <br/>
> https://arxiv.org/pdf/1301.3781.pdf <br/>
> https://www.researchgate.net/publication/281812760_TwoToo_Simple_Adaptations_of_Word2Vec_for_Syntax_Problems <br/>
> http://jalammar.github.io/illustrated-word2vec/ <br/>
> https://www.researchgate.net/publication/321709086_How_Does_Word2Vec_Work <br/>
> https://arxiv.org/vc/arxiv/papers/1603/1603.04259v2.pdf <br/>
> https://www.academia.edu/33141616/Novel2Vec_Characterising_19th_Century_Fiction_via_Word_Embeddings <br/>
> https://arxiv.org/pdf/1310.4546.pdf <br/>
> https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/rvecs.pdf <br/>
> https://en.wikipedia.org/wiki/Vector_quantization <br/>
> http://www.ws.binghamton.edu/fowler/fowler%20personal%20page/EE523_files/Ch_10_1%20VQ%20Description%20(PPT).pdf <br/>
> https://www.youtube.com/watch?v=5PL0TmQhItY <br/>
> https://www.youtube.com/watch?v=ERibwqs9p38 <br/>
> https://www.tensorflow.org/tutorials/text/word_embeddings <br/>
> <br/>

In [None]:
# Split every cleaned review on whitespace into a list of tokens (progress shown by tqdm).
list_of_sentence = [review.split() for review in tqdm(preprocessed_reviews)]

### Training your own Word2Vec

In [None]:
import gensim
print([w for w in dir(gensim.models) if not w.startswith('_')])

In [None]:
from gensim.models import Word2Vec
# Show the Word2Vec constructor parameters. inspect.getargspec() was removed in
# Python 3.11; inspect.signature() is its replacement.
print(inspect.signature(Word2Vec))

In [None]:
# Train 50-dimensional word vectors on the tokenised reviews with min_count=5.
# gensim 4 renamed the dimensionality argument from `size` to `vector_size`: try the new
# spelling first and fall back to the old one when the installed gensim rejects it.
try:
    word2vec_model = Word2Vec(list_of_sentence, min_count=5, vector_size=50)
except TypeError:
    word2vec_model = Word2Vec(list_of_sentence, min_count=5, size=50)

In [None]:
print([w for w in dir(word2vec_model) if not w.startswith('_')])

In [None]:
print([w for w in dir(word2vec_model.wv) if not w.startswith('_')])

In [None]:
# Words kept in the trained model's vocabulary.
# gensim 4 exposes the vocabulary as `wv.key_to_index`; earlier releases expose it as `wv.vocab`.
keyed_vectors = word2vec_model.wv
if hasattr(keyed_vectors, 'key_to_index'):
    word2vec_words = list(keyed_vectors.key_to_index)
else:
    word2vec_words = list(keyed_vectors.vocab)
print("number of words that occured minimum 5 times ",len(word2vec_words))
print("sample words ", word2vec_words[0:50])

### Using Google's pre-trained Google News Word2Vec vectors
> https://radimrehurek.com/gensim/models/keyedvectors.html

In [None]:
# from gensim.models import KeyedVectors
# print(inspect.getargspec(KeyedVectors))
# print([w for w in dir(KeyedVectors) if not w.startswith('_')])

In [None]:
# filepath = '/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

# embeddings_index = {}
# Google_Word2Vec = KeyedVectors.load_word2vec_format(filepath, binary=True)
# print([w for w in dir(Google_Word2Vec)])

In [None]:
# for word,vector in zip(Google_Word2Vec.vocab, Google_Word2Vec.vectors):
#     coefs = np.asarray(vector, dtype='float32')
#     embeddings_index[word] = coefs
# Google_word2vec_words = list(Google_Word2Vec.wv.vocab)

In [None]:
# test_word = 'Movie'
# print(len(embeddings_index[test_word]))
# embeddings_index[test_word]

### Compare results of Our Own Word2Vec trained on input data vs. Google News Word2Vec Example 1

### Our Own Word2Vec 

In [None]:
word2vec_model.wv.most_similar('great')

### Google's Word2Vec 

In [None]:
# Google_Word2Vec.wv.most_similar('great')

### Compare results of Our Own Word2Vec trained on input data vs. Google News Word2Vec Example 2

### Our Own Word2Vec 

In [None]:
word2vec_model.wv.most_similar('worst')

### Google's Word2Vec 

In [None]:
# Google_Word2Vec.wv.most_similar('worst')

### Converting Reviews/Sequence of Words into Vector using Average Word2Vec

In [None]:
# Represent each review by the mean of the vectors of its in-vocabulary words.
# A review with no in-vocabulary word keeps the all-zero vector.
known_words = set(word2vec_words)  # set: constant-time membership test per word
vector_dim = word2vec_model.wv.vector_size  # taken from the model, so any embedding width works
sent_vectors = []
for sent in tqdm(list_of_sentence):  # one tokenised review per iteration
    words_with_vector = [word for word in sent if word in known_words]
    sent_vec = np.zeros(vector_dim)
    for word in words_with_vector:
        sent_vec += word2vec_model.wv[word]
    if words_with_vector:
        sent_vec /= len(words_with_vector)
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

In [None]:
# Google_sent_vectors = [];
# for sent in tqdm(list_of_sentence): # for each review/sentence
#     sent_vec = np.zeros(300) # as word vectors are of zero length 300 for google's w2v
#     cnt_words =0; 
#     for word in sent:
#         if word in Google_word2vec_words:
#             vec = Google_Word2Vec.wv[word]
#             sent_vec += vec
#             cnt_words += 1
#     if cnt_words != 0:
#         sent_vec /= cnt_words
#     Google_sent_vectors.append(sent_vec)
# print(len(Google_sent_vectors))
# print(len(Google_sent_vectors[0]))

# ML Modelling Phase

### This is just the first draft version. Will include some more of my own code with lots of updates in coming weeks

In [None]:
# import pandas as pd
# Reviews = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv")