# Loading the data


In [None]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics 
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer



In [None]:
# Open a connection to the SQLite database file that holds the reviews.
con = sqlite3.connect('amazon_data/database.sqlite')

# Load only the positive and negative reviews: rows with Score == 3 are
# excluded by the query.
filtered_data = pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score != 3
""",con)

In [None]:
# Display the loaded reviews (every row with Score != 3).
filtered_data

In [None]:
# Separate positive and negative reviews.
def partition(x):
    """Map a numeric review score to a sentiment label.

    Scores below 3 are labelled 'negative'; every other score is labelled
    'positive'.
    """
    return 'negative' if x < 3 else 'positive'


In [None]:
# Shape (rows, columns) of the data loaded with Score != 3.
filtered_data.shape

In [None]:
# Replace the numeric Score column with the label returned by partition().
# NOTE: after this assignment Score holds strings, while partition()
# compares its argument with the integer 3, so partition() must not be
# mapped over the relabelled column again.
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative

In [None]:
# Preview the first rows with the relabelled Score column.
filtered_data.head()

# Data cleaning: deduplication

In [None]:
# Example for the deduplication step: all non-neutral reviews written by a
# single user, ordered by product.
# The user id is written as a single-quoted SQL string literal. Double
# quotes denote an identifier in SQL; SQLite only accepts them as a string
# through a legacy fallback that can be disabled at build time.
# NOTE(review): the name `display` shadows IPython's display() helper when
# this runs inside a notebook session.
display = pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score != 3 AND UserId = 'AR5J8UI46CURR'
ORDER BY ProductID
""",con)
display

In [None]:
# Sort the reviews by ProductId in ascending order; the sorted frame is the
# input of the deduplication step.
sorted_data = filtered_data.sort_values('ProductId',axis=0,ascending=True)


In [None]:
# Drop repeated reviews: rows sharing the same UserId, ProfileName, Time and
# Text count as duplicates, and only the first occurrence in sorted_data is
# kept.
dedup_columns = ["UserId", "ProfileName", "Time", "Text"]
final = sorted_data.drop_duplicates(
    subset=dedup_columns,
    keep='first',
    inplace=False,
)
final.shape

In [None]:
# Percentage of the rows of filtered_data that are still present in final.
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

In [None]:
# Some rows have a HelpfulnessDenominator smaller than their
# HelpfulnessNumerator; show two such reviews.
# The ids are matched with IN so that the Score filter applies to both of
# them: in SQL, AND binds tighter than OR, so
# "Score != 3 AND Id=44737 OR Id=64422" would be evaluated as
# "(Score != 3 AND Id=44737) OR Id=64422".
display = pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score != 3 AND Id IN (44737, 64422)
ORDER BY ProductID
""",con)
display

In [None]:
# Keep only the rows whose HelpfulnessNumerator does not exceed their
# HelpfulnessDenominator.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

In [None]:
# Shape of the data after deduplication and the helpfulness filter.
print(final.shape)

In [None]:
# Number of reviews for each Score label.
final['Score'].value_counts()

# Bag of words

In [None]:
# Fit a CountVectorizer with default settings on the raw review text and
# build the bag-of-words count matrix.
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
# Check the type of the object returned by fit_transform.
type(final_counts)

In [None]:
# Dimensions of the bag-of-words count matrix.
final_counts.get_shape()

# Text preprocessing (Stemming, stopword removal and lemmatization)

In [None]:
# Locate the first review whose text contains an HTML tag, then print its
# position and its text.
import re
i = 0
for sent in final['Text'].values:
    if re.search('<.*?>', sent) is not None:
        print(i)
        print(sent)
        break
    i += 1

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('stopwords')

In [None]:
# English stop words from the NLTK corpus, held in a set; displayed below.
stop = set(stopwords.words('english'))
stop

In [None]:
# Create an English Snowball stemmer (nltk) and display it.
sno = nltk.stem.SnowballStemmer('english')
sno

In [None]:
# Strip HTML tags from a sentence.
def cleanhtml(sentence):
    """Return `sentence` with every HTML-like tag replaced by one space.

    A tag is the shortest run that starts with '<' and ends with '>' on the
    same line ('.' does not match a newline).
    """
    return re.sub('<.*?>', ' ', sentence)

In [None]:
# Strip punctuation characters from a word or sentence.
def cleanpunc(sentence):
    """Return `sentence` without the characters ? | ! ' " # . , ) ( /.

    Each listed character is deleted outright; nothing is inserted in its
    place.
    """
    punctuation = r'[?|!\'"#.,)(/]'
    return re.sub(punctuation, '', sentence)

In [None]:
# Quick check of the stemmer on a sample word.
print(sno.stem('tasty'))

In [None]:
# Clean every review and collect the stemmed vocabulary per sentiment.
#
# For each review: strip HTML tags, split into words, strip punctuation,
# keep purely alphabetic words longer than two characters, lowercase and
# stem them.
#
# Results:
#   final_string        one cleaned review (bytes, words joined by b" ")
#                       per row of `final`, in row order
#   all_positive_words  stemmed words of the reviews labelled 'positive'
#   all_negative_words  stemmed words of the reviews labelled 'negative'
#
# NOTE(review): the `stop` set built earlier is not applied here, although
# the section heading mentions stopword removal -- confirm whether stop
# words are meant to be filtered out at this point.
final_string = []
all_positive_words = []
all_negative_words = []

# Read the label array once instead of rebuilding it for every word.
scores = final['Score'].values

for sent, score in zip(final['Text'].values, scores):
    filtered_sentence = []
    for w in cleanhtml(sent).split():
        for cleaned_words in cleanpunc(w).split():
            if cleaned_words.isalpha() and len(cleaned_words) > 2:
                # Stemmed words are stored as UTF-8 bytes, so every cleaned
                # review below is a bytes object as well.
                s = sno.stem(cleaned_words.lower()).encode('utf8')
                filtered_sentence.append(s)
                if score == 'positive':
                    all_positive_words.append(s)
                elif score == 'negative':
                    all_negative_words.append(s)
    final_string.append(b" ".join(filtered_sentence))
        
        
        
        
        

In [None]:
# Add the cleaned text as a new column; final_string holds one entry per
# row of final['Text'], in the same order.
final['CleanedText'] = final_string

In [None]:
# Preview the first three rows, including the new CleanedText column.
final.head(3)

In [None]:
# Store the cleaned table in a separate SQLite database for future use.
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
# The former `flavor-None` argument was a typo that evaluated the
# expression `flavor - None` (NameError), and current pandas has no
# `flavor` parameter in to_sql, so it is dropped.
# An existing 'Reviews' table in final.sqlite is replaced.
final.to_sql('Reviews', conn, schema=None, if_exists='replace')

# Bi-grams and n-grams

In [None]:
# Frequency distribution of the stemmed words, separately for positive and
# negative reviews; print the 20 most common words of each.
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
# The first label used to read "frequent"; it reports the positive-word
# distribution, mirroring the negative label below.
print("Most common positive words:", freq_dist_positive.most_common(20))
print("Most common negative words:", freq_dist_negative.most_common(20))

In [None]:
# Stop words such as "not" should not be removed before building n-grams,
# so the vectorizer is fitted on the raw review text.
# ngram_range must be passed as a keyword with a (min_n, max_n) tuple;
# `ngram_range(1,2)` was a call to an undefined name.
count_vect = CountVectorizer(ngram_range=(1, 2))
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
# Dimensions of the count matrix built with the (1, 2) n-gram range.
final_bigram_counts.get_shape() 

# TF-IDF

In [None]:
# Fit a TF-IDF vectorizer with an n-gram range of (1, 2) on the raw review
# text. The keyword is `ngram_range`; `n_gram_range` is not a
# TfidfVectorizer parameter and raises a TypeError.
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [None]:
# Dimensions of the TF-IDF matrix. `shape` is a tuple attribute, not a
# method, so it must not be called.
final_tf_idf.shape

In [None]:
# Feature names of the fitted TF-IDF vectorizer, indexed by column.
# get_feature_names() was removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    features = tf_idf_vect.get_feature_names_out()
else:
    features = tf_idf_vect.get_feature_names()
len(features)

In [None]:
# Inspect a slice of ten feature names.
features[100000:100010]

In [None]:
# Print the dense TF-IDF vector of the row at index 3.
print(final_tf_idf[3,:].toarray()[0])

In [None]:
def top_tf_idf_feats(row,features,top_n=25):
    """Return the `top_n` highest-scoring features of one TF-IDF row.

    Parameters
    ----------
    row : 1-D array of scores, one per feature.
    features : sequence of feature names, indexable by position in `row`.
    top_n : maximum number of features to return.

    Returns
    -------
    pandas.DataFrame with the columns 'feature' and 'tfidf', ordered from
    the highest to the lowest score. The frame is empty, but still has both
    columns, when nothing is selected (empty `row` or `top_n` of 0).
    """
    # argsort is ascending, so reverse it before taking the first top_n.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty list;
    # assigning df.columns afterwards raises a length-mismatch ValueError.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

# Top 25 TF-IDF features of the row at index 1.
top_tfidf = top_tf_idf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [None]:
# Display the resulting feature/score table.
top_tfidf