In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
import math
from prettytable import PrettyTable
from scipy import sparse
import csv

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
%%time
con = sqlite3.connect('database.sqlite')
raw_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con)
raw_data = raw_data.sample(150000, random_state=42)
raw_data = raw_data.sort_values('Time')
print(raw_data.shape)

(150000, 10)
Wall time: 16.6 s


In [3]:
def preprocess(data):
    '''
    Clean the raw review frame and return a NEW frame with the derived columns.
    The frame passed in is left untouched, so the function can be called
    repeatedly on the same input.

    Steps :-
    1. Map 'Score' to 0 (-ve Review, score < 3) or 1 (+ve Review).
    2. Add 'TextLength' = natural log of the whitespace-separated word count of
       'Text' (0.0 for a review with no words, where the log is undefined).
    3. Drop duplicate rows based on - UserId, ProfileName, Time and Text.
    4. Drop rows whose HelpfulnessNumerator is greater than HelpfulnessDenominator.
    5. Replace html tags in the text with a space.
    6. Replace punctuation and special characters with a space.
    7. Keep alphabetic, non-stopword words longer than 2 characters ('not' is
       deliberately kept) and stem them with the Snowball stemmer.
    8. From 'Summary', keep tokens POS-tagged JJ*, RB*, VB* or NNP, stem them and
       append them to the cleaned review text; the result is 'CleanedText'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns Score, Text, Summary, UserId, ProfileName, Time,
        HelpfulnessNumerator and HelpfulnessDenominator.

    Returns
    -------
    pandas.DataFrame
        De-duplicated, filtered copy of `data` with updated 'Score' and the added
        'TextLength' and 'CleanedText' columns.
    '''
    # .assign builds a new frame, so the caller's 'Score' column keeps its original
    # values (re-mapping already binarised scores would turn every label into 0).
    data = data.assign(
        Score=data['Score'].map(lambda x: 0 if x < 3 else 1),
        # max(..., 1) guards math.log(0) for reviews that contain no words.
        TextLength=data['Text'].apply(lambda x: math.log(max(len(x.split()), 1))),
    )

    data = data.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'])
    data = data[data['HelpfulnessNumerator'] <= data['HelpfulnessDenominator']]

    stops = set(stopwords.words('english')) - set(['not'])
    snow = SnowballStemmer('english')

    # Patterns are compiled once instead of on every call of the helpers below.
    html_tag = re.compile('<.*?>')
    # NOTE: '|' inside a character class is a literal, so pipes are replaced too.
    punct_first = re.compile(r'[?|!|\'|"|:|#]')
    punct_second = re.compile(r'[.|,|)|(|\|/]')

    # Function to clean the sentence of any html-tags
    def cleanhtml(sentence):
        return html_tag.sub(' ', sentence)

    # Function to clean the sentence of any punctuation or special characters
    def cleanpunc(sentence):
        return punct_second.sub(' ', punct_first.sub(' ', sentence))

    filtered_reviews = []
    for review in data['Text'].values:
        words = cleanpunc(cleanhtml(review)).split()
        filtered_reviews.append(' '.join(
            snow.stem(word.lower()) for word in words
            if word.isalpha() and len(word) > 2 and word.lower() not in stops))

    tags = []
    for summary in data['Summary'].values:
        tagged = nltk.pos_tag(word_tokenize(cleanpunc(cleanhtml(summary))))
        kept = [token for token, pos in tagged
                if pos.startswith(('JJ', 'RB', 'VB')) or pos == 'NNP']
        # Always store a string: an empty list here would later be rendered as the
        # literal token '[]' inside CleanedText.
        tags.append(' '.join(
            snow.stem(token.lower()) for token in kept
            if len(token) > 2 and token.lower() not in stops))

    print('Total Data-points :-',len(filtered_reviews))
    # The separating space is unconditional so a row never becomes an empty string.
    cleaned_text = [text + ' ' + tag for text, tag in zip(filtered_reviews, tags)]
    return data.assign(CleanedText=cleaned_text)

In [4]:
def get_data(data_points, data=None):
    '''
    Perform pre-processing on raw data and extract required datapoints :: 70-30 Split.

    Parameters
    ----------
    data_points : int
        Total number of pre-processed rows to use; the first 70% become the
        training set and the remainder (up to `data_points`) the test set.
    data : pandas.DataFrame, optional
        Raw review frame to pre-process. Defaults to the notebook-level
        `raw_data`, which keeps existing `get_data(n)` calls working.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Row slices of the pre-processed frame, in its existing row order.
    '''
    source = raw_data if data is None else data
    split = int(data_points * 0.70)
    # Defensive copy: the shared raw frame keeps its original Score values even
    # if pre-processing writes to its argument, so repeated calls with different
    # sizes stay correct.
    cleaned = preprocess(source.copy())
    train = cleaned[:split]
    test = cleaned[split:data_points]

    # Class balance of each split, printed for a quick sanity check.
    print(train['Score'].value_counts())
    print(test['Score'].value_counts())
    return train, test

In [5]:
%%time
# Build the 20,000-row sample; get_data() returns it as a 70/30 train/test split.
data_train, data_test = get_data(20000)
# NOTE(review): get_data() reads the notebook-level raw_data, and later cells call
# get_data() again. After this del those calls raise NameError unless the load
# cell is re-run first, so the notebook does not survive Restart & Run All as is.
del raw_data

Total Data-points :- 124264
1    12473
0     1527
Name: Score, dtype: int64
1    5250
0     750
Name: Score, dtype: int64
Wall time: 3min 21s


In [6]:
%%time
data_train.to_csv('./Matrices/sample_data_train.csv', index=False)
data_test.to_csv('./Matrices/sample_data_test.csv', index=False)

Wall time: 590 ms


In [5]:
%%time
# Build the 40,000-row sample (70/30 split) and write both parts to CSV.
data_train, data_test = get_data(40000)
# NOTE(review): get_data() reads the notebook-level raw_data; deleting it here means
# any later get_data() call fails with NameError until the load cell is re-run.
del raw_data

data_train.to_csv('./Matrices/sample2_data_train.csv', index=False)
data_test.to_csv('./Matrices/sample2_data_test.csv', index=False)

Total Data-points :- 124264
1    24674
0     3326
Name: Score, dtype: int64
1    10304
0     1696
Name: Score, dtype: int64
Wall time: 6min 45s


In [7]:
%%time
cv_model = CountVectorizer(max_features=15000, min_df=5)
bow_counts_train = cv_model.fit_transform(data_train['CleanedText'].values)
bow_counts_test = cv_model.transform(data_test['CleanedText'].values)
print(bow_counts_train.shape, bow_counts_test.shape)

(14000, 4968) (6000, 4968)
Wall time: 1.48 s


In [8]:
# Store the sample bag-of-words matrices in scipy's sparse .npz format.
for _path, _matrix in (
    ('./Matrices/sample_bow_train.npz', bow_counts_train),
    ('./Matrices/sample_bow_test.npz', bow_counts_test),
):
    sparse.save_npz(_path, _matrix)

In [9]:
%%time
# Build the 80,000-row data set; get_data() returns it as a 70/30 train/test split.
data_train, data_test = get_data(80000)
# NOTE(review): get_data() reads the notebook-level raw_data, so after this del no
# further get_data() call can run without re-executing the load cell.
del raw_data

Total Data-points :- 124264
1    48338
0     7662
Name: Score, dtype: int64
1    19901
0     4099
Name: Score, dtype: int64
Wall time: 3min 28s


In [10]:
%%time
data_train.to_csv('./Matrices/data_train.csv', index=False)
data_test.to_csv('./Matrices/data_test.csv', index=False)

Wall time: 4.36 s


In [7]:
%%time
cv_model = CountVectorizer(max_features=15000, min_df=5)
bow_counts_train = cv_model.fit_transform(data_train['CleanedText'].values)
bow_counts_test = cv_model.transform(data_test['CleanedText'].values)
print(bow_counts_train.shape, bow_counts_test.shape)

(56000, 9397) (24000, 9397)
Wall time: 3.27 s


In [8]:
# Store the bag-of-words matrices in scipy's sparse .npz format.
for _path, _matrix in (
    ('./Matrices/bow_train.npz', bow_counts_train),
    ('./Matrices/bow_test.npz', bow_counts_test),
):
    sparse.save_npz(_path, _matrix)

In [9]:
%%time
tfidf_model = TfidfVectorizer(max_features=15000, min_df=5)
tfidf_train = tfidf_model.fit_transform(data_train['CleanedText'].values)
tfidf_test = tfidf_model.transform(data_test['CleanedText'].values)
print(tfidf_train.shape, tfidf_test.shape)

(56000, 9397) (24000, 9397)
Wall time: 3.34 s


In [10]:
# Store the TF-IDF matrices in scipy's sparse .npz format.
for _path, _matrix in (
    ('./Matrices/tfidf_train.npz', tfidf_train),
    ('./Matrices/tfidf_test.npz', tfidf_test),
):
    sparse.save_npz(_path, _matrix)

In [11]:
%%time
def avg_w2v(data_train, data_test):
    '''
    Build average-Word2Vec sentence vectors for the train and test reviews.

    A 50-dimensional Word2Vec model is trained on the training 'CleanedText'
    only; the same model then encodes both splits. Each review becomes the mean
    of the vectors of its in-vocabulary words (counting repeats); a review with
    no in-vocabulary word becomes the zero vector.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Frames with a 'CleanedText' column of whitespace-separated tokens.

    Returns
    -------
    (sent_vectors_train, sent_vectors_test) : tuple of list of numpy.ndarray
        One length-50 vector per row, in row order.
    '''
    dim = 50

    train_list_of_sent = [sent.split() for sent in data_train['CleanedText'].values]
    test_list_of_sent = [sent.split() for sent in data_test['CleanedText'].values]

    # Build W2V model based on Train Data only.
    # NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x API; newer
    # gensim releases are believed to use other names - confirm the installed version.
    w2v_model = Word2Vec(train_list_of_sent, min_count=5, size=dim, workers=4)
    # A set gives constant-time membership tests; the previous list made every
    # lookup scan the whole vocabulary.
    w2v_words = set(w2v_model.wv.vocab)

    def _mean_vector(tokens):
        # Mean of the vectors of the in-vocabulary tokens, or zeros if there are none.
        sent_vec = np.zeros(dim)
        count_words = 0
        for word in tokens:
            if word in w2v_words:
                sent_vec += w2v_model.wv[word]
                count_words += 1
        if count_words != 0:
            sent_vec /= count_words
        return sent_vec

    sent_vectors_train = [_mean_vector(sent) for sent in train_list_of_sent]
    # Test reviews are encoded with the model trained on the train data.
    sent_vectors_test = [_mean_vector(sent) for sent in test_list_of_sent]

    return sent_vectors_train, sent_vectors_test

# Encode both splits and report how many sentence vectors each one produced.
sent_vectors_train, sent_vectors_test = avg_w2v(data_train, data_test)
print(*(len(vectors) for vectors in (sent_vectors_train, sent_vectors_test)))

56000 24000
Wall time: 3min 11s


In [12]:
# Write the average-W2V vectors to CSV, one row per review.
for _vectors, _path in (
    (sent_vectors_train, './Matrices/avg_w2v_train.csv'),
    (sent_vectors_test, './Matrices/avg_w2v_test.csv'),
):
    pd.DataFrame(_vectors).to_csv(_path, index=False)

In [13]:
%%time
def tfidf_w2v(data_train, data_test):
    '''
    Build TF-IDF weighted Word2Vec sentence vectors for the train and test reviews.

    The idf weights and a 50-dimensional Word2Vec model are both fitted on the
    training 'CleanedText' only and reused for the test split. For every
    in-vocabulary word occurrence the weight is
    idf(word) * count(word in review) / len(review), with idf taken as 1.0 for a
    word the TF-IDF vocabulary does not contain. A review's vector is the
    weighted sum of its word vectors divided by the sum of the weights; a review
    with no in-vocabulary word becomes the zero vector.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Frames with a 'CleanedText' column of whitespace-separated tokens.

    Returns
    -------
    (tfidf_sent_vectors_train, tfidf_sent_vectors_test) : tuple of list of numpy.ndarray
        One length-50 vector per row, in row order.
    '''
    dim = 50

    # Only the idf weights are needed, so the vectorizer is fitted without
    # materialising any document-term matrix (the dense copies made before were
    # never read and could exhaust memory).
    tfidf = TfidfVectorizer()
    tfidf.fit(data_train['CleanedText'].values)
    # vocabulary_ maps each term to its column, which indexes idf_.
    words_dict = {term: tfidf.idf_[col] for term, col in tfidf.vocabulary_.items()}

    train_list_of_sent = [sent.split() for sent in data_train['CleanedText'].values]
    test_list_of_sent = [sent.split() for sent in data_test['CleanedText'].values]

    # Build W2V model based on Train Data only.
    # NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x API; newer
    # gensim releases are believed to use other names - confirm the installed version.
    w2v_model = Word2Vec(train_list_of_sent, min_count=5, size=dim, workers=4)
    # Set membership is constant-time; a list would be scanned for every word.
    w2v_words = set(w2v_model.wv.vocab)

    def _weighted_vector(tokens):
        # TF-IDF weighted mean of the in-vocabulary word vectors of one review.
        # Count each token once up front instead of calling list.count per word.
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1

        sent_vec = np.zeros(dim)
        weighted_sum = 0
        for word in tokens:
            if word in w2v_words:
                tf_idf = words_dict.get(word, 1.0) * (counts[word] / len(tokens))
                sent_vec += (w2v_model.wv[word] * tf_idf)
                weighted_sum += tf_idf
        if weighted_sum != 0:
            sent_vec /= weighted_sum
        return sent_vec

    tfidf_sent_vectors_train = [_weighted_vector(sent) for sent in train_list_of_sent]
    # Test reviews are encoded with the idf weights and W2V model fitted on train data.
    tfidf_sent_vectors_test = [_weighted_vector(sent) for sent in test_list_of_sent]

    return tfidf_sent_vectors_train, tfidf_sent_vectors_test

# Encode both splits and report how many sentence vectors each one produced.
tfidf_sent_vectors_train, tfidf_sent_vectors_test = tfidf_w2v(data_train, data_test)
print(*(len(vectors) for vectors in (tfidf_sent_vectors_train, tfidf_sent_vectors_test)))

56000 24000
Wall time: 3min 36s


In [14]:
# Write the TF-IDF weighted W2V vectors to CSV, one row per review.
for _vectors, _path in (
    (tfidf_sent_vectors_train, './Matrices/tfidf_w2v_train.csv'),
    (tfidf_sent_vectors_test, './Matrices/tfidf_w2v_test.csv'),
):
    pd.DataFrame(_vectors).to_csv(_path, index=False)