# CS-386 Artificial Intelligence Lab

# Project - 3 : Mining E-Commerce Customer Reviews

# Shivam Pandey (160010003)

### Question 1: Preprocess the corpus of customer reviews dataset

In [3]:
#-----------------------------------------------------------------------------------------------------
            ## Importing all necessary python libraries
#-----------------------------------------------------------------------------------------------------
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine
import os
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import time


#-----------------------------------------------------------------------------------------------------
            ## Pre-processing entails "Removing Punctuations", "Lower-Casing"
#-----------------------------------------------------------------------------------------------------
Clothing_ID = 862 # Only the reviews written for this product form the corpus

corpus_pandas_dataFrame = pd.read_csv("../input/Womens Clothing E-Commerce Reviews.csv")

# Keep the rows of the selected product and drop those without any review text,
# so that every remaining entry of "Review Text" can be processed below.
corpus_with_particular_ID = corpus_pandas_dataFrame[corpus_pandas_dataFrame["Clothing ID"] == Clothing_ID]
corpus_with_particular_ID = corpus_with_particular_ID.dropna(subset=['Review Text'])
review_text = corpus_with_particular_ID["Review Text"]

corpus = list() # This will be a list of strings, where each string will be a "document"

## Pre-processing Corpus
for each_review in review_text:
    preprocessed_review = each_review.lower() # Lower-casing each document
    # Apostrophes are deleted (not spaced) so that contractions stay a single token
    # ("don't" -> "dont"), which is the same form the stop-word list is reduced to
    # when its punctuation is stripped.
    preprocessed_review = re.sub(r"['’]", '', preprocessed_review)
    # Every other run of non-letters (punctuation, digits, newlines) becomes ONE space.
    # Deleting them outright would fuse neighbouring words, e.g. "soft.fits" -> "softfits".
    preprocessed_review = re.sub(r'[^a-z]+', ' ', preprocessed_review).strip()
    corpus.append(preprocessed_review)

print(corpus) # print the pre-processed corpus


FileNotFoundError: File b'../input/Womens Clothing E-Commerce Reviews.csv' does not exist

### Question 2: Remove stopwords, standardize tokens

In [None]:
#-----------------------------------------------------------------------------------------------------
                ## Stemming and Lemmatization - Applying to the pre-processed corpus
#-----------------------------------------------------------------------------------------------------
lemmatizer = WordNetLemmatizer()

# Replace every whitespace-separated token of every document by its lemma; the
# documents stay plain space-joined strings so CountVectorizer can tokenize them.
corpus = [
    ' '.join(lemmatizer.lemmatize(word) for word in document.strip().split())
    for document in corpus
]


#-----------------------------------------------------------------------------------------------------
                                # STOPWORDS Removal from the Corpus #
#-----------------------------------------------------------------------------------------------------
## Pre-processing Stopwords
# The stop words must go through the same normalisation as the corpus, otherwise
# they no longer match it: lower-casing, punctuation stripping AND lemmatization
# (the corpus was lemmatized above, so both the plain and the lemmatized form of
# each stop word are kept).
normalised_stopWords = set()
for stopWord in stopwords.words('english'):
    stopWord = re.sub(r'[^A-Za-z ]', '', stopWord.lower()) # Lower-casing, removing Punctuations
    if stopWord:
        normalised_stopWords.add(stopWord)
        normalised_stopWords.add(lemmatizer.lemmatize(stopWord))
stopWords = sorted(normalised_stopWords)

# min_df=1 keeps every term that occurs in at least one document. This is the same
# vocabulary the former integer 0 produced, but 0 is rejected by the parameter
# validation of recent scikit-learn releases.
vectorizer = CountVectorizer(min_df=1, stop_words=stopWords, strip_accents='ascii')


#-----------------------------------------------------------------------------------------------------
                        ## Building the Vocabulary out of the corpus 
#-----------------------------------------------------------------------------------------------------
docs_tf = vectorizer.fit_transform(corpus)

# get_feature_names() was removed from scikit-learn; prefer its replacement when it
# exists and convert to a plain list of str so list methods such as .index() work.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_terms = vectorizer.get_feature_names()
print(vocabulary_terms) # Its a list of "vocabulary words" developed from the corpus

key_words = ['lovely','top']

### Question 3: Build the Term-Frequency Inverse-Document-Frequency (TF-IDF) matrix and apply the Latent Semantic Analysis (LSA) method

In [None]:

#-------------------------------------------------------------------------------------------------------
        ## Building the Term-Frequency Inverse-Document-Frequency (TF-IDF) matrix and performaing IR
#-------------------------------------------------------------------------------------------------------
# Composite model for both the corpus and the query 
docs_query_tf = vectorizer.transform(corpus + [' '.join(key_words)]) 
transformer = TfidfTransformer(smooth_idf = False)
tfidf = transformer.fit_transform(docs_query_tf.toarray())

# D(no. of documents) x V(cardinality of vocabulary set) document-term matrix
tfidf_matrix = tfidf.toarray()[:-1] # Excluding the last column which contains the 
print(len(tfidf_matrix))
# 1 x V query-term vector 
query_tfidf = tfidf.toarray()[-1] 

#print (tfidf_matrix)
#print (query_tfidf)

TFIDF_start_time = time.time()
query_doc_tfidf_cos_dist = [cosine(query_tfidf, doc_tfidf) for doc_tfidf in tfidf_matrix]
query_doc_tfidf_sort_index = np.argsort(np.array(query_doc_tfidf_cos_dist))
TFIDF_end_time = time.time()

#print(query_doc_tfidf_sort_index)

print("#-------------------------------------------------------------------------------------------------------\n\t## Building the Term-Frequency Inverse-Document-Frequency (TF-IDF) matrix and performaing IR\n#-------------------------------------------------------------------------------------------------------")
for rank, sort_index in enumerate(query_doc_tfidf_sort_index):
    print (rank, query_doc_tfidf_cos_dist[sort_index], corpus[sort_index])



#-----------------------------------------------------------------------------------------------------
                ## Performing IR using LSA with SVD on TF-matrix
#-----------------------------------------------------------------------------------------------------
tf_matrix = docs_tf.toarray() # D x V matrix 
A = tf_matrix.T 

U, s, V = np.linalg.svd(A, full_matrices=1, compute_uv=1)

K = 2 # number of components

A_reduced = np.dot(U[:,:K], np.dot(np.diag(s[:K]), V[:K, :])) # V x D matrix (Reconstruction Matrix)
docs_rep = np.dot(np.diag(s[:K]), V[:K, :]).T # D x K matrix 
terms_rep = np.dot(U[:,:K], np.diag(s[:K])) # V x K matrix 

key_word_indices = [vocabulary_terms.index(key_word) for key_word in key_words] # vocabulary indices 
key_words_rep = terms_rep[key_word_indices,:]
query_rep = np.sum(key_words_rep, axis = 0)
#print (query_rep)

TF_LSA_start_time = time.time()
query_doc_cos_dist = [cosine(query_rep, doc_rep) for doc_rep in docs_rep]
query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))
TF_LSA_end_time = time.time()

print("#-----------------------------------------------------------------------------------------------------\n\t## Performing IR using LSA with SVD on TF-matrix\n#-----------------------------------------------------------------------------------------------------")
for rank, sort_index in enumerate(query_doc_sort_index):
    print (rank, query_doc_cos_dist[sort_index], corpus[sort_index])

%matplotlib inline
import matplotlib.pyplot as plt
plt.plot()
plt.scatter(docs_rep[:,0], docs_rep[:,1], c=query_doc_cos_dist) # all documents 
plt.scatter(query_rep[0], query_rep[1], marker='+', c='red') # the query 
plt.xlabel("Component 1")
plt.ylabel("Component 2")

print("TFIDF Running Time :",TFIDF_end_time-TFIDF_start_time,"\nLSA on TF-matrix running time :",TF_LSA_end_time-TF_LSA_start_time,"\n")