# Text Analysis

#### Dataset : Women's E-Commerce Clothing Reviews

### 1. Pre-Processing of Data
   * Remove all extra characters such as punctuation, non-alphabetic characters, etc.
   * Tokenisation
   * Lemmatisation of data (preferred over stemming, as stemming can corrupt data in some cases)

In [None]:
# Importing Required Libraries
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import sys,math, copy, time
import re
import csv
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from nltk.stem import WordNetLemmatizer
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load the reviews, discard rows without review text, and keep one product only
clothing_review = pd.read_csv("../input/Womens Clothing E-Commerce Reviews.csv")
clothing_review = clothing_review.dropna(subset=['Review Text'])
clothing_review = clothing_review[clothing_review['Clothing ID'] == 862]

# Search keywords; they are appended to the corpus below as one extra document
keyWords = ["dress", "pretty"]

# Lower-case every review and delete each character that is not an ASCII
# letter or a space (the characters are removed, not replaced by a space)
non_letter_pattern = re.compile(r'[^A-Za-z ]')
data = [
    non_letter_pattern.sub('', review.lower())
    for review in clothing_review["Review Text"]
]

# Split each cleaned review into word tokens
tokenizer = RegexpTokenizer(r'\w+')
data = [tokenizer.tokenize(review) for review in data]

# English stop words, cleaned with the same pattern as the reviews so that
# forms such as "don't" match the cleaned token "dont"
stopWords = [non_letter_pattern.sub('', word) for word in stopwords.words('english')]
stop_word_lookup = set(stopWords)

# Drop stop words, then lemmatize what is left of each review
wordnet_lemmatizer = WordNetLemmatizer()
dataFiltered = [
    [
        wordnet_lemmatizer.lemmatize(token)
        for token in review_tokens
        if token not in stop_word_lookup
    ]
    for review_tokens in data
]

# The keyword list becomes the last document of the corpus
dataFiltered.append(keyWords)

# Creating the word list: the sorted set of distinct tokens over all documents.
# The documents have different lengths, so they are flattened with a set
# comprehension instead of np.array()/np.hstack(): building an array from
# ragged lists raises ValueError on recent NumPy releases.
wordList = sorted({word for document in dataFiltered for word in document})

# Number of rows in dataFiltered (this counts the appended keyword document too)
number_of_reviews = len(dataFiltered)

# Column index of every vocabulary term
wordListIndex = {word: position for position, word in enumerate(wordList)}

# Per-term document counter, filled in while the TF matrix is built
nDocsPerWord = {word: 0 for word in wordList}





### 2. Creation of TF matrix

In [None]:
# tf[d, t] : number of times term t occurs in document d
# te[d, t] : 1 if term t occurs in document d, else 0
tf = np.zeros(shape=(number_of_reviews, len(wordList)))
te = np.zeros(shape=(number_of_reviews, len(wordList)))

# Start the document-frequency counters from zero every time this cell runs;
# otherwise re-running the cell would keep adding to the previous counts.
nDocsPerWord = {word: 0 for word in wordList}

for row, document in enumerate(dataFiltered):
    for word in document:
        column = wordListIndex[word]
        tf[row, column] += 1
        te[row, column] = 1
    # Each distinct term counts once per document it appears in
    for word in set(document):
        nDocsPerWord[word] += 1
            


### 3. Creation of TF-IDF matrix from calculated TF matrix

In [None]:
# TF-IDF = tf * log(N / df), with N = number_of_reviews and df = nDocsPerWord.
#
# The weighting is applied once per matrix cell. Looping over the tokens of a
# document instead would multiply a cell by the idf once per *occurrence*, so
# a term that appears c times in a review would end up as tf * idf**c.
doc_freq = np.array([nDocsPerWord[word] for word in wordList], dtype=float)

# A term with df == 0 has an all-zero tf column, so clamping df to 1 only
# avoids a division by zero; the resulting product is still 0.
idf = np.log(number_of_reviews / np.maximum(doc_freq, 1.0))

# Broadcasting scales every column of tf by the idf of its term
tfIdf = tf * idf

print(tfIdf.shape)

# Minimum column total a term needs in order to be kept
k = 20

# Prune te, tf and tfIdf in turn. Each matrix is filtered on its OWN column
# totals, so the three results do not necessarily keep the same columns.
pruned_matrices = []
for matrix in (te, tf, tfIdf):
    sum1 = matrix.sum(axis=0)
    print(sum1.shape)
    # Indices of the columns whose total falls below the threshold
    to_del = np.flatnonzero(sum1 < k)
    pruned = np.delete(matrix, to_del, axis=1)
    print(pruned.shape)
    pruned_matrices.append(pruned)

te, tf, tfIdf = pruned_matrices


# Dump each matrix to a text file: one row per line, and every value
# (including the last one on a line) followed by a tab.
for file_name, matrix in (("te.dat", te), ("tf.dat", tf), ("tfIdf.dat", tfIdf)):
    with open(file_name, 'w') as writefile:
        for row in matrix:
            writefile.write("".join(str(value) + "\t" for value in row))
            writefile.write("\n")

### 4. Information Retrieval

* ####  From TF-IDF matrix

# Information retrieval with TF-IDF.
# The query is the last row of tfIdf, i.e. the keyword document that was
# appended to the corpus, so it is weighted exactly like the reviews.

tfidf_start = time.time()
query = tfIdf[-1]
query_result = [cosine(i, query) for i in tfIdf]
tfidf_end = time.time()

# The last entry is the distance of the query to itself; drop it so that the
# remaining positions line up with the rows of clothing_review.
query_result.pop()

# Rank the reviews by ascending cosine distance. A full argsort is used
# because np.partition(x, 4) only guarantees that the element at position 4 is
# in its sorted place, not that the first four are ordered. Working with
# indices (instead of list.index on the value) also keeps reviews with tied
# distances distinct. NaN distances, if any, are sorted to the end.
max_iter = 5
ranked_indices = np.argsort(query_result, kind="stable")[:max_iter]
for i, min_val_index in enumerate(ranked_indices):
    min_value = query_result[min_val_index]
    print(i,") Cosine Value :",min_value,"\n", clothing_review["Review Text"].iloc[ min_val_index],"\n")

* #### From LSA using TF matrix

tf_matrix = tf   # documents x terms; the last row is the keyword document
A = tf_matrix.T  # terms x documents

# Work out which vocabulary term each column of tf stands for. If rare-term
# columns were removed from tf, its columns no longer line up with wordList,
# so indexing with positions in wordList would pick the wrong terms (or fail).
# The retained vocabulary is rebuilt with the rule the pruning step applies to
# tf: keep a term when its total count over all documents is at least k
# (k being the pruning threshold set earlier in the notebook).
if tf_matrix.shape[1] == len(wordList):
    lsa_terms = list(wordList)
else:
    term_totals = {}
    for document in dataFiltered:
        for word in document:
            term_totals[word] = term_totals.get(word, 0) + 1
    lsa_terms = [word for word in wordList if term_totals.get(word, 0) >= k]
    if len(lsa_terms) != tf_matrix.shape[1]:
        raise ValueError(
            "Cannot match the columns of tf to vocabulary terms: tf has "
            + str(tf_matrix.shape[1]) + " columns but "
            + str(len(lsa_terms)) + " terms pass the pruning threshold."
        )
term_row = {term: row for row, term in enumerate(lsa_terms)}

# Only the first K singular triplets are used, so the reduced SVD is enough
U, s, V = np.linalg.svd(A, full_matrices=False)

K = len(keyWords)  # number of components

A_reduced = np.dot(U[:, :K], np.dot(np.diag(s[:K]), V[:K, :]))  # rank-K approximation of A (terms x documents)

docs_rep = np.dot(np.diag(s[:K]), V[:K, :]).T  # documents x K
# Drop the keyword document: it is the query, not a review, and it has no row
# in clothing_review. This keeps docs_rep aligned with query_doc_cos_dist.
docs_rep = docs_rep[:-1]
terms_rep = np.dot(U[:, :K], np.diag(s[:K]))  # terms x K

# Rows of terms_rep that belong to the keywords; keywords whose column was
# pruned away cannot contribute to the query.
key_word_indices = [term_row[key_word] for key_word in keyWords if key_word in term_row]
if not key_word_indices:
    raise ValueError("None of the keywords is present in the columns of tf.")

# The query is represented by the sum of its keywords' term vectors
key_words_rep = terms_rep[key_word_indices, :]
query_rep = np.sum(key_words_rep, axis=0)

# Only the cosine-distance computation is timed, not the SVD above
svd_start = time.time()
query_doc_cos_dist = [cosine(query_rep, doc_rep) for doc_rep in docs_rep]
svd_end = time.time()
query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))

# Print the five closest reviews
max_iter = 5
for rank, sort_index in enumerate(query_doc_sort_index[:max_iter]):
    print(rank + 1, ") Cosine value : ", float(query_doc_cos_dist[sort_index]) ,"\n", clothing_review["Review Text"].iloc[sort_index],"\n")

### 5. Plotting

%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(docs_rep[:,0], docs_rep[:,1], c=query_doc_cos_dist) # all documents 
plt.scatter(query_rep[0], query_rep[1], marker='+', c='red') # the query 
plt.xlabel("Component 1")
plt.ylabel("Component 2")

### 6. Analysis of Both Methods

#### Results 
    For Input : Dress Pretty

* TF-IDF :

 1. I have two of these tops. the maroon and tan and white stripe. the material is great and the neckline lays great. it isn't too thick so i can wear it on cooler florida days. you can dress it up under a jacket or dress down with some jeans. 
 1. I love this top! the unique neck detail is awesome! i dress it up; i dress it casual. works for every adventure. more!!! 
 1. I got the red- it is a great color. this is one of those thin soft t shirts. it drapes really nicely and dresses up or dresses down. the only downside for me is that it takes a little extra effort in the washing department, but really not much- just have to line dry. 
 1. I bought this shirt in the neutral and white and love it. so many people compliment it. i usually pair it with white pants and cute wedges to dress it up (obviously with a statement necklace too). but the greatest thing about this shirt is the fact that you can also dress it down. the material is fabulous but i have not washed it yet so i am not sure if it is going to shrink (which i hope it doesn't because it is one of my favorite pieces).  unlike the other reviewer i did not think that the 
 1. I bought three of these tees. i kept the turquoise one only though. the color was pretty - a little drab but not too bad.  i love the style., the fabric is so soft and the one i wore was super comfy. the sleeves are cute without being too dramatic (like the ruffle sleeve tee - the sleeves were kind of ridiculous on me) now - the cons - the colors are pretty drab. i normally look better in more vibrant colors. the "red" i bought made me look like a corpse. it's not pretty at all. can't re 
 

* SVD using TF matrix
 
 1. Very comfortable shirt, light weight top with lovely floral colors. great for spring/summer with white jeans. love everything one september makes! 
 1. Great, comfortable tank. cool added detail in the straps. runs big so size down. i wear both a medium and large at retailer and got this in a small. 
 1. I usually wear small, ordered xs and it's still way too large. quality not that great, just meh, returning it. 
 1. I bought this shirt with the intentions of using it as a nursing top. to that effect, i'm happy with the purchase, and will get a lot of use out of it. i bought two on blush and one blue at the same time. however, i was surprised that the shirt was so open and loose. i thought it was just a deep v-neck, but the design actually really opens up once the hook and eye is undone at the top. you are then very exposed, there is nothing but an opened front shirt. this is convenient for a nursing mother, 
 1. Ordered online, just received this vest, so disappointed! love the garment but it is way too small. i wear a medium in most clothing and size 6 in a dress, but the medium in this vest isn't just snug it is definitely too small, even worn alone, i can't button it. i am now ordering the large and keeping my fingers crossed as i want to wear a tee under it. hoping i won't need to return both the m and the l, thinking of ordering an xl.

#### Time

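   The following code shows that the SVD-based query takes less time than the TF-IDF query. Note that both timers wrap only the cosine-distance computation; the time needed to build the TF-IDF matrix or to compute the SVD itself is not included.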

# Elapsed wall-clock time of each timed query, in seconds
tfidf_elapsed = tfidf_end - tfidf_start
svd_elapsed = svd_end - svd_start

print("TFIDF : ", tfidf_elapsed)
print("SVD with TF : ", svd_elapsed)

#### Space

   LSA takes less space than TFIDF. In TFIDF we have to store an 'n x m' matrix (n = number of documents, m = number of words), whereas in LSA we only have to store the 'n x k' matrix (k = the reduced dimensionality) and the 'k x k' diagonal matrix of singular values.

#### Comparing the output behaviour

The following code compares the cosine output from both methods. We can see that the LSA curve changes more gradually, i.e. it is able to identify similarities that TF-IDF was not able to identify.

# Plot the sorted cosine distances of both methods against their rank.
# Sorted COPIES are plotted: sorting query_result / query_doc_cos_dist in
# place would break the position-to-document correspondence that other cells
# rely on (e.g. query_doc_cos_dist is used as the per-document colour of the
# scatter plot). np.sort places NaN values, if any, at the end.
tfidf_curve = np.sort(np.asarray(query_result, dtype=float))
lsa_curve = np.sort(np.asarray(query_doc_cos_dist, dtype=float))

plt.plot(range(len(tfidf_curve)), tfidf_curve, "-o", label="TFIDF")
plt.plot(range(len(lsa_curve)), lsa_curve, "-o", label="SVD with TF")
plt.legend(loc='best')
plt.ylabel("Cosine output")
plt.xlabel("Rank")
plt.show()
print()

* Curvature 
 * Observation : We can see that the LSA curve changes more gradually than the TFIDF curve, which saturates after a point.
 * Inference : TFIDF loses the ability to identify similarities beyond a certain point (theoretically, TFIDF can only detect similarity in documents where the query words actually appear).
* Position of each curve
 * Observation : The SVD curve lies more flat towards the origin i.e. it has more values closer to 0.
 * Inference : The SVD algorithm identifies each document to be more similar to the input query as compared to TFIDF. 
 

### Conclusion

 
In terms of computational resources and time, SVD is better than TFIDF.
In terms of the output, each of the above methods is good in its own way.
 
 The TF-IDF method returns only those documents in which the keywords appear. This is good for people who are searching for keywords and want them to be present in the document regardless of what the document means.
 
 SVD-TF, on the other hand, returns not only the documents in which both words occur but also documents that bear the same meaning as the keywords. SVD grasps the meaning of the document. This is good for people who are searching for texts that bear the same meaning as the keywords, rather than for the mere presence of the keywords.