<a href="https://colab.research.google.com/github/poziryna84/Sentiment_Analysis_LR/blob/master/Comments_cosine_similarity_TF_IDF_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer  
from nltk.tokenize import TweetTokenizer 
import string
import math

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Loading and Viewing the data.

In [4]:
data = pd.read_csv('/content/drive/MyDrive/delectateam-nlptrainingexam-aa9ea86fa479/resources/vectorization/corpus.csv', header=None)

In [5]:
data.columns = ['text']

In [6]:
data.shape

(22998, 1)

In [7]:
data.head()

Unnamed: 0,text
0,I did not stay at the hotel but I was horribly...
1,20 mins seated before I got a menu and I was t...
2,The food was excellent but tthe service was te...
3,Although the food was great the staff was awfu...
4,Fantastic pastry shop on La Via Rambla in Barc...


# Functions.

In [9]:
def prepro_text(text):
    '''
    A function that does tokenizing, lowercasing, removing stop words and 
    punctuation and stems a string.
    '''
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer() 

    # instantiate tokenizer class
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                   reduce_len=True)

    # tokenize tweets
    string_tokens = tokenizer.tokenize(text)

    string_clean = []

    # remove stop words and punctuation
    for word in string_tokens:
        if (word not in stopwords_english and
            word not in string.punctuation):
            string_clean.append(word)

    # stemming the tokens
    string_stem = []
    for word in string_clean:
        stem_word = stemmer.stem(word) 
        string_stem.append(stem_word) 
        
    return string_stem

def counter(list_of_toks):
  dic = {}

  for i in list_of_toks:
    if i in dic:
      dic[i] += 1
    else:
      dic[i] = 1
  return dic

  
def tfScore(dic):
  '''Function that converts dictionary of the term frequency per document into 
  tf score dictionary by dividing each count of a term in document by the total 
  number of terms in the document.
  '''
  num_words = sum(dic.values())
  new_dic = {}
  for k in dic:
    new_dic[k] = dic[k]/ num_words
    #new_dic[k] = 1 + np.log(dic[k]/ num_words)
  
  return new_dic

def multValDict(d1, d2):
  '''A function that multiplies the values of the correspondng keys of two 
  dictionaries.'''
  d_new = {}
  for k in d1:
    d_new[k] = d1[k] * d2[k]
  
  return d_new

def tfIdfExtract(lista):
  #create a dictionary of unique terms as keys with the number of documents  
  #where the term if found as their values
  df = {}
  #create an empty dictionary of the term frequency per document:
  per_doc_freqs = {}
  
  for i in range(len(lista)):
    #create d - document (set of pre-proccessed tokens)
    d = prepro_text(lista[i])
    
    #add the index of the document as its key and its value as the dictionary of
    #the words/tokens frequencies within the document to the term frequency per
    #document dictionary
    if len(counter(d)) > 0:
      per_doc_freqs[i] = counter(d)

    for tok in d:
      if tok in df:
        df[tok] += 1
      else:
        df[tok] = 1
   
  #create Tf dictionary by applying tfScore function
  #to each key of the term frequency per document dictionary
  tf = {}

  for k in per_doc_freqs:
    tf[k] = tfScore(per_doc_freqs[k])
    
  #create the number of unique terms across the corpus
  N = len(df)

  #create Inverse Document Frequency dictionary with its keys as unique terms
  #and its values as their inverse document frequency score 
  # (Note that the idf formula above differs from the standard textbook
  # notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ])
  idf_dict = {}
  
  for tok in df.keys():

    # if df[tok] = 1:
    idf = np.log(N / df[tok]) + 1
    # if df[tok] = 2
    #idf = np.log ((1 + N) / (1 + df[tok])) + 1
    
    idf_dict[tok] = idf

  #create TfIdf dictionary by applying multValDict function to Tf and Idf 
  #dictionaries
  tf_idf = {}
  
  for k in tf:
    tf_idf[k] = multValDict(tf[k], idf_dict)

  #create a zero matrix with the number of rows equal to the number of comments
  #and the number of columns equal to the number of unique terms extracted in 
  #df. Fill the matrix with the value of the correspondig  key/term accessing 
  #the corresponding tf_idf dictionary key values

  token_list = list(df.keys())
  token_list.sort()
  total_vocab_size = (len(token_list))

  
  row_num = len(lista)
  
  D = np.zeros((row_num, total_vocab_size))
  for k in tf_idf:
    for token in tf_idf[k]:
      c = token_list.index(token)
      D[k][c] = tf_idf[k][token]
     
  return D, token_list

def cosSim(A, B):
    '''
    The function that takes in two vectors and computes the cosine distance
    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.
    '''    
    dot = np.dot(A,B)
    norma = np.sqrt(sum(A**2))
    normb = np.sqrt(sum(B**2)) 
    cos = dot/(norma*normb)
    
    return cos

def simTex(com_index, sim_matrix, num_of_comments):
  ''' Function that takes the df index of a (original) comment, similarity 
  matrix and the desired number of similar to the (original) comment comments 
  that the function will return.'''

  v = sim_matrix[com_index]
  v_sims = []
  
  for i in range(len(sim_matrix)):
    v_sims.append(cosSim(sim_matrix[i],v))
  df = pd.DataFrame(list(zip(data['text'], v_sims)))
  df.columns = ['comments', 'scores']
  df = df.sort_values(by=['scores'], ascending=False, ignore_index=True)
  
  print('The original comment: ')
  print('                                                        ')
  print(data.text[com_index])
  print('********************************************************')
  print(f'{num_of_comments} most similar coments and their corresponding scores:')
  print('                                                        ')
  df = df[1:num_of_comments]
  
  for i in df.index:
    print(df['comments'][i], df['scores'][i])
  
  return

# Creating TF-IDF matrix

In [10]:
m, l = tfIdfExtract(list(data['text']))

# 10 most similar comments

In [11]:
simTex(0, m, 10)



The original comment: 
                                                        
I did not stay at the hotel but I was horribly treated in their restaurant.
********************************************************
10 most similar coments and their corresponding scores:
                                                        
HORRIBLE !! 0.6004833781355572
Horrible service. 0.548315088475608
horrible italian restaurant! 0.5122495625052131
Treat yourself! 0.4927120274267418
The place was recommended by the hotel we stayed at. 0.49000413602175535
horrible night.... 0.4757284902759183
It was not a horrible meal but also not as good as other restaurants in Barcelona. 0.4544513264976646
This restaurant is a REAL treat! 0.4084227911612561
Treat yourselves and go to this wonderful restaurant! 0.39377106419447483


In [12]:
simTex(15342, m, 10)



The original comment: 
                                                        
However, the steak with pepper sauce is heavenly.
********************************************************
10 most similar coments and their corresponding scores:
                                                        
However, the steak with pepper sauce is heavenly. 1.0000000000000002
sauce. 0.36587343139169975
I had the pork loin with pepper sauce and it was very good. 0.3554825289747775
All steaks come with three sauces. 0.35471100999714067
I had the sirloin steak in pepper sauce which was excellent and the other three in our party had fish and chicken breast. 0.34908046878886395
The bass (with lentils) was heavenly and the wines were delightful. 0.33319401374803576
The second time i had grilled chicken in pepper sauce which was gorgeous. 0.32288787855546497
Good steak (chuletón) and red peppers 0.32078359150125463
My wife had Solomillo Cerdo Pimieta (steak in green pepper sauce). 0.3158305516860939
