In [None]:
# Mount Google Drive at /content/drive; the corpus file and the saved word2vec
# model used later in this notebook are read from / written to paths under it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### This video covers:


*   Implementation of TF-IDF representation
*   Word2vec representation using the gensim library


### Part 1: Implementation of the TF-IDF representation

![](https://drive.google.com/uc?export=view&id=1F4gXB_YhEKyJrQjtZ5e7UvzapsfhW1lR)


In [None]:
def read_document(path):
  """Read the 'Hamshahri' corpus file and split it into documents.

  The file is parsed line by line:
    * the first line, and every later line starting with ".DID", carries a
      document id as its second whitespace-separated token and starts a new
      document;
    * lines starting with ".Date" or ".Cat" are metadata and are skipped;
    * every other line is stripped and appended (space-separated) to the text
      of the current document.

  Args:
    path: path of the UTF-8 encoded corpus file.

  Returns:
    (docs, DIDs): two parallel lists, the document texts and their ids.
    An empty file yields ([], []).

  Raises:
    IndexError: if the first line or a ".DID" line has no id token.
  """
  with open(path, 'r', encoding='utf-8') as ptr:
    lines = ptr.readlines()

  # An empty file has no first ".DID" line to read an id from.
  if not lines:
    return [], []

  docs = []
  DIDs = [lines[0].split()[1]]

  # The first document's accumulator starts with a single space (later ones
  # start empty); kept as-is so the returned strings stay unchanged.
  current_text = ' '

  for line in lines[1:]:
    if line.startswith(".DID"):
      # A new document begins: flush the text gathered so far.
      docs.append(current_text)
      current_text = ''
      DIDs.append(line.split()[1])
    elif line.startswith((".Date", ".Cat")):
      # Metadata lines are not part of the document text.
      continue
    else:
      current_text = current_text + ' ' + line.strip()

  # Flush the last document, which is not followed by another ".DID" line.
  docs.append(current_text)

  return docs, DIDs

In [None]:
# Translation table that deletes the punctuation marks below from a token.
# It is built once instead of on every call.  The backslash is written as "\\"
# because "\|" is an invalid escape sequence in a normal string literal
# (it only works by accident and triggers a warning on recent Python versions).
_PUNCTUATION_TABLE = str.maketrans('', '', "><.،؟؛:{}\\|+ـ()*&^٪$#❊!/[]=-")


def clean_text(doc):
  """Tokenize a document and keep only its alphabetic words.

  The document is split on whitespace, the punctuation marks listed in
  _PUNCTUATION_TABLE are removed from every token, and tokens that are not
  purely alphabetic afterwards (per str.isalpha, so empty tokens and tokens
  containing digits are dropped) are discarded.

  Args:
    doc: the document text.

  Returns:
    The list of remaining words, in their original order.
  """
  stripped_tokens = (word.translate(_PUNCTUATION_TABLE) for word in doc.split())
  return [word for word in stripped_tokens if word.isalpha()]

In [None]:
# Load the corpus from the mounted Drive: `docs` holds the document texts and
# `DIDs` the matching document ids (parallel lists).
docs, DIDs = read_document('/content/drive/MyDrive/dataset/Hamshahri-Corpus.txt')

In [None]:
# N: total number of documents in the corpus; get_IDF reads this global as the
# numerator of N/df.
N = len(docs)

In [None]:

def create_tf_idf_dicts(docs, DIDs):
  """Count term occurrences and index them both by term and by document.

  Every document is tokenized with clean_text; each resulting term increments
  its counter in both tables.

  Args:
    docs: list of document texts.
    DIDs: list of document ids, parallel to `docs`.

  Returns:
    (term_doc, doc_term) where
      term_doc[T][D] = number of times term T appeared in document D
      doc_term[D][T] = number of times term T appeared in document D
  """
  term_doc = {}
  doc_term = {}

  for doc_id, text in zip(DIDs, docs):
    # Fresh per-document counter; registered up front so that documents
    # without any valid term still get an (empty) entry.
    counts_in_doc = {}
    doc_term[doc_id] = counts_in_doc

    for term in clean_text(text):
      # document -> term direction
      counts_in_doc[term] = counts_in_doc.get(term, 0) + 1

      # term -> document direction
      docs_of_term = term_doc.setdefault(term, {})
      docs_of_term[doc_id] = docs_of_term.get(doc_id, 0) + 1

  return term_doc, doc_term

In [None]:
# Build the two global count tables (term -> doc -> count, doc -> term -> count)
# that the TF-IDF and cosine-similarity functions below read.
term_doc, doc_term = create_tf_idf_dicts(docs, DIDs)

In [None]:
import math

def get_tf_idf(term, doc):
  """Return the TF-IDF weight of `term` within document `doc`.

  The weight is tf * idf with tf = 1 + log10(count), where count is the number
  of occurrences of the term in the document (read from the global `term_doc`)
  and idf comes from get_IDF.

  Args:
    term: the term.
    doc: the document id.

  Returns:
    The TF-IDF weight, or 0.0 when the term does not occur in the document
    (including terms that are not in the vocabulary at all).
  """
  count_t_d = term_doc.get(term, {}).get(doc, 0)

  # A term that is absent from the document has zero weight; log(0) is
  # undefined, so this case must not reach the tf formula.
  if count_t_d == 0:
    return 0.0

  tf = 1 + math.log(count_t_d, 10)

  return tf * get_IDF(term)


In [None]:
def get_IDF(term):
  """Return the IDF weight of `term`: log2(N / df).

  N is the global number of documents and df the number of documents that
  contain the term.  df is read as the size of term_doc[term], which holds one
  entry per document the term occurred in; this matches a scan over doc_term
  as long as document ids are unique, and avoids visiting every document on
  each call.

  Args:
    term: the term.

  Returns:
    The IDF weight, or 0.0 when the term occurs in no document.
  """
  df = len(term_doc.get(term, ()))

  # An unseen term has no document frequency; return a neutral weight instead
  # of dividing by zero.
  if df == 0:
    return 0.0

  return math.log((N / df), 2)

In [None]:
# Example: IDF weight of a single term of the corpus
get_IDF('تهران')

1.8190675501480003

![](https://drive.google.com/uc?export=view&id=1BpsCwRd2cee7lGHWXarOrtCxKu8a-f3B)



In [None]:
import numpy as np

def cosine_sim_of_docs(doc_1, doc_2):
  """Return the cosine similarity of two documents' TF-IDF vectors.

  Each document is represented by the TF-IDF weights (get_tf_idf) of the terms
  it contains according to the global `doc_term`.

  Args:
    doc_1: id of the first document.
    doc_2: id of the second document.

  Returns:
    The cosine similarity.  0 is returned (after printing a message) when
    either id is unknown, and 0.0 when either vector has zero length, for
    which the cosine is undefined.
  """
  if not (doc_1 in doc_term and doc_2 in doc_term):
    print('Invalid document number')
    return 0

  # Compute every weight exactly once; it is needed for both the dot product
  # and the vector length.
  weights_1 = {term: get_tf_idf(term, doc_1) for term in doc_term[doc_1]}
  weights_2 = {term: get_tf_idf(term, doc_2) for term in doc_term[doc_2]}

  # Only terms shared by both documents contribute to the dot product.
  dot_product = sum(
      weight * weights_2[term]
      for term, weight in weights_1.items()
      if term in weights_2
  )

  length_doc_1 = np.sqrt(sum(weight * weight for weight in weights_1.values()))
  length_doc_2 = np.sqrt(sum(weight * weight for weight in weights_2.values()))

  # An empty document (or one whose terms all have zero weight) would lead to
  # 0/0 = nan; report "no similarity" instead.
  if length_doc_1 == 0 or length_doc_2 == 0:
    return 0.0

  return dot_product / (length_doc_1 * length_doc_2)

In [None]:
# Example: TF-IDF cosine similarity of two documents, addressed by their ids
cosine_sim_of_docs('244S1', '279S1')

0.01656446971653419

In [None]:
import numpy as np

def cosine_sim_of_words(term_1, term_2):
  """Return the cosine similarity of two terms' TF-IDF vectors.

  Each term is represented by its TF-IDF weights (get_tf_idf) in the documents
  it occurs in according to the global `term_doc`.

  Args:
    term_1: the first term.
    term_2: the second term.

  Returns:
    The cosine similarity.  0 is returned (after printing a message) when
    either term is not in the vocabulary, and 0.0 when either vector has zero
    length, for which the cosine is undefined.
  """
  if not (term_1 in term_doc and term_2 in term_doc):
    print('out of vocab word!')
    return 0

  # Compute every weight exactly once; it is needed for both the dot product
  # and the vector length.
  weights_1 = {doc: get_tf_idf(term_1, doc) for doc in term_doc[term_1]}
  weights_2 = {doc: get_tf_idf(term_2, doc) for doc in term_doc[term_2]}

  # Only documents containing both terms contribute to the dot product.
  dot_product = sum(
      weight * weights_2[doc]
      for doc, weight in weights_1.items()
      if doc in weights_2
  )

  length_term_1 = np.sqrt(sum(weight * weight for weight in weights_1.values()))
  length_term_2 = np.sqrt(sum(weight * weight for weight in weights_2.values()))

  # A term whose weights are all zero would lead to 0/0 = nan; report
  # "no similarity" instead.
  if length_term_1 == 0 or length_term_2 == 0:
    return 0.0

  return dot_product / (length_term_1 * length_term_2)

### Part 2: Word2vec with Gensim



In [None]:
# build the training input of the gensim word2vec model:
# one list of cleaned tokens per document

sentences = [clean_text(doc) for doc in docs]

In [None]:
from gensim.models import Word2Vec

# sg: Training algorithm: 1 for skip-gram; otherwise CBOW.
# min_count: Ignores all words with total frequency lower than this.
# window: Maximum distance between the current and predicted word within a sentence.
# size: Dimensionality of the word vectors.
#
# Here: skip-gram, 100-dimensional vectors, a window of 10, and min_count=1 so
# that no word is dropped for being rare.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- confirm against the installed gensim version.

model = Word2Vec(sentences, min_count=1, size=100, window=10, sg=1)

In [None]:
# save the trained model to Google Drive so it can be reloaded later
model.save("/content/drive/MyDrive/models/word2vec.model")

In [None]:
from gensim.models import Word2Vec
# load the previously saved model from Google Drive (rebinds `model`)
model = Word2Vec.load("/content/drive/MyDrive/models/word2vec.model")

In [None]:
# continue training the loaded model on one additional two-word sentence
# (a single example, for a single epoch)
model.train([["همشهری", "روزنامه"]], total_examples=1, epochs=1)

(1, 2)

In [None]:
# get the vector representation of a single term from the trained model
vector = model.wv['خيابان'] 

In [None]:
# show the term's embedding as the cell output
vector

array([ 4.7829500e-01, -3.3936438e-01,  8.2576412e-01,  3.7145108e-01,
        6.7576092e-01,  5.8686972e-02,  4.8742581e-02,  1.2406912e-01,
       -4.3891722e-01,  7.0365928e-02, -3.4964731e-01, -1.3779813e-01,
        1.6468327e-01,  2.4502856e-01,  3.5093248e-01, -3.2618856e-01,
        5.9894842e-01,  1.0751355e+00,  1.6326882e-02, -1.5088303e-01,
        1.6317840e-01, -1.2954479e-01,  3.6789209e-01, -5.5537474e-01,
        1.9381578e-01, -1.3732421e-01, -3.4485805e-01,  1.7315307e-01,
        3.7723914e-01, -6.6332656e-01,  3.6547026e-01,  3.6495346e-01,
        2.6299605e-01,  2.7699494e-01,  5.5166113e-01, -3.7951469e-02,
        2.4369428e-02, -1.6798066e-01,  4.6101886e-01, -3.3071759e-01,
       -8.0350757e-01, -1.4511347e-01, -4.4636783e-01, -8.4169728e-01,
        3.7407890e-01, -6.7757732e-01,  3.2781810e-02,  1.1740174e-01,
       -3.2059025e-02,  2.0361900e-01,  1.8284015e-02,  5.0743729e-01,
       -7.9127812e-01,  1.4968026e-01, -3.1331035e-01,  6.5365329e-02,
      

In [None]:
# get the 5 terms most similar to the input term (topn=5);
# the result is a list of (term, similarity) pairs
sims = model.wv.most_similar('خيابان', topn=5)
print(sims)

[('چهارراه', 0.8367165327072144), ('وليعصر', 0.8330073356628418), ('كوچه', 0.8281815052032471), ('ميرداماد', 0.8225080370903015), ('ضلع', 0.8036145567893982)]


In [None]:
# print the cosine similarity between the TF-IDF representation of the query
# term and that of each compared term, one value per line

for compared_term in ['كوچه', 'چهارراه', 'وليعصر', 'مطهري', 'تقاطع']:
  print(cosine_sim_of_words('خيابان', compared_term))

0.37874202264156015
0.287817732181497
0.2704187837993078
0.25563191693756854
0.19136493917027125
