# TF-IDF and Word2Vec representation

## TF-IDF implementation

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


![](https://doc-08-5g-docs.googleusercontent.com/docs/securesc/e04elm2ufc7fh57280o6p12sgt2ma7f7/gtdvo6m70a65c4dnrb6edde6f24d0ps6/1658036700000/14961254794536018283/12056060788794650457/1F4gXB_YhEKyJrQjtZ5e7UvzapsfhW1lR?e=view&ax=ACxEAsYDvoHKGg3MADwiQERtLTiTCfr569YuWdCLTqYSyLLKmu1T-f_RAD5JstkhrJuKAmjefoVwba8eHBK3JgCAkigfGtv5tJ4pu1YIsJDlYGEgvFjPdxrO7SvswK89kCJw6ufLUw_6pki_X-m0iSNm3pZtDeT4PAy9ukahZq-P3pfTbj30LSCpCIMOcC_wihBDi7f8KzNQxxe3yfavngeVqLeAyijPW035M8CvPG2k5BlT5lIek-0daFO91-1ZVjApHAXLhHAPZZPZIChrD3sIVR5zFsCJeF0IT9M9NvKXZZ6dMFByp0klju-abTD0T4Vhe0sGnL9ClQIHO5AQ_4jcW8U4-np9PWrSbbZbQZr1xDIJTw6iMfI5maKwn6bIs9Eo41jypbdvmnFgSyG7RZqydUgDU-oYzInvoVnczpkcGRZ3J6AXFJM9cqefaxITv__muVX6nvUvXXaflIbJHawXsyGqBk9yFslu-ZDubKzD3mNE8Bx7jVCCQEGF7wEDx85KxJIxymjuqxY1uLTEO-ftSAneCgQmfroSxoCRXEsCIkrwRDOtnQ0zZcunmJO5AR9POIAwg-9OhwjBXNTrX7OLnjcngSOmYyuEhFJG0PwqLLJLbM7FBwUP1eLnYmwsYJ6osuKAFhZF1sRU88MHnqzuZpQjiqaI7L3-0b9O5zjJF-9Z1MKEACe_QkDcuYomSli6eoCbqHRrluE81HGqyHjcpR7bcouwKKLdXkn2u3BysZEjvSvXYNTFQXDs40KTs0zZuGMyGOyNxADeN9YweYOQevXX&uuid=f771d88e-5b24-4a28-b3f0-e2d9caddf858&authuser=0&nonce=9c9lia8ir72gi&user=12056060788794650457&hash=lva9mhhmieldp7fvdg1q84m479k022jk)

### Reading corpus

In [2]:
def read_corpus(path):
  """Parse a corpus file into parallel lists of document texts and ids.

  A line starting with ``.DID`` opens a new document; its second
  whitespace-separated token is taken as the document id. Lines starting with
  ``date`` or ``Cat`` are treated as metadata and skipped. Every other line is
  stripped and appended to the current document, preceded by a single space.
  Text that appears before the first ``.DID`` line is discarded.

  Args:
    path: Path of the UTF-8 encoded corpus file.

  Returns:
    A ``(documents, dids)`` tuple of two lists that always have the same
    length; both are empty when the file contains no ``.DID`` line.
  """
  documents = []
  dids = []
  # Pieces of the document currently being read; None until the first .DID
  # line so that a header-less file does not yield a document without an id.
  parts = None
  with open(path, 'r', encoding='utf-8') as file:
    for line in file:
      if line.startswith('.DID'):
        if parts is not None:
          documents.append(''.join(parts))
        dids.append(line.split()[1])
        parts = []
      elif line.startswith(('date', 'Cat')):
        continue
      elif parts is not None:
        parts.append(' ' + line.strip())
  # Flush the last open document, if any was started.
  if parts is not None:
    documents.append(''.join(parts))
  return documents, dids

In [3]:
def clean_text(document):
  """Tokenise a document into purely alphabetic words.

  The text is split on single spaces. Zero-width non-joiners and the
  punctuation characters listed below are deleted from each piece, and any
  piece that is not entirely alphabetic afterwards (per ``str.isalpha``) is
  dropped; this includes empty strings and pieces containing digits.

  Args:
    document: Raw document text.

  Returns:
    List of cleaned words in their original order.
  """
  # str.maketrans(a, b, c): a/b form the replacement map (unused here) and
  # every character in c is deleted. The backslash is written as "\\" because
  # a bare backslash before "|" is an invalid escape sequence in a str literal.
  translation_table = str.maketrans('', '', "\u200c><.،؟؛:{}\\|+ـ()*&^٪$#❊!/[]=-")
  stripped = (piece.translate(translation_table) for piece in document.split(' '))
  return [word for word in stripped if word.isalpha()]


In [4]:
# Load the corpus: read_corpus returns the document texts and their ids.
documents,dids = read_corpus('/content/drive/MyDrive/hamshahri.txt')

In [5]:
# Corpus size (number of documents); get_tf_idf reads this global as the
# numerator of the idf ratio N / document-frequency.
N = len(documents)

### Create tf_idf
We could build a full term–document matrix, but it would be very sparse, so we store the counts in nested dictionaries instead.

In [6]:
def create_tf_idf_dics(docs,dids):
  """Count term occurrences per document and index them in both directions.

  Each document is tokenised with ``clean_text`` and every token occurrence is
  counted.

  Args:
    docs: Iterable of raw document strings.
    dids: Iterable of document ids, paired positionally with ``docs``.

  Returns:
    ``(term_doc, doc_term)`` where ``term_doc[term][did]`` and
    ``doc_term[did][term]`` both hold the number of times ``term`` occurs in
    document ``did``.
  """
  by_term = {}
  by_doc = {}
  for text, doc_id in zip(docs, dids):
    # Fresh per-document counter, registered under the document id.
    counts = by_doc[doc_id] = {}
    for token in clean_text(text):
      counts[token] = counts.get(token, 0) + 1
      postings = by_term.setdefault(token, {})
      postings[doc_id] = postings.get(doc_id, 0) + 1
  return by_term, by_doc


In [7]:
# Build both count indexes once; get_tf_idf and the similarity functions read
# them as module-level globals.
term_doc, doc_term = create_tf_idf_dics(documents, dids)

In [8]:
import math
def get_tf_idf(doc,term):
  """Return the tf-idf weight of ``term`` in document ``doc``.

  Reads the module-level ``term_doc`` index and the corpus size ``N``:

  * idf = log2(N / df), where df is the number of documents containing ``term``
  * tf  = 1 + log10(count), where count is the occurrences of ``term`` in ``doc``

  Args:
    doc: Document id, as used for the inner keys of ``term_doc``.
    term: Term to weight.

  Returns:
    ``tf * idf``, or 0.0 when ``term`` is not in the index or does not occur
    in ``doc``.
  """
  # .get() so an unseen term yields weight 0 instead of raising KeyError.
  postings = term_doc.get(term)
  if not postings:
    return 0.0

  count = postings.get(doc, 0)
  if count <= 0:
    return 0.0

  idf = math.log(N / len(postings), 2)
  tf = 1 + math.log(count, 10)
  return tf * idf
    

In [9]:
# Example: tf-idf weight of one term in document '29'.
get_tf_idf('29', 'تو')

1.0793131635624436

### Calculate similarity using cosine formula

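![cosine formula](https://drive.google.com/file/d/1lgCPHvEWUlmnlr1sCzPgOs6dAFo7rUds/view)

$$\cos(\vec{a}, \vec{b}) = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}$$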

This function calculates the cosine similarity between two documents, given their ids, using their tf-idf term vectors.

In [10]:
import numpy as np
def similarity_doc(doc_1,doc_2):
  """Cosine similarity between two documents in tf-idf term space.

  Reads the module-level ``doc_term`` index and weights each term with
  ``get_tf_idf``.

  Args:
    doc_1: Id of the first document.
    doc_2: Id of the second document.

  Returns:
    The cosine of the angle between the two tf-idf vectors. Returns 0 (after
    printing a message) when either id is unknown, and 0.0 when either vector
    has zero length, e.g. an empty document, for which the cosine is undefined.
  """
  if doc_1 not in doc_term or doc_2 not in doc_term:
    print('Invalid doc number')
    return 0

  # Compute every weight once; it is needed for both the dot product and the norm.
  weights_1 = {term: get_tf_idf(doc_1, term) for term in doc_term[doc_1]}
  weights_2 = {term: get_tf_idf(doc_2, term) for term in doc_term[doc_2]}

  doc_product = sum(
      weight * weights_2[term]
      for term, weight in weights_1.items()
      if term in weights_2
  )
  length_doc_1 = sum(math.pow(weight, 2) for weight in weights_1.values())
  length_doc_2 = sum(math.pow(weight, 2) for weight in weights_2.values())

  denominator = np.sqrt(length_doc_2) * np.sqrt(length_doc_1)
  # Guard against 0/0, which would otherwise produce nan and a RuntimeWarning.
  if denominator == 0:
    return 0.0
  return doc_product / denominator

In [11]:
# Example: cosine similarity between documents '29' and '25'.
similarity_doc('29', '25')

0.9678955789152439

This function calculates the cosine similarity between two terms, using their tf-idf weights across documents as vectors.

In [12]:
def similarity_term(term_1,term_2):
  """Cosine similarity between two terms in tf-idf document space.

  Reads the module-level ``term_doc`` index and weights each document with
  ``get_tf_idf``.

  Args:
    term_1: First term.
    term_2: Second term.

  Returns:
    The cosine of the angle between the two tf-idf vectors. Returns 0 (after
    printing a message) when either term is unknown, and 0.0 when either
    vector has zero length, e.g. a term whose idf is 0, for which the cosine
    is undefined.
  """
  if term_1 not in term_doc or term_2 not in term_doc:
    print('Invalid term ')
    return 0

  # Compute every weight once; it is needed for both the dot product and the norm.
  weights_1 = {doc: get_tf_idf(doc, term_1) for doc in term_doc[term_1]}
  weights_2 = {doc: get_tf_idf(doc, term_2) for doc in term_doc[term_2]}

  term_product = sum(
      weight * weights_2[doc]
      for doc, weight in weights_1.items()
      if doc in weights_2
  )
  length_term_1 = sum(math.pow(weight, 2) for weight in weights_1.values())
  length_term_2 = sum(math.pow(weight, 2) for weight in weights_2.values())

  denominator = np.sqrt(length_term_2) * np.sqrt(length_term_1)
  # Guard against 0/0, which would otherwise produce nan and a RuntimeWarning.
  if denominator == 0:
    return 0.0
  return term_product / denominator

In [13]:
# Example: cosine similarity between two terms.
similarity_term('عکس', 'تکی')

0.991786202274434

## Word2Vec by Gensim

In [14]:
from gensim.models import Word2Vec

In [15]:
# Build the input for the gensim model: a list of token lists.
# Each whole document is passed as one "sentence", tokenised by clean_text.
sentences = [clean_text(doc) for doc in documents]

In [16]:
# Word2Vec hyper-parameters used below:
# sg: training algorithm; 1 selects skip-gram, any other value CBOW.
# min_count: words with total frequency lower than this are ignored (1 keeps every word).
# window: maximum distance between the current and predicted word within a sentence.
# size: dimensionality of the word vectors the model creates.
# NOTE(review): gensim 4.x renamed `size` to `vector_size`; this call presumably
# targets gensim 3.x — confirm the installed version.
# NOTE(review): no `seed` is passed, so results may differ between runs — confirm
# whether reproducibility matters here.
model = Word2Vec(sentences, min_count=1, size=100, window=10, sg=1)

In [17]:
# Persist the trained model to Google Drive.
model.save("/content/drive/MyDrive/word2vec.model")

In [18]:
# Load the model back from its Drive path, replacing the in-memory `model`.
model = Word2Vec.load("/content/drive/MyDrive/word2vec.model")

In [19]:
# Continue training the loaded model on extra sentences; here a single
# two-word sentence for one epoch. More token lists can be added to the outer
# list (total_examples should then match the number of sentences passed).
model.train([["گیاه", "روزنامه"]], total_examples=1, epochs=1)

(1, 2)

In [20]:
# Look up the embedding vector learned for a word; the bare last line displays it.
vector = model.wv['سلفی']
vector

array([ 1.5813672e-03,  1.2250592e-02,  3.5644283e-03, -6.1967759e-03,
       -2.9380275e-03,  8.4605999e-03, -2.3571576e-03,  4.0819962e-03,
       -5.3468300e-03, -3.2291436e-03,  1.3192509e-03,  2.7267467e-03,
       -2.9795372e-03,  2.2962620e-03, -6.0755676e-03, -1.9646070e-03,
        1.0152564e-02,  1.0037105e-02,  9.7313646e-04,  9.3703670e-03,
       -1.2592118e-03, -9.1491302e-04, -7.2104260e-03, -5.2829175e-03,
        5.2133501e-03,  8.7235766e-03,  5.4762615e-03, -4.0895962e-03,
        2.4464112e-03,  7.3297778e-03,  2.4565714e-03,  7.5403382e-03,
        3.9700801e-03, -2.5619094e-03,  8.7603945e-03, -2.3964923e-03,
       -5.8297841e-03,  2.9495517e-03,  6.9962344e-03, -5.5420736e-04,
        1.5369299e-03,  1.5282126e-03, -8.0993408e-03,  1.8180669e-03,
       -3.5012821e-03, -5.7726218e-03,  4.0564132e-03, -6.2498306e-03,
       -4.8711358e-05, -6.0710073e-03,  2.9051770e-03, -8.9472355e-03,
        8.4255552e-03, -1.2638964e-03,  3.4579483e-04,  1.5046262e-03,
      

In [21]:
# Fetch the topn=5 words most similar to the query word in the embedding
# space, as (word, score) pairs.
sims = model.wv.most_similar('سلفی',topn=5)
print(sims)

[('تو', 0.8157689571380615), ('من', 0.8078559637069702), ('عه', 0.8005035519599915), ('کنار', 0.7989203929901123), ('برات', 0.789409875869751)]


### Evaluate 

In [22]:
# Print the TF-IDF cosine similarity for five hand-picked term pairs,
# one value per line.
for first_term, second_term in (
    ('قشنگترین', 'سلفی'),
    ('سلفی', 'کنار'),
    ('سلفی', 'منم'),
    ('سلفی', 'یه'),
    ('سلفی', 'که'),
):
  print(similarity_term(first_term, second_term))

0.9842354972667257
1.0
0.816496580927726
1.0
0.9183257598178847
