In [None]:
import os 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 
import math
from deep_translator import GoogleTranslator
from indicnlp.tokenize import indic_tokenize

In [None]:
# Load the Hindi stop-word list: one stop word per line of the file.
# The context manager closes the handle once the list is built (the previous
# version left it open for the lifetime of the kernel).
# NOTE(review): assumes ../utils/hindi.txt is UTF-8 encoded — confirm. Without
# an explicit encoding the platform default is used, which can mis-decode
# Devanagari text on some systems.
with open('../utils/hindi.txt', encoding='utf-8') as stop:
    # Only the trailing newline is stripped; other whitespace is kept as-is.
    hindi_stop_words = [x.strip("\n") for x in stop]

def tokenize_hindi(sentence):
    """Tokenize a sentence and drop every token found in ``hindi_stop_words``.

    The sentence is split with ``indic_tokenize.trivial_tokenize`` using
    ``lang='hi'``; the surviving tokens are returned as a list in their
    original order.
    """
    kept = []
    for candidate in indic_tokenize.trivial_tokenize(sentence, lang='hi'):
        # Skip stop words; everything else is passed through untouched.
        if candidate in hindi_stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
def extractTokens(df):
    """Attach a ``'tokens'`` column to ``df`` and return the same frame.

    For each row, every entry of ``row['sentences']`` is run through
    ``tokenize_hindi``; the resulting list of token lists becomes that row's
    ``'tokens'`` value. The frame is modified in place.
    """
    per_row_tokens = []
    for sentence_list in df['sentences']:
        # One token list per sentence, kept in sentence order.
        per_row_tokens.append(list(map(tokenize_hindi, sentence_list)))
    df['tokens'] = per_row_tokens
    return df

In [None]:
def _tfidf_weights(tokens):
    """Return one ``{term: tf * idf}`` dict per tokenized sentence.

    Terms are lower-cased once here, so counting and every later lookup use
    the same key.
    """
    sentence_count = len(tokens)

    # Per-sentence raw term counts.
    term_counts = []
    for sentence_tokens in tokens:
        counts = {}
        for word in sentence_tokens:
            word = word.lower()
            counts[word] = counts.get(word, 0) + 1
        term_counts.append(counts)

    # Number of sentences in which each term occurs.
    sentence_freq = {}
    for counts in term_counts:
        for word in counts:
            sentence_freq[word] = sentence_freq.get(word, 0) + 1

    # A term present in every sentence gets idf == 0 and so carries no weight.
    idf = {
        word: math.log10(sentence_count / float(freq))
        for word, freq in sentence_freq.items()
    }

    weights = []
    for counts in term_counts:
        # TF is the count divided by the number of distinct terms in the
        # sentence; this per-sentence constant cancels out in the cosine.
        distinct_terms = len(counts)
        weights.append(
            {word: (count / distinct_terms) * idf[word] for word, count in counts.items()}
        )
    return weights


def summarize_lexRank(sentences,tokens,threshold=0.1,epsilon=0.1):
    """Pick the most central sentences of a document with a LexRank-style walk.

    Steps:
      1. Build a TF-IDF weight vector per sentence (each sentence is treated
         as a "document" for the IDF).
      2. Compute the cosine similarity between every pair of sentence vectors.
      3. Keep an edge wherever the similarity exceeds ``threshold`` and
         row-normalise the resulting 0/1 matrix by each sentence's degree.
      4. Run power iteration from a uniform vector until the change between
         two iterates is at most ``epsilon``.
      5. Return the sentences whose score is at least the mean score.

    Args:
        sentences: Sequence indexable by position, aligned with ``tokens``
            (``sentences[i]`` is the text for ``tokens[i]``).
        tokens: List of token lists, one per sentence.
        threshold: Similarity above which two sentences are linked.
        epsilon: Convergence tolerance (Euclidean norm of the update) for the
            power iteration.

    Returns:
        List of the selected entries of ``sentences``, in their original
        order. An empty ``tokens`` yields an empty list.
    """
    ts = len(tokens)
    if ts == 0:
        # Nothing to rank; also avoids dividing by zero for the uniform start.
        return []

    weights = _tfidf_weights(tokens)
    # Vector norms are taken over distinct terms, so that the denominator
    # matches the numerator and a sentence's similarity with itself is 1.
    norms = [math.sqrt(sum(w * w for w in table.values())) for table in weights]

    matrix = np.zeros((ts, ts))
    for i in range(ts):
        if norms[i] == 0:
            continue  # all-zero vector: similarity stays 0.0
        for j in range(i, ts):
            if norms[j] == 0:
                continue
            other = weights[j]
            dot = sum(w * other[term] for term, w in weights[i].items() if term in other)
            # Cosine similarity is symmetric; fill both halves at once.
            matrix[i, j] = matrix[j, i] = dot / (norms[i] * norms[j])

    # Threshold to a 0/1 adjacency matrix and normalise each row by its degree.
    adjacency = (matrix > threshold).astype(float)
    degrees = adjacency.sum(axis=1)
    degrees[degrees == 0] = 1  # isolated sentences: avoid dividing by zero
    transposed_matrix = (adjacency / degrees[:, np.newaxis]).T

    p_vector = np.array([1.0 / ts] * ts)
    lambda_val = 1.0
    # Safety cap: on a periodic graph (e.g. no self-loops) the iterates can
    # oscillate and never get within epsilon of each other.
    max_iterations = 10000
    iterations = 0
    while lambda_val > epsilon and iterations < max_iterations:
        next_p = np.dot(transposed_matrix, p_vector)
        lambda_val = np.linalg.norm(np.subtract(next_p, p_vector))
        p_vector = next_p
        iterations += 1

    avg = np.sum(p_vector) / len(p_vector)
    return [sentences[i] for i in range(ts) if p_vector[i] >= avg]

In [None]:
# Driver: read the dataset, summarise every row with LexRank, write the result.
input_path="../data/"
output_path="../results/summary/"
# makedirs also creates missing parent directories (os.mkdir fails when
# ../results does not exist yet) and is a no-op if the folder is already there.
os.makedirs(output_path, exist_ok=True)

data = pd.read_csv(f'{input_path}data.csv')
# NOTE(review): eval executes arbitrary Python found in the CSV. Presumably the
# 'sentences' column holds the repr of a list of strings; if so,
# ast.literal_eval would parse it without executing code — confirm and switch.
data['sentences'] = data['sentences'].apply(eval)
sentences = data['sentences']
data = extractTokens(data)
tokens = data['tokens']
# Pair rows positionally instead of indexing the Series by label, so the loop
# does not depend on the frame having a default 0..n-1 index.
summary = [
    summarize_lexRank(row_sentences, row_tokens)
    for row_sentences, row_tokens in tqdm(zip(sentences, tokens), total=len(sentences))
]

data['summary'] = summary

data.to_csv(f'{output_path}lexrank.csv', index=False)