In [2]:
import numpy as np
import pandas as pd
import nltk

In [3]:
#Download requirements
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
data = pd.read_csv('Data/digikala_comment.csv')
data

Unnamed: 0,comment
0,نسبت به قیمتش ارزش خرید داره\nجاداره، طراحیش ق...
1,چند ماهی میشه که گرفتمش‌. برای برنامه نویسی و ...
2,پراید ستون جدید
3,اقا همه چیش خوبه فقط از پایین زیاد حاشیه داره ...
4,گوسی هو اوی p10 lite سیپیو و دوربین و رمش از ا...
...,...
235,پوشش دهی صفر.اصلا پیشنهاد نمیکنم
236,نصب این فن خیلی راحته و دردسر زیادی نداره درض...
237,بی کیفیت
238,سلام ٬ چندماهی میشه این پاور بانک رو تهیه کردم...


In [5]:
# Sentence Tokenizing
from nltk.tokenize import sent_tokenize

def sentence_tokenizing(text):
    return sentence_tokenizing(text)

data['comment'] = data.apply(lambda text:sent_tokenize(text['comment']), axis=1)
data

Unnamed: 0,comment
0,[نسبت به قیمتش ارزش خرید داره\nجاداره، طراحیش ...
1,"[چند ماهی میشه که گرفتمش‌., برای برنامه نویسی ..."
2,[پراید ستون جدید]
3,[اقا همه چیش خوبه فقط از پایین زیاد حاشیه داره...
4,[گوسی هو اوی p10 lite سیپیو و دوربین و رمش از ...
...,...
235,[پوشش دهی صفر.اصلا پیشنهاد نمیکنم]
236,[نصب این فن خیلی راحته و دردسر زیادی نداره در...
237,[بی کیفیت]
238,[سلام ٬ چندماهی میشه این پاور بانک رو تهیه کرد...


In [6]:
import re
import string

def clean_fa_text(text):
    #removing english characters and signs
    text = ''.join([i for i in text if not ((65 <= ord(i) <91)
                                            or (97 <= ord(i) < 123)
                                            or (48 <= ord(i) < 58))])

    #removing nbsp
    text_list = []
    for char in text:
        if ord(char) == 160:
            text_list.append(' ')
            continue
        text_list.append(char)
    text = ''.join(text_list)

    # removing sign
    text = ''.join([i for i in text if ord(i) not in [33, 34, 35, 36, 37, 38,
                                                      39, 40, 41, 42, 43, 44,
                                                      45, 46, 47, 58, 59, 60,
                                                      61, 62, 63, 64, 91, 92,
                                                      93, 94, 95, 96, 123, 124,
                                                      125, 126, 1548, 1567]])
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def apply_clean(list_sentence: list):
    new_list = []
    for sent in list_sentence:
        new_list.append(clean_fa_text(sent))
    return new_list

data['comment'] = data['comment'].apply(apply_clean)
data

Unnamed: 0,comment
0,[نسبت به قیمتش ارزش خرید دارهجاداره طراحیش قشن...
1,"[چند ماهی میشه که گرفتمش‌, برای برنامه نویسی و..."
2,[پراید ستون جدید]
3,[اقا همه چیش خوبه فقط از پایین زیاد حاشیه داره...
4,[گوسی هو اوی سیپیو و دوربین و رمش از این خیل...
...,...
235,[پوشش دهی صفراصلا پیشنهاد نمیکنم]
236,[نصب این فن خیلی راحته و دردسر زیادی نداره در...
237,[بی کیفیت]
238,[سلام ٬ چندماهی میشه این پاور بانک رو تهیه کرد...


In [None]:
#text normalizing
from hazm import Normalizer

def apply_normalizer(list_sentence: list):
    new_list = []
    norm = Normalizer()
    for sent in list_sentence:
        new_list.append(norm.normalize(sent))
    return new_list

data['comment'] = data['comment'].apply(apply_normalizer)
data

In [None]:
#concat all sentences
def append_all(data, column: str):
    all_sentences = []
    for sent in data[column]:
        all_sentences.extend(sent)
    return all_sentences

all_sentences = append_all(data, 'comment')
all_sentences

In [None]:
#splite sentence wrt words

def split_sentences(data: list):
    dat=[]
    for i in range(len(data)):
        for word in data[i].split():
            dat.append(word)
    return dat

all_words = split_sentences(all_sentences)
all_words

In [None]:
#unigram model

def create_unigram(data):
    list_of_unigrams = []
    unigram_counts = {}

    for i in range(len(data)):
        list_of_unigrams.append(data[i])

        if data[i] not in unigram_counts:
            unigram_counts[data[i]] = 0
        unigram_counts[data[i]] += 1


    return list_of_unigrams, unigram_counts

def unigram_probability(list_of_unigram, unigram_counts: dict, k: int=1):
    prob_list = {}
    len_all_words = sum(unigram_counts.values())

    for unigram in list_of_unigram:
        prob_list[unigram] = (unigram_counts.get(unigram) + k)/(len_all_words + k*len_all_words)
    return prob_list

list_of_unigram, first_unigram_counts = create_unigram(all_words)
unigram_prob = unigram_probability(list_of_unigram, first_unigram_counts)

In [None]:
# bigram model

def create_bigram(data):
    list_of_bigrams = []
    bigram_counts = {}
    unigram_counts = {}

    for i in range(len(data)-1):
        if i < len(data) - 1 and data[i+1]:
            list_of_bigrams.append((data[i], data[i + 1]))

            if (data[i], data[i+1]) in bigram_counts:
                bigram_counts[(data[i], data[i + 1])] += 1
            else:
                bigram_counts[(data[i], data[i + 1])] = 1

        if data[i] in unigram_counts:
            unigram_counts[data[i]] += 1
        else:
            unigram_counts[data[i]] = 1

    return list_of_bigrams, unigram_counts, bigram_counts


def bigram_probability(list_of_bigrams, unigram_counts, bigram_counts, k: int=1):
    prob_list = {}
    for bigram in list_of_bigrams:
        word1 = bigram[0]
        prob_list[bigram] = (bigram_counts.get(bigram))/(unigram_counts.get(word1) + len(unigram_counts)*k)
    return prob_list


def get_bigram(sentence: str):
    splt = sentence.split()
    bilist = []

    for i in range(len(splt) - 1):
        bilist.append((splt[i], splt[i + 1]))

    return bilist


list_of_bigrams, unigram_counts, bigram_counts = create_bigram(all_words)
bigram_prob = bigram_probability(list_of_bigrams, unigram_counts, bigram_counts)

In [None]:
#trigram model

def create_trigram(data):
    list_of_trigrams = []
    trigram_counts = {}
    bigram_counts = {}


    for i in range(len(data)-2):
        if i < len(data) - 2 and data[i+2]:
            list_of_trigrams.append((data[i], data[i + 1], data[i + 2]))

            if (data[i], data[i + 1], data[i + 2]) in trigram_counts:
                trigram_counts[(data[i], data[i + 1], data[i + 2])] += 1
            else:
                trigram_counts[(data[i], data[i + 1], data[i + 2])] = 1

        if (data[i], data[i+1]) in bigram_counts:
            bigram_counts[(data[i], data[i + 1])] += 1
        else:
            bigram_counts[(data[i], data[i + 1])] = 1

    return list_of_trigrams, bigram_counts, trigram_counts


def trigram_probability(list_of_trigrams, bigram_counts, trigram_counts, k: int=1):
    prob_list = {}
    for trigram in list_of_trigrams:
        word1 = trigram[0]
        word2 = trigram[1]
        prob_list[trigram] = (trigram_counts.get(trigram))/(bigram_counts.get((word1, word2)) + len(bigram_counts)*k)
    return prob_list

def get_trigram(sentence: str):
    splt = sentence.split()
    bilist = []

    for i in range(len(splt) - 2):
        bilist.append((splt[i], splt[i + 1], splt[i + 2]))

    return bilist

list_of_trigrams, bigram_counts, trigram_counts = create_trigram(all_words)
trigram_prob = trigram_probability(list_of_trigrams, bigram_counts, trigram_counts)

In [None]:
#most probable unigram
most_prob_unigram = pd.DataFrame(unigram_prob.items(), columns=['unigram', 'probability']).sort_values(by='probability', ascending=False)[:8]
most_prob_unigram

In [None]:
#most probable bigrams
most_prob_bigram = pd.DataFrame(bigram_prob.items(), columns=['bigram', 'probability']).sort_values(by='probability', ascending=False)[:8]
most_prob_bigram

In [None]:
#most probable bigrams
most_prob_trigram = pd.DataFrame(trigram_prob.items(), columns=['trigram', 'probability']).sort_values(by='probability', ascending=False)[:8]
most_prob_trigram

In [None]:
#probability calculation of test set

from hazm import Normalizer

def get_probability(list_sentence: list, ngram_probability):
    probability = 1
    for i in range(len(list_sentence)):
        if list_sentence[i] in ngram_probability:
            probability *= ngram_probability[list_sentence[i]]
        else:
            probability *= 0.00001
    return probability

def get_perplexity(list_sentence: list, ngram_probability):
    probability = 1
    for i in range(len(list_sentence)):
        if list_sentence[i] in ngram_probability:
            probability *= ngram_probability[list_sentence[i]]
        else:
            probability *= 0.00001

    m = len(list_sentence)
    perplexity = np.power(1/probability, 1/m)
    return perplexity


sentence1 = 'این لپتاپ سخت افزار خیلی قوی داره و از پس هرکاری به راحتی برمیاد'
sentence2 = 'این ساعت بسیار زیبا طراحی و ساخته شده'
sentence3 = 'یک محصول باکیفیت ایرانی که حقیقتا جای حمایت داره'
sentence4 = 'بوش و ماندگاری خوب هست من خیلی دوستش دارم'


normalizer = Normalizer()

sent1_unigram = sentence1.split()
sent1_bigram = get_bigram(normalizer.normalize(sentence1))
sent1_trigram = get_trigram(normalizer.normalize(sentence1))
sent1_unigram_prob = get_probability(sent1_unigram, unigram_prob)
sent1_bigram_prob = get_probability(sent1_bigram, bigram_prob)
sent1_trigram_prob = get_probability(sent1_trigram, trigram_prob)
sent1_unigram_perplexity = get_perplexity(sent1_unigram, unigram_prob)
sent1_bigram_perplexity = get_perplexity(sent1_bigram, bigram_prob)
sent1_trigram_perplexity = get_perplexity(sent1_trigram, trigram_prob)


sent2_unigram = sentence2.split()
sent2_bigram = get_bigram(normalizer.normalize(sentence2))
sent2_trigram = get_trigram(normalizer.normalize(sentence2))
sent2_unigram_prob = get_probability(sent2_unigram, unigram_prob)
sent2_bigram_prob = get_probability(sent2_bigram, bigram_prob)
sent2_trigram_prob = get_probability(sent2_trigram, trigram_prob)
sent2_unigram_perplexity = get_perplexity(sent2_unigram, unigram_prob)
sent2_bigram_perplexity = get_perplexity(sent2_bigram, bigram_prob)
sent2_trigram_perplexity = get_perplexity(sent2_trigram, trigram_prob)

sent3_unigram = sentence3.split()
sent3_bigram = get_bigram(normalizer.normalize(sentence3))
sent3_trigram = get_trigram(normalizer.normalize(sentence3))
sent3_unigram_prob = get_probability(sent3_unigram, unigram_prob)
sent3_bigram_prob = get_probability(sent3_bigram, bigram_prob)
sent3_trigram_prob = get_probability(sent3_trigram, trigram_prob)
sent3_unigram_perplexity = get_perplexity(sent3_unigram, unigram_prob)
sent3_bigram_perplexity = get_perplexity(sent3_bigram, bigram_prob)
sent3_trigram_perplexity = get_perplexity(sent3_trigram, trigram_prob)

sent4_unigram = sentence4.split()
sent4_bigram = get_bigram(normalizer.normalize(sentence4))
sent4_trigram = get_trigram(normalizer.normalize(sentence4))
sent4_unigram_prob = get_probability(sent4_unigram, unigram_prob)
sent4_bigram_prob = get_probability(sent4_bigram, bigram_prob)
sent4_trigram_prob = get_probability(sent4_trigram, trigram_prob)
sent4_unigram_perplexity = get_perplexity(sent4_unigram, unigram_prob)
sent4_bigram_perplexity = get_perplexity(sent4_bigram, bigram_prob)
sent4_trigram_perplexity = get_perplexity(sent4_trigram, trigram_prob)


result_table = pd.DataFrame({'Unigram Probability':[sent1_unigram_prob, sent2_unigram_prob, sent3_unigram_prob, sent4_unigram_prob],
                             'Bigram Probability': [sent1_bigram_prob, sent2_bigram_prob, sent3_bigram_prob, sent4_bigram_prob],
                             'Trigram Probability': [sent1_trigram_prob, sent2_trigram_prob, sent3_trigram_prob, sent4_trigram_prob],
                             'Unigram Perplexity': [sent1_unigram_perplexity, sent2_unigram_perplexity, sent3_unigram_perplexity, sent4_unigram_perplexity],
                             'Bigram Perplexity': [sent1_bigram_perplexity, sent2_bigram_perplexity, sent3_bigram_perplexity, sent4_bigram_perplexity],
                             'Trigram Perplexity': [sent1_trigram_perplexity, sent2_trigram_perplexity, sent3_trigram_perplexity, sent4_trigram_perplexity]},
                            index=['sentence1', 'sentence2', 'sentence3', 'sentence4'])
result_table

In [None]:
import random

def generate_sentence_trigram(first_sentence: str, input_sentence: list, ngram_probability: dict):

    sentence_max_len = 12 - len(input_sentence)
    predicted_aprt = []

    for _ in range(sentence_max_len):
        current_words = input_sentence[-1]

        probable_words = {}
        for tokens in ngram_probability.keys():
            if tokens[0] == current_words[-1] and tokens[1] == current_words[-2] and tokens[2] == current_words[-3]:
                probable_words[tokens] = ngram_probability[tokens]

            if not probable_words:
                if tokens[0] == current_words[-1] and tokens[1] == current_words[-2]:
                    probable_words[tokens] = ngram_probability[tokens]


            if not probable_words:
                if tokens[0] == current_words[-1]:
                    probable_words[tokens] = ngram_probability[tokens]

        new_word = random.choices(list(probable_words.keys()), weights=probable_words.values())[0]
        new_word = list(new_word)
        predicted_aprt.append(new_word[0:])
        input_sentence.append(new_word[0:])

    index = 0
    for np in predicted_aprt:
        first_sentence  += ' '.join(np[1:]) + ' '
        index += 1

    return first_sentence, predicted_aprt


def generate_sentence_bigram(first_sentence: str, input_sentence: list, ngram_probability: dict):

    sentence_max_len = 12 - len(input_sentence)
    predicted_aprt = []

    for _ in range(sentence_max_len):
        current_words = input_sentence[-1]

        probable_words = {}
        for tokens in ngram_probability.keys():
            if tokens[0] == current_words[-1] and tokens[1] == current_words[-2]:
                probable_words[tokens] = ngram_probability[tokens]

            if not probable_words:
                if tokens[0] == current_words[-1]:
                    probable_words[tokens] = ngram_probability[tokens]

        new_word = random.choices(list(probable_words.keys()), weights=probable_words.values())[0]
        new_word = list(new_word)
        predicted_aprt.append(new_word[0:])
        input_sentence.append(new_word[0:])

    index = 0
    for np in predicted_aprt:
        first_sentence  += ' '.join(np[1:]) + ' '
        index += 1

    return first_sentence, predicted_aprt

In [None]:
sentence1 = 'کیفیت محصولات چینی زرین '
sentence2 = 'از لحاظ جنس جنس خوبی داره '
sentence3 = 'حتما پیشنهاد میکنم '
sentence4 = 'بعد از چند روز استفاده '

normalizer = Normalizer()

sent1_bigram = get_bigram(normalizer.normalize(sentence1))
sent1_trigram = get_trigram(normalizer.normalize(sentence1))

sent2_bigram = get_bigram(normalizer.normalize(sentence2))
sent2_trigram = get_trigram(normalizer.normalize(sentence2))

sent3_bigram = get_bigram(normalizer.normalize(sentence3))
sent3_trigram = get_trigram(normalizer.normalize(sentence3))

sent4_bigram = get_bigram(normalizer.normalize(sentence4))
sent4_trigram = get_trigram(normalizer.normalize(sentence4))


full_sent1_bi, predicted_aprt1_bi = generate_sentence_bigram(sentence1, sent1_bigram, bigram_prob)
full_sent2_bi, predicted_aprt2_bi = generate_sentence_bigram(sentence2, sent2_bigram, bigram_prob)
full_sent3_bi, predicted_aprt3_bi = generate_sentence_bigram(sentence3, sent3_bigram, bigram_prob)
full_sent4_bi, predicted_aprt4_bi = generate_sentence_bigram(sentence4, sent4_bigram, bigram_prob)

full_sent1_bi_perplexity = get_perplexity(get_bigram(full_sent1_bi), bigram_prob)
full_sent2_bi_perplexity = get_perplexity(get_bigram(full_sent2_bi), bigram_prob)
full_sent3_bi_perplexity = get_perplexity(get_bigram(full_sent3_bi), bigram_prob)
full_sent4_bi_perplexity = get_perplexity(get_bigram(full_sent4_bi), bigram_prob)

full_sent1_tri, predicted_aprt1_tri = generate_sentence_trigram(sentence1, sent1_trigram, trigram_prob)
full_sent2_tri, predicted_aprt2_tri = generate_sentence_trigram(sentence2, sent2_trigram, trigram_prob)
full_sent3_tri, predicted_aprt3_tri = generate_sentence_trigram(sentence3, sent3_trigram, trigram_prob)
full_sent4_tri, predicted_aprt4_tri = generate_sentence_trigram(sentence4, sent4_trigram, trigram_prob)

full_sent1_tri_perplexity = get_perplexity(get_trigram(full_sent1_tri), trigram_prob)
full_sent2_tri_perplexity = get_perplexity(get_trigram(full_sent2_tri), trigram_prob)
full_sent3_tri_perplexity = get_perplexity(get_trigram(full_sent3_tri), trigram_prob)
full_sent4_tri_perplexity = get_perplexity(get_trigram(full_sent4_tri), trigram_prob)



predicted_sentence_res = pd.DataFrame({'sentence': [sentence1, sentence2, sentence3, sentence4],
                                       'predicted with bigram':[full_sent1_bi, full_sent2_bi, full_sent3_bi, full_sent4_bi],
                                       'predicted perplexity bigram': [full_sent1_bi_perplexity, full_sent2_bi_perplexity, full_sent3_bi_perplexity, full_sent4_bi_perplexity],
                                       'predicted with trigram': [full_sent1_tri, full_sent2_tri, full_sent3_tri, full_sent4_tri],
                                       'predicted perplexity trigram': [full_sent1_tri_perplexity, full_sent2_tri_perplexity, full_sent3_tri_perplexity, full_sent4_tri_perplexity]})

predicted_sentence_res

In [None]:
# Pos Tagging
from hazm import POSTagger

posTagger = POSTagger(model='pos_tagger.model')
tagged_all_words = posTagger.tag(tokens = all_words)
tagged_all_words

In [None]:
# count occurrence of all tokens

def get_pos_occurrence(tagged_data: list):
    pos_counter = {}  #key: pos  value: number of pos occurrence
    for word, tag in tagged_data:
        if tag not in pos_counter.keys():
            pos_counter[tag] = 1
        else:
            pos_counter[tag] += 1
    return pos_counter

pos_counted = get_pos_occurrence(tagged_all_words)
pos_counted_df = pd.DataFrame(data={'words': pos_counted.keys(), 'occurrence': pos_counted.values()})
pos_counted_df

In [None]:
# showing the most observed Nouns

def get_name_occurrence(tagged_data: list):
    name_counter = {}  #key: pos  value: number of pos occurrence
    for word, tag in tagged_data:
        if tag == 'NOUN':
            if word not in name_counter.keys():
                name_counter[word] = 0
            name_counter[word] += 1
    return name_counter

name_counted = get_name_occurrence(tagged_all_words)
name_counted_df = pd.DataFrame(data={'name': name_counted.keys(), 'occurrence': name_counted.values()}).sort_values(by='occurrence', ascending=False)[:15]
name_counted_df