In [57]:
import pandas as pd
import nltk
import string
import collections
import math
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk import bigrams

# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# Post-type labels. They are used as group keys on the "Post Type" column and
# compared against that column's values when scoring predictions.
STORY = "story"
ASK_HN = "ask_hn"
SHOW_HN = "show_hn"
POLL = "poll"

# Load the stop-word list once. The context manager closes the handle even if
# the read raises.
# NOTE(review): remove_words holds the raw file text (one str), so the
# `word in remove_words` check in get_words_and_frequncy is a substring test,
# not a whole-word lookup -- confirm that is intended.
with open("remove_words.txt", "r") as remove_words_file:
    remove_words = remove_words_file.read()

def get_pos_tag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN


# def lemmatize_and_store(sentence, vocabulary, post_type_vocabulary) :
#     lemmatized_words = []
#     for word in nltk.word_tokenize(sentence.lower()) :
#         if word not in remove_words :
#             lemmatized_word = lemmatizer.lemmatize(word, get_pos_tag(word))
#             lemmatized_words.append(lemmatized_word)
#             add_to_vocabulary(lemmatized_word, vocabulary)
#             add_to_vocabulary(lemmatized_word, post_type_vocabulary)
#     return lemmatized_words


def get_words_and_frequncy(sentence):
    """Count the words and noun-noun bigrams of one sentence.

    The sentence is lower-cased and tokenized with ``nltk.word_tokenize``.
    Single and double quote characters are removed from every token; tokens
    that end up empty, or that match ``remove_words``, are dropped. Every
    adjacent pair of kept words that are both tagged as nouns is added as a
    "first second" term, and the count of each of its two parts is reduced
    by one in this sentence's counts.

    Args:
        sentence: the text to analyse (a post title in this file).

    Returns:
        dict mapping each word or two-word term to its frequency within
        this sentence only.
    """
    local_vocabulary = dict()
    words = []
    for token in nltk.word_tokenize(sentence.lower()):
        word = token.replace("'", "").replace("\"", "").strip()
        if not word:
            continue
        # NOTE(review): remove_words is a str at module level, so this is a
        # substring test rather than a whole-word stop-list lookup.
        if word in remove_words:
            continue
        add_to_vocabulary(word, local_vocabulary, 1)
        words.append(word)

    # Tag every word once, in isolation, instead of re-tagging it for each
    # bigram it takes part in.
    is_noun = [nltk.pos_tag([w])[0][1][0].upper() == "N" for w in words]

    for position, (first, second) in enumerate(bigrams(words)):
        if is_noun[position] and is_noun[position + 1]:
            add_to_vocabulary(first + " " + second, local_vocabulary, 1)
            # Adjust this sentence's own counts. The previous code passed the
            # module-level `vocabulary` here, which mutated global state
            # (keyed by lemmatized terms) and left the returned dict
            # unadjusted.
            reduce_frequency(first, local_vocabulary)
            reduce_frequency(second, local_vocabulary)

    return local_vocabulary


def add_to_vocabulary(word, vocabulary, frequency):
    """Add ``frequency`` to the count stored for ``word`` in ``vocabulary``.

    The mapping is updated in place; a word that is not present yet starts
    at ``frequency``.
    """
    if word not in vocabulary:
        vocabulary[word] = frequency
        return
    vocabulary[word] += frequency


def reduce_frequency(word, vocabulary):
    """Decrease the count of ``word`` by one, in place.

    Words that are not in ``vocabulary`` are ignored. An entry whose count
    reaches exactly zero is removed from the mapping.
    """
    if word not in vocabulary:
        return

    remaining = vocabulary[word] - 1
    if remaining == 0:
        del vocabulary[word]
    else:
        vocabulary[word] = remaining


# def get_lemmatized_words(sentence) :
#     lemmatized_words = []
#     words = []
#     for word in nltk.word_tokenize(sentence.lower()) :
#         if word in remove_words :
#             continue
#         # Add condition for remove words
#         words.append(word)
    
#     last_word = ""
#     string_bigrams = bigrams(words.copy())
#     for gram in string_bigrams: 
#         tag_1 = nltk.pos_tag([gram[0]])[0][1][0].upper()
#         tag_2 = nltk.pos_tag([gram[1]])[0][1][0].upper()
#         pair = gram[0] + " " + gram[1]
#         if tag_1 == "N" or tag_2 == "N":
#             words.append(pair)
#             words.remove(gram[0])
#             last_word = gram[1]
    
#     if last_word in words:
#         words.remove(last_word)
            
#     for word in words :
#         lemmatized_word = lemmatizer.lemmatize(word, get_pos_tag(word))
#         lemmatized_words.append(lemmatized_word)
            
#     return lemmatized_words


def get_lemmatized_words(words):
    """Return the lemma of every word in ``words``, in iteration order.

    Each word is lemmatized with the module-level ``lemmatizer`` using the
    part of speech reported by ``get_pos_tag`` for that word.
    """
    return [lemmatizer.lemmatize(term, get_pos_tag(term)) for term in words]


def create_vocabulary(posts, vocabulary, post_type_vocabulary, max_posts=1001):
    """Accumulate lemmatized term counts from post titles.

    For each processed post, the title is turned into term counts with
    ``get_words_and_frequncy``; every term is lemmatized and its count is
    added to both ``vocabulary`` and ``post_type_vocabulary`` (both are
    updated in place).

    Args:
        posts: DataFrame of posts; each row must expose a ``Title`` field.
        vocabulary: dict of counts shared across all post types.
        post_type_vocabulary: dict of counts for this post type only.
        max_posts: maximum number of rows to process, or ``None`` for no
            limit. The default of 1001 matches what the former
            ``index > 1000`` cut-off allowed on a 0-based contiguous index.
    """
    # Count rows by position. The index label yielded by iterrows() is the
    # row's label in the source frame, so on a filtered or grouped subset
    # comparing it with 1000 could stop after an arbitrary number of rows --
    # or before the first one.
    for position, (_, post) in enumerate(posts.iterrows()):
        if max_posts is not None and position >= max_posts:
            break
        local_vocabulary = get_words_and_frequncy(post.Title)

        for word, count in local_vocabulary.items():
            lemmatized_word = lemmatizer.lemmatize(word, get_pos_tag(word))
            add_to_vocabulary(lemmatized_word, vocabulary, count)
            add_to_vocabulary(lemmatized_word, post_type_vocabulary, count)


# def add_to_bigram_vocabulary(word, bigram_vocabulary, post_type) :
#     if word in bigram_vocabulary :
#         bigram_vocabulary[word][0] += 1
#         if post_type in bigram_vocabulary[word][1] :
#             bigram_vocabulary[word][1][post_type] += 1
#         else :
#             bigram_vocabulary[word][1][post_type] = 1
#     else :
#         post_type_dict = dict()
#         post_type_dict[post_type] = 1
#         bigram_vocabulary[word] = [1, post_type_dict]


# def calculate_bigram_prob(bigram_vocabulary, vocabulary, bigram_probability, delta):
#     for gram in bigram_vocabulary:
#         prob = (bigram_vocabulary[gram][0] + DELTA) / (vocabulary[gram[0]] + (len(vocabulary) * delta))
#         bigram_probability[gram] = prob


def calculate_conditional_prob(values, word, post_type_vocab, post_type_total_words, vocabulary_size, delta):
    """Append a word's count and smoothed log10 likelihood to ``values``.

    The likelihood is
    ``(count + delta) / (post_type_total_words + vocabulary_size * delta)``,
    where ``count`` is the word's entry in ``post_type_vocab`` (0 when the
    word is absent). Two items are appended to ``values``, in this order:
    the raw count, then ``log10`` of the likelihood rounded to 10 decimals.
    """
    occurrences = post_type_vocab[word] if word in post_type_vocab else 0
    smoothed_total = post_type_total_words + (vocabulary_size * delta)
    likelihood = (occurrences + delta) / smoothed_total

    values.append(occurrences)
    values.append(round(math.log10(likelihood), 10))


def create_line(line_no, title, values):
    """Build one output line: line number, title, then every value.

    Fields are separated by two spaces and the line ends with a newline.
    ``line_no`` and each value are converted with ``str``; ``title`` must
    already be a string.
    """
    fields = [str(line_no), title]
    fields.extend(str(value) for value in values)
    return "  ".join(fields) + "\n"


def calculate_score(words, training_model, post_type_probability, index):
    """Return the log10 score of ``words`` for one post type.

    The score starts at ``log10(post_type_probability)`` rounded to 10
    decimals. For every word found in ``training_model``, the value stored
    at position ``index`` of that word's entry is added. Words missing from
    the model contribute nothing.
    """
    score = round(math.log10(post_type_probability), 10)
    for term in words:
        if term not in training_model:
            continue
        score += training_model[term][index]
    return score
    
def predict_post_type(story_score, ask_score, show_score, poll_score):
    """Return the post-type label whose score is the highest.

    When several scores tie for the maximum, the earliest one wins, in the
    order story, ask, show, poll.
    """
    scores = [story_score, ask_score, show_score, poll_score]
    # Labels listed in the same order as the scores above.
    labels = (STORY, ASK_HN, SHOW_HN, POLL)
    return labels[scores.index(max(scores))]

In [58]:
# Load the full dataset; later cells slice it by year.
csvdf = pd.read_csv('hn2018_2019.csv', delimiter=',', encoding='utf-8')

# Training set: rows whose "Created At" falls within 2018 (both bounds inclusive).
# NOTE(review): the bounds are plain strings, so this presumably relies on
# "Created At" being read as "YYYY-MM-DD HH:MM:SS" text -- confirm.
data_2018 = csvdf[(csvdf["Created At"] >= "2018-01-01 00:00:00") & (csvdf["Created At"] <= "2018-12-31 23:59:59")]

# NOTE(review): DataFrame.size is rows * columns, not the number of posts.
# The prior probabilities computed later divide per-type .size values by this
# number, so this line must stay in step with those if it is ever changed.
total_post = data_2018.size
print("Total Post: ", total_post)

# From here on data_2018 is a GroupBy object keyed by post type, not a DataFrame.
data_2018 = data_2018.groupby("Post Type")

# One DataFrame per post type.
story_posts = data_2018.get_group(STORY)
ask_posts = data_2018.get_group(ASK_HN)
show_posts = data_2018.get_group(SHOW_HN)
poll_posts = data_2018.get_group(POLL)


Total Post:  2492829


In [59]:
# Smoothing constant handed to calculate_conditional_prob as `delta`.
DELTA = 0.5

# Task 0 Start - Create Vocabulary
vocabulary = dict()
story_post_vocabulary = dict()
ask_post_vocabulary = dict()
show_post_vocabulary = dict()
poll_post_vocabulary = dict()
# bigram_vocabulary = dict()
# bigram_probability = dict()

# Every call adds its post type's term counts to the shared vocabulary and to
# that type's own vocabulary.
create_vocabulary(story_posts, vocabulary, story_post_vocabulary)
create_vocabulary(ask_posts, vocabulary, ask_post_vocabulary)
create_vocabulary(show_posts, vocabulary, show_post_vocabulary)
create_vocabulary(poll_posts, vocabulary, poll_post_vocabulary)

# for gram in bigram_vocabulary:
#     print(gram, ": ", bigram_vocabulary[gram][0])

# calculate_bigram_prob(bigram_vocabulary, vocabulary, bigram_probability, DELTA)

# bigram_probability = collections.OrderedDict(sorted(bigram_probability.items(), key=lambda kv:kv[1], reverse=True))
# for gram in bigram_probability:
#     print(gram, ": ", bigram_probability[gram])

# Write one "<term> <count>" line per vocabulary entry. The context manager
# closes the file even if a write fails, and UTF-8 is explicit so titles with
# non-ASCII characters do not depend on the platform default encoding
# (matching baseline-result.txt).
with open("vocabulary.txt", "w", encoding="utf-8") as vocabulary_file:
    for word, count in vocabulary.items():
        vocabulary_file.write(word+" "+str(count)+"\n")

print("Vocabulary Done..")
# Task 0 End


# Task 1 Start - Build the model
# Maps each term to [story_count, story_logp, ask_count, ask_logp,
# show_count, show_logp, poll_count, poll_logp].
training_model = dict()

story_post_total_words = sum(story_post_vocabulary.values())
ask_post_total_words = sum(ask_post_vocabulary.values())
show_post_total_words = sum(show_post_vocabulary.values())
poll_post_total_words = sum(poll_post_vocabulary.values())
vocabulary_size = len(vocabulary)

# print(story_posts.size)
# print(ask_posts.size)
# print(show_posts.size)
# print(poll_posts.size)

# Prior probability of each post type.
# NOTE(review): .size counts cells (rows * columns); total_post is computed
# with .size as well, so numerator and denominator must keep using the same
# measure.
story_probability = story_posts.size / total_post
ask_probability = ask_posts.size / total_post
show_probability = show_posts.size / total_post
poll_probability = poll_posts.size / total_post

# Sort vocabulary alphabetically
vocabulary = collections.OrderedDict(sorted(vocabulary.items(), key=lambda kv:kv[0]))

line_no = 0

# The context manager closes the model file even if a row fails to compute,
# and UTF-8 is explicit so non-ASCII terms do not depend on the platform
# default encoding (matching baseline-result.txt).
with open("model-2018.txt", "w", encoding="utf-8") as model_file:
    for word in vocabulary.keys():
        line_no += 1
        values = []
        # Each call appends two items for its post type: count, then log10 probability.
        calculate_conditional_prob(values, word, story_post_vocabulary, story_post_total_words, vocabulary_size, DELTA)
        calculate_conditional_prob(values, word, ask_post_vocabulary, ask_post_total_words, vocabulary_size, DELTA)
        calculate_conditional_prob(values, word, show_post_vocabulary, show_post_total_words, vocabulary_size, DELTA)
        calculate_conditional_prob(values, word, poll_post_vocabulary, poll_post_total_words, vocabulary_size, DELTA)
        training_model[word] = values

        model_file.write(create_line(line_no, word, values))
# Task 1 End

print("Task 1 Done..")

Vocabulary Done..
Task 1 Done..


In [56]:
# Task 2 Start - Test dataset
# Test set: rows whose "Created At" falls within 2019 (both bounds inclusive).
data_2019 = csvdf[(csvdf["Created At"] >= "2019-01-01 00:00:00") & (csvdf["Created At"] <= "2019-12-31 23:59:59")]

line_no = 0

# The context manager guarantees the result file is flushed and closed even
# if scoring raises part-way through the loop.
with open("baseline-result.txt", "w", encoding="utf-8") as baseline_result:
    for index, post in data_2019.iterrows():
        # Debug cap on the number of classified posts.
        if line_no > 1000:
            break
        line_no += 1
        words = get_words_and_frequncy(post.Title)
        # Only the distinct terms are scored; their in-title frequencies are not used.
        lemmatized_words = get_lemmatized_words(words.keys())

        # Model rows are laid out as (count, log-prob) per post type, so the
        # log-probabilities sit at the odd positions 1, 3, 5 and 7.
        story_score = calculate_score(lemmatized_words, training_model, story_probability, 1)
        ask_score = calculate_score(lemmatized_words, training_model, ask_probability, 3)
        show_score = calculate_score(lemmatized_words, training_model, show_probability, 5)
        poll_score = calculate_score(lemmatized_words, training_model, poll_probability, 7)

        predicted_post_type = predict_post_type(story_score, ask_score, show_score, poll_score)
        original_post_type = post["Post Type"]
        output = "right" if original_post_type == predicted_post_type else "wrong"
        values = [original_post_type, story_score, ask_score, show_score, poll_score, predicted_post_type, output]

        baseline_result.write(create_line(line_no, post.Title, values))
# Task 2 End

print("Task 2 Done..")

Task 2 Done..


In [62]:
# Lemmatize a Sentence with the appropriate POS tag
# Scratch cell: prints intermediate NLTK results for a sample sentence.
# NOTE: this first sample is overwritten by the assignment just below and is
# never used.
sentence = """Following mice attacks MySQL 10% HN: on UAE ASK-HR Dr. Ph.D. sagar's $300 etc. caring farmers were marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""

sentence = "The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing"

print("\nOrignal Sentence: ")
print(sentence)

print("\nNew Sentence: ")
# The lemmatizing call is commented out, so the "new" sentence is the input unchanged.
newSentence = sentence
# newSentence = get_lemmatized_words(sentence)
print(newSentence)
print("\nPunctuation",string.punctuation)

print("\n")
# Print every adjacent pair of lower-cased tokens.
string_bigrams = bigrams(nltk.word_tokenize(sentence.lower()))
for gram in string_bigrams: 
    print(gram[0]+" "+gram[1])

print("\nSplit:",newSentence)
# Walk through the tagging/lemmatizing steps for one word: first letter of the
# raw tag, the WordNet constant from get_pos_tag, then the resulting lemma.
word = "enjoyed"
print(nltk.pos_tag([word])[0][1][0].upper())
print(get_pos_tag(word))
print(lemmatizer.lemmatize(word, get_pos_tag(word)))
# print(get_lemmatized_words(word))
print(wordnet.NOUN.upper())

# Quick check of round() with a digit count.
print(round(10.87348434, 5))



Orignal Sentence: 
The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing

New Sentence: 
The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing

Punctuation !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


the tech
tech that
that was
was fixed
fixed in
in 2018
2018 and
and the
the tech
tech that
that still
still needs
needs fixing

Split: The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing
V
v
enjoy
N
10.87348


In [7]:
# Scratch cell: experiment with merging bigrams that contain a noun into
# single terms before lemmatizing.
sentence = "The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing"
lemmatized_words = []
words = []
print(sentence)
print(nltk.word_tokenize(sentence.lower()))
for word in nltk.word_tokenize(sentence.lower()) :
    # string.punctuation is a str, so this is a substring test on the token.
    if word not in string.punctuation :
        words.append(word)
    # Add condition for remove words
#     words.append(word)
print(words)
            
# Bigrams are built from a copy because `words` is mutated inside the loop below.
string_bigrams = bigrams(words.copy())
# for g in string_bigrams: 
#     print(g)
last_word = ""
for gram in string_bigrams: 
    print(gram)
    p1 = gram[0]
    p2 = gram[1]
    # Each word is tagged in isolation; only the tag's first letter is kept.
    tag_1 = nltk.pos_tag([p1])[0][1][0].upper()
    tag_2 = nltk.pos_tag([p2])[0][1][0].upper()
    pair = p1 + " " + p2
    # A pair is merged when EITHER word is tagged as a noun.
    if tag_1 == "N" or tag_2 == "N":
        words.append(pair)
        print("Pair:", pair)
        print(p1)
        print(p2)
        # list.remove drops the first occurrence and raises ValueError if p1
        # is no longer in the list.
        words.remove(p1)
#         words.remove(p2)
        last_word = p2
        print(words)
# NOTE(review): this uses p2 left over from the final loop iteration, whether
# or not that bigram was merged; last_word is assigned above but never read.
# Presumably last_word was meant here -- confirm.
words.remove(p2)        
for w in words :
    lemmatized_word = lemmatizer.lemmatize(w, get_pos_tag(w))
    lemmatized_words.append(lemmatized_word)
print(lemmatized_words)

The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing
['the', 'tech', 'that', 'was', 'fixed', 'in', '2018', 'and', 'the', 'tech', 'that', 'still', 'needs', 'fixing']
['the', 'tech', 'that', 'was', 'fixed', 'in', '2018', 'and', 'the', 'tech', 'that', 'still', 'needs', 'fixing']
('the', 'tech')
Pair: the tech
the
tech
['tech', 'that', 'was', 'fixed', 'in', '2018', 'and', 'the', 'tech', 'that', 'still', 'needs', 'fixing', 'the tech']
('tech', 'that')
Pair: tech that
tech
that
['that', 'was', 'fixed', 'in', '2018', 'and', 'the', 'tech', 'that', 'still', 'needs', 'fixing', 'the tech', 'tech that']
('that', 'was')
('was', 'fixed')
('fixed', 'in')
('in', '2018')
('2018', 'and')
('and', 'the')
('the', 'tech')
Pair: the tech
the
tech
['that', 'was', 'fixed', 'in', '2018', 'and', 'tech', 'that', 'still', 'needs', 'fixing', 'the tech', 'tech that', 'the tech']
('tech', 'that')
Pair: tech that
tech
that
['that', 'was', 'fixed', 'in', '2018', 'and', 'that', 'still', 'needs', 'fixing'