In [16]:
import pandas as pd
import nltk
import string
import collections
import math
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk import bigrams

# Shared WordNet lemmatizer instance, reused by every lemmatization call below.
lemmatizer = WordNetLemmatizer()

# Module-level experiment switches and word lists. They are read inside
# get_words_and_frequncy() and reassigned by the menu cell at the bottom.
is_stopword_exp = False    # when True, tokens found in stop_words are skipped
is_wordlength_exp = False  # gates the word-length check during tokenization
remove_words = []          # tokens that are always skipped
stop_words = []            # tokens skipped only while is_stopword_exp is True


class HackerNews:
    """
    Container for the state of one training/testing run.

    Holds the posts of year 2018 split by post type, the complete vocabulary,
    one vocabulary per post type, the post type constants, the training model
    and the probability of each post type.
    """

    # Post type labels. They are defined on the class so they can be read
    # without creating an instance (HackerNews.STORY); instances still expose
    # them through normal attribute lookup (hacker_news.STORY).
    STORY = "story"
    ASK_HN = "ask_hn"
    SHOW_HN = "show_hn"
    POLL = "poll"

    def __init__(self):
        # Filled in by read_training_data().
        self.total_post = 0
        self.story_posts = []
        self.ask_posts = []
        self.show_posts = []
        self.poll_posts = []
        # word -> frequency, overall and per post type.
        self.vocabulary = dict()
        self.story_post_vocabulary = dict()
        self.ask_post_vocabulary = dict()
        self.show_post_vocabulary = dict()
        self.poll_post_vocabulary = dict()
        # word -> [count, log10 prob] pairs, one pair per post type.
        self.training_model = dict()
        # Prior probability of each post type.
        self.story_probability = 0.0
        self.ask_probability = 0.0
        self.show_probability = 0.0
        self.poll_probability = 0.0



def get_pos_tag(word):
    """
    Translate the NLTK part-of-speech tag of a single word into a WordNet
    POS constant.

    The word is tagged on its own with nltk.pos_tag and only the upper-cased
    first character of the tag is inspected: "J" -> adjective, "N" -> noun,
    "V" -> verb, "R" -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN


def get_words_and_frequncy(sentence):
    """
    Tokenize a title and count its words and noun-noun bigrams.

    The sentence is lower-cased and split with nltk.word_tokenize. Single and
    double quote characters are removed from every token. A token is skipped
    when it is empty, is listed in ``remove_words``, is listed in
    ``stop_words`` while ``is_stopword_exp`` is on, or has length <= 2 or
    >= 9 while ``is_wordlength_exp`` is on.

    Every adjacent pair of kept tokens whose individual POS tags both start
    with "N" is added as the entry "first second", and the count of each of
    its two parts is reduced by one (the part is removed once it reaches 0).

    Returns:
        dict mapping each word / bigram to its frequency in the sentence.
    """
    local_vocabulary = dict()
    words = []

    for word in nltk.word_tokenize(sentence.lower()):
        word = word.replace("'", "").strip()
        word = word.replace("\"", "").strip()
        if len(word) == 0:
            continue
        if word in remove_words:
            continue
        if is_stopword_exp and word in stop_words:
            continue
        # A length cannot be both <= 2 and >= 9, so the two bounds must be
        # combined with `or` for the word-length experiment to filter anything.
        if is_wordlength_exp and (len(word) <= 2 or len(word) >= 9):
            continue
        add_to_vocabulary(word, local_vocabulary, 1)
        words.append(word)

    # No length check on bigrams: both parts already passed the filter above,
    # and the joined "first second" string is not a single word.
    for first, second in bigrams(words):
        first_tag = nltk.pos_tag([first])[0][1][0].upper()
        second_tag = nltk.pos_tag([second])[0][1][0].upper()
        if first_tag == "N" and second_tag == "N":
            add_to_vocabulary(first + " " + second, local_vocabulary, 1)
            reduce_frequency(first, local_vocabulary)
            reduce_frequency(second, local_vocabulary)

    return local_vocabulary


def add_to_vocabulary(word, vocabulary, frequency):
    """
    Increase the stored frequency of ``word`` in ``vocabulary`` by
    ``frequency``, creating the entry when the word is not present yet.
    The dictionary is modified in place; nothing is returned.
    """
    try:
        vocabulary[word] += frequency
    except KeyError:
        # First occurrence of this word.
        vocabulary[word] = frequency


def reduce_frequency(word, vocabulary):
    """
    Lower the stored frequency of ``word`` by one, in place.

    Words that are not in ``vocabulary`` are ignored. An entry whose
    frequency becomes exactly 0 is deleted from the dictionary.
    """
    if word not in vocabulary:
        return

    remaining = vocabulary[word] - 1
    if remaining == 0:
        del vocabulary[word]
    else:
        vocabulary[word] = remaining


def get_lemmatized_words(words):
    """
    Return a new list holding the lemma of every item of ``words``, in the
    same order. Each item is lemmatized with the shared ``lemmatizer`` using
    the WordNet POS returned by get_pos_tag() for that item.
    """
    return [lemmatizer.lemmatize(token, get_pos_tag(token)) for token in words]


def create_vocabulary(posts, vocabulary, post_type_vocabulary, max_posts=1001):
    """
    Add the lemmatized words of the given posts to two vocabularies.

    Args:
        posts: DataFrame of posts; the ``Title`` of each row is tokenized
            with get_words_and_frequncy().
        vocabulary: dict (word -> frequency) shared by all post types,
            updated in place.
        post_type_vocabulary: dict (word -> frequency) of this post type
            only, updated in place.
        max_posts: maximum number of rows to process, counted by position.
            ``None`` processes every row. The default of 1001 matches the
            number of rows the previous ``index > 1000`` cut-off let through
            on a frame indexed 0, 1, 2, ...
    """
    # Count rows by position. The DataFrame index label is the original CSV
    # row number once the frame has been filtered/grouped, so comparing the
    # label against the cap would stop at the first row that merely sits
    # late in the file.
    for position, (_, post) in enumerate(posts.iterrows()):
        if max_posts is not None and position >= max_posts:
            break
        local_vocabulary = get_words_and_frequncy(post.Title)

        for word, count in local_vocabulary.items():
            lemmatized_word = lemmatizer.lemmatize(word, get_pos_tag(word))
            add_to_vocabulary(lemmatized_word, vocabulary, count)
            add_to_vocabulary(lemmatized_word, post_type_vocabulary, count)


def calculate_conditional_prob(values, word, post_type_vocab, post_type_total_words, vocabulary_size, delta):
    """
    Append two entries for ``word`` to ``values``: its raw count in
    ``post_type_vocab`` (0 when absent) followed by the log10 of its smoothed
    conditional probability, rounded to 10 decimals.

    The probability is
        (count + delta) / (post_type_total_words + vocabulary_size * delta)
    """
    occurrences = post_type_vocab[word] if word in post_type_vocab else 0

    smoothed_count = occurrences + delta
    smoothed_total = post_type_total_words + (vocabulary_size * delta)
    likelihood = smoothed_count / smoothed_total

    values.append(occurrences)
    values.append(round(math.log10(likelihood), 10))


def create_line(line_no, title, values):
    """
    Build one output line: the line number, the title and every item of
    ``values`` (converted with str), separated by two spaces and terminated
    by a newline.
    """
    fields = [str(line_no), title]
    fields.extend(str(value) for value in values)
    return "  ".join(fields) + "\n"


def calculate_score(words, training_model, post_type_probability, index):
    """
    Compute the log10 score of one post type for a list of words.

    Args:
        words: iterable of (lemmatized) words of the post being scored.
        training_model: dict mapping a word to its list of model values.
        post_type_probability: prior probability of the post type.
        index: position, inside each word's value list, of the log10
            conditional probability for this post type.

    Returns:
        log10(prior) rounded to 10 decimals plus the value at ``index`` for
        every word found in ``training_model``. Words that are not in the
        model contribute nothing. When the prior is not positive the result
        is ``-inf``.
    """
    # log10 is undefined for a prior of 0 (a post type with no training
    # posts); such a type can never be the most likely one, so give it the
    # lowest possible score instead of raising ValueError.
    if post_type_probability <= 0:
        return float("-inf")

    post_type_score = round(math.log10(post_type_probability), 10)
    for word in words:
        if word in training_model:
            post_type_score += training_model[word][index]
    return post_type_score


def predict_post_type(story_score, ask_score, show_score, poll_score, hacker_news=None):
    """
    Return the label of the post type with the highest score.

    Args:
        story_score, ask_score, show_score, poll_score: scores of the four
            post types, in that order.
        hacker_news: object providing the STORY, ASK_HN, SHOW_HN and POLL
            labels. When omitted, the notebook-global ``hackerNews`` is used,
            which only exists after the menu cell has created it.

    Returns:
        The label matching the largest score. On a tie the earliest type in
        the order story, ask, show, poll wins.
    """
    if hacker_news is None:
        # Backward-compatible fallback for callers that pass only the scores.
        hacker_news = hackerNews

    labels = (hacker_news.STORY, hacker_news.ASK_HN, hacker_news.SHOW_HN, hacker_news.POLL)
    scores = [story_score, ask_score, show_score, poll_score]
    return labels[scores.index(max(scores))]


def get_stop_words():
    """
    Read "Stopwords.txt" from the working directory and return its
    whitespace-separated entries as a list of strings.
    """
    # The context manager closes the file even when reading fails.
    with open("Stopwords.txt", "r") as stop_words_file:
        return stop_words_file.read().split()


def get_remove_words():
    """
    Read "remove_words.txt" from the working directory and return its
    whitespace-separated entries as a list of strings.
    """
    # The context manager closes the file even when reading fails.
    with open("remove_words.txt", "r") as remove_words_file:
        return remove_words_file.read().split()

# Marker printed after the helper definitions of this cell have been executed.
print("Done")

Done


In [9]:
def read_training_data(hackerNews):
    """
    Load the 2018 posts from 'hn2018_2019.csv' and store them on
    ``hackerNews`` split by post type.

    Rows are selected by comparing the "Created At" column, as read from the
    CSV, against the first and last timestamp of 2018.

    Sets on ``hackerNews``:
        total_post: ``DataFrame.size`` of the 2018 rows (rows x columns, not
            the row count). create_training_model() divides each group's
            ``.size`` by this value, so both must use the same measure.
        story_posts, ask_posts, show_posts, poll_posts: the 2018 rows of the
            matching "Post Type"; an empty frame with the same columns when
            the type does not occur.
    """
    csvdf = pd.read_csv('hn2018_2019.csv', delimiter=',', encoding='utf-8')

    in_2018 = (csvdf["Created At"] >= "2018-01-01 00:00:00") & (csvdf["Created At"] <= "2018-12-31 23:59:59")
    data_2018 = csvdf[in_2018]

    hackerNews.total_post = data_2018.size

    grouped = data_2018.groupby("Post Type")

    def _posts_of(post_type):
        # get_group() raises KeyError for a label with no rows; a post type
        # missing from 2018 should yield an empty frame instead.
        if post_type in grouped.groups:
            return grouped.get_group(post_type)
        return data_2018.iloc[0:0]

    hackerNews.story_posts = _posts_of(hackerNews.STORY)
    hackerNews.ask_posts = _posts_of(hackerNews.ASK_HN)
    hackerNews.show_posts = _posts_of(hackerNews.SHOW_HN)
    hackerNews.poll_posts = _posts_of(hackerNews.POLL)

# Marker printed after read_training_data has been defined.
print("Done")

Done


In [7]:
def create_training_model(hackerNews, model_file_name, delta):
    """
    Build the vocabulary and the training model from the posts stored on
    ``hackerNews`` and write both to disk.

    Steps:
      1. Fill the overall and the per-type vocabularies from the story, ask,
         show and poll posts, then sort the overall vocabulary by word.
      2. Write "vocabulary.txt" with one "word count" line per word.
      3. Set the four ``*_probability`` attributes to the ``.size`` of each
         group divided by ``hackerNews.total_post``.
      4. For every word store, per post type and in the order story, ask,
         show, poll, its count and smoothed log10 conditional probability in
         ``hackerNews.training_model[word]`` and write the same values as one
         numbered line of ``model_file_name``.

    Args:
        hackerNews: state object already populated by read_training_data().
        model_file_name: path of the model file to (over)write.
        delta: smoothing value passed to calculate_conditional_prob().
    """
    # Create Vocabulary - Start
    print("\nCreating Vocabulary....")
    posts_and_vocabularies = [
        (hackerNews.story_posts, hackerNews.story_post_vocabulary),
        (hackerNews.ask_posts, hackerNews.ask_post_vocabulary),
        (hackerNews.show_posts, hackerNews.show_post_vocabulary),
        (hackerNews.poll_posts, hackerNews.poll_post_vocabulary),
    ]
    for posts, post_type_vocabulary in posts_and_vocabularies:
        create_vocabulary(posts, hackerNews.vocabulary, post_type_vocabulary)

    # Sort vocabulary alphabetically
    hackerNews.vocabulary = collections.OrderedDict(sorted(hackerNews.vocabulary.items(), key=lambda kv: kv[0]))

    # utf-8 matches the model file below and keeps non-ASCII words writable
    # regardless of the platform's default encoding.
    with open("vocabulary.txt", "w", encoding="utf-8") as vocabulary_file:
        for word, count in hackerNews.vocabulary.items():
            vocabulary_file.write(word + " " + str(count) + "\n")

    print("Vocabulary Created..!!!")
    # Create Vocabulary - End

    # Build Training Model - Start
    # (vocabulary, total word count) per post type, in model column order.
    vocabularies_and_totals = [
        (post_type_vocabulary, sum(post_type_vocabulary.values()))
        for _, post_type_vocabulary in posts_and_vocabularies
    ]
    vocabulary_size = len(hackerNews.vocabulary)

    # Both sides of each ratio are DataFrame.size values (rows x columns).
    hackerNews.story_probability = hackerNews.story_posts.size / hackerNews.total_post
    hackerNews.ask_probability = hackerNews.ask_posts.size / hackerNews.total_post
    hackerNews.show_probability = hackerNews.show_posts.size / hackerNews.total_post
    hackerNews.poll_probability = hackerNews.poll_posts.size / hackerNews.total_post

    with open(model_file_name, "w", encoding="utf-8") as model_file:
        for line_no, word in enumerate(hackerNews.vocabulary.keys(), start=1):
            values = []
            for post_type_vocabulary, post_type_total_words in vocabularies_and_totals:
                calculate_conditional_prob(values, word, post_type_vocabulary, post_type_total_words, vocabulary_size, delta)
            hackerNews.training_model[word] = values

            model_file.write(create_line(line_no, word, values))

    print("Training Model Created..!!!")
    # Build Training Model - End

# Marker printed after create_training_model has been defined.
print("Done")

Done


In [13]:
def test_dataset(hackerNews, result_file_name):
    """
    Classify 2019 posts with the trained model and write one result line per
    post to ``result_file_name``.

    The 2019 rows are selected from 'hn2018_2019.csv' by their "Created At"
    value. At most the first 1001 of them are scored. Each output line holds
    the line number, the title, the original post type, the story / ask /
    show / poll scores, the predicted post type and "right" or "wrong".

    Args:
        hackerNews: state object holding ``training_model`` and the four
            ``*_probability`` priors.
        result_file_name: path of the result file to (over)write.
    """
    # Testing dataset - Start
    csvdf = pd.read_csv('hn2018_2019.csv', delimiter=',', encoding='utf-8')
    data_2019 = csvdf[(csvdf["Created At"] >= "2019-01-01 00:00:00") & (csvdf["Created At"] <= "2019-12-31 23:59:59")]

    # The context manager closes the result file even if scoring a post fails.
    with open(result_file_name, "w", encoding="utf-8") as baseline_result:
        line_no = 0

        for index, post in data_2019.iterrows():
            if line_no > 1000:
                break
            line_no += 1
            words = get_words_and_frequncy(post.Title)
            lemmatized_words = get_lemmatized_words(words.keys())

            # Model values are [count, log prob] pairs per post type, so the
            # log probabilities sit at the odd positions 1, 3, 5 and 7.
            story_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.story_probability, 1)
            ask_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.ask_probability, 3)
            show_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.show_probability, 5)
            poll_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.poll_probability, 7)

            predicted_post_type = predict_post_type(story_score, ask_score, show_score, poll_score)
            original_post_type = post["Post Type"]
            output = "right" if original_post_type == predicted_post_type else "wrong"
            values = [original_post_type, story_score, ask_score, show_score, poll_score, predicted_post_type, output]

            baseline_result.write(create_line(line_no, post.Title, values))

    print("Testing Dataset Completed..!!!")
    # Testing dataset - End

# Marker printed after test_dataset has been defined.
print("Done")

Done


In [None]:
# Interactive driver: shows the experiment menu until the user enters 0.
# Each experiment starts from a fresh HackerNews object, sets the two
# module-level experiment flags, trains a model and (except option 1)
# evaluates it on the 2019 posts.
print("Welcome..!!!")

DELTA = 0.5  # smoothing value passed to create_training_model()
choice = -1
remove_words = get_remove_words()
is_stopword_exp = is_wordlength_exp = False

while (choice != 0):
    print("\n1. Create Training Model")
    print("2. Basline Experiment")
    print("3. Stop-word Filtering Experiment")
    print("4. Word Length Filtering Experiment")
    print("5. Infrequent Word Filtering Experiment")
    print("6. Smoothing Experiment")
    print("0. Exit")
    try:
        choice = int(input("\nEnter your choice: "))
    except ValueError:
        # Non-numeric input: show the menu again instead of crashing.
        print("\nInvalid choice. Please enter a number from the menu.")
        continue

    if choice == 0:
        print("\nThank You..!!!")
        break
    elif choice == 1:
        hackerNews = HackerNews()
        is_stopword_exp = is_wordlength_exp = False
        read_training_data(hackerNews)
        create_training_model(hackerNews, "model-2018.txt", DELTA)

    elif choice == 2:
        hackerNews = HackerNews()
        is_stopword_exp = is_wordlength_exp = False
        read_training_data(hackerNews)
        create_training_model(hackerNews, "model-2018.txt", DELTA)
        test_dataset(hackerNews, "baseline-result.txt")

    elif choice == 3:
        hackerNews = HackerNews()
        is_stopword_exp = True
        is_wordlength_exp = False
        read_training_data(hackerNews)
        stop_words = get_stop_words()
        create_training_model(hackerNews, "stopword-model.txt", DELTA)
        test_dataset(hackerNews, "stopword-result.txt")

    elif choice == 4:
        hackerNews = HackerNews()
        is_stopword_exp = False
        is_wordlength_exp = True
        read_training_data(hackerNews)
        create_training_model(hackerNews, "wordlength-model.txt", DELTA)
        # File name without the stray trailing dot, matching the other results.
        test_dataset(hackerNews, "wordlength-result.txt")

    elif choice == 5:
        # Only resets the flags; no experiment is run for this option.
        is_stopword_exp = is_wordlength_exp = False

    elif choice == 6:
        # Only resets the flags; no experiment is run for this option.
        is_stopword_exp = is_wordlength_exp = False

Welcome..!!!

1. Create Training Model
2. Basline Experiment
3. Stop-word Filtering Experiment
4. Word Length Filtering Experiment
5. Infrequent Word Filtering Experiment
6. Smoothing Experiment
0. Exit

Enter your choice: 3

Creating Vocabulary....
Vocabulary Created..!!!
Training Model Created..!!!
Testing Dataset Completed..!!!

1. Create Training Model
2. Basline Experiment
3. Stop-word Filtering Experiment
4. Word Length Filtering Experiment
5. Infrequent Word Filtering Experiment
6. Smoothing Experiment
0. Exit


In [11]:
# Scratch cell: prints a sample title, its lower-cased token bigrams, and the
# POS tag / lemma of one word, to inspect the NLTK helpers interactively.

# Longer sample text; it is replaced by the short title assigned right below.
sentence = """Following mice attacks MySQL 10% HN: on UAE ASK-HR Dr. Ph.D. sagar's $300 etc. caring farmers were marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""

# The sentence actually used by the rest of this cell.
sentence = "The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing"

print("\nOrignal Sentence: ")
print(sentence)

print("\nNew Sentence: ")
# With the lemmatization call disabled, the "new" sentence is the original one.
newSentence = sentence
# newSentence = get_lemmatized_words(sentence)
print(newSentence)
print("\nPunctuation",string.punctuation)

print("\n")
# Print every pair of adjacent tokens of the lower-cased sentence.
string_bigrams = bigrams(nltk.word_tokenize(sentence.lower()))
for gram in string_bigrams: 
    print(gram[0]+" "+gram[1])

print("\nSplit:",newSentence)
# Show the first letter of the NLTK tag, the mapped WordNet POS and the lemma
# of a single word.
word = "enjoyed"
print(nltk.pos_tag([word])[0][1][0].upper())
print(get_pos_tag(word))
print(lemmatizer.lemmatize(word, get_pos_tag(word)))
# print(get_lemmatized_words(word))
print(wordnet.NOUN.upper())

# Rounding check: keeps 5 decimals.
print(round(10.87348434, 5))



Orignal Sentence: 
The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing

New Sentence: 
The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing

Punctuation !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


the tech
tech that
that was
was fixed
fixed in
in 2018
2018 and
and the
the tech
tech that
that still
still needs
needs fixing

Split: The Tech That Was Fixed in 2018 and the Tech That Still Needs Fixing
V
v
enjoy
N
10.87348


In [28]:
# Load the entries of Stopwords.txt into the module-level stop_words list and
# print them one per line. The context manager closes the file even when
# reading fails.
with open("Stopwords.txt", "r") as stop_words_file:
    stop_words = stop_words_file.read().split()

for word in stop_words:
    print(word)

!
"
'
“
”
‘
’
#
$
%
&
(
)
*
+
,
-
–
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~
a
b
d
e
f
g
h
i
j
k
l
m
n
o
p
q
s
t
u
v
w
x
y
z
