In [129]:
# -------------------------------------------------------
# Project 2
# Sagar Shivaji Vetal - 40071979
# Himanshu Kohli - 40070839
# For COMP 6721 Section FJ – Fall 2019
# -------------------------------------------------------

import pandas as pd
import nltk
import string
import collections
import math
import numpy as np
import matplotlib.pyplot as plt
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk import bigrams
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report

# Init Lemmatizer
# Shared WordNet lemmatizer used by get_lemmatized_words and create_vocabulary.
lemmatizer = WordNetLemmatizer()

# Experiment switches read while tokenizing titles and building the vocabulary.
# The menu script at the bottom of the file reassigns them before each run.
is_stopword_exp = False
is_wordlength_exp = False
is_infrequentword_exp = False
# Word/symbol filter lists; they start empty here and are loaded from text
# files by the menu script (get_remove_words, get_remove_symbols, get_stop_words).
remove_words = []
remove_symbols = []
stop_words = []
# Data collected across experiment runs for plot_performance:
# x_values holds the x-axis labels, accuracy_list the accuracy of each run.
x_values = []
accuracy_list = []


class HackerNews:
    """
    Container for the state shared by the training and testing steps.

    It stores the post type constants, the posts grouped by post type,
    the complete vocabulary, one vocabulary per post type, the training
    model and the probability of each post type.
    """

    def __init__(self):
        """
        Set every data member to its empty/zero default.
        """
        # Post type labels as they appear in the data set.
        self.STORY, self.ASK_HN, self.SHOW_HN, self.POLL = "story", "ask_hn", "show_hn", "poll"

        # Number of posts and the posts of each type.
        self.total_post = 0
        self.story_posts, self.ask_posts, self.show_posts, self.poll_posts = [], [], [], []

        # Word -> frequency maps: overall and per post type.
        self.vocabulary = {}
        self.story_post_vocabulary = {}
        self.ask_post_vocabulary = {}
        self.show_post_vocabulary = {}
        self.poll_post_vocabulary = {}

        # Word -> list of values produced while training.
        self.training_model = {}

        # Probability of each post type.
        self.story_probability = 0.0
        self.ask_probability = 0.0
        self.show_probability = 0.0
        self.poll_probability = 0.0



def get_pos_tag(word):
    """
    Map the POS tag of a single word to the matching WordNet constant.

    Parameters
    ----------
    word : string
        Word of type string

    Returns
    -------
    pos_tag : string
        wordnet.ADJ, wordnet.VERB or wordnet.ADV when the tag starts with
        J, V or R respectively; wordnet.NOUN for every other tag.
    """
    # Only the first letter of the tag returned by nltk.pos_tag is used.
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both map to noun.
    return wordnet.NOUN


def clear_symbols(word):
    """
    This method cleans the given word by removing special characters
    from both of its ends.

    The post type markers "ask-hn", "ask_hn", "show-hn" and "show_hn" are
    returned untouched. The characters to strip come from the module-level
    ``remove_symbols`` list.

    Parameters
    ----------
    word : string
        Word of type string

    Returns
    -------
    word : string
        It returns cleaned word after removing leading/trailing special
        characters and surrounding whitespace.
    """
    post_types = ("ask-hn", "ask_hn", "show-hn", "show_hn")
    if word in post_types:
        return word

    # str.strip treats its argument as a set of characters. Stripping the
    # union of all symbols in one pass removes any mix of them; stripping one
    # symbol at a time depends on list order and leaves symbols behind
    # (e.g. '("hello")' would end up as '"hello"').
    symbol_chars = "".join(remove_symbols)
    return word.strip(symbol_chars).strip()


def get_words_and_frequncy(sentence):
    """
    This method reads the sentence, tokenizes it, removes special characters
    and unwanted words, and builds a vocabulary for that sentence only.

    Filtering depends on the module-level settings: ``remove_words`` and
    ``remove_symbols`` are always applied, ``stop_words`` only when
    ``is_stopword_exp`` is set, and words of length <= 2 or >= 9 are dropped
    when ``is_wordlength_exp`` is set.

    Two adjacent kept words that are both tagged as nouns are also stored as
    a bi-gram ("first second"); the frequency of each member word is reduced
    by one in return.

    Parameters
    ----------
    sentence : string
        It is a post title from training/testing data

    Returns
    -------
    local_vocabulary : dictionary
        It returns the dictionary of vocabulary only for given sentence.
    """
    local_vocabulary = dict()
    words = []

    for token in nltk.word_tokenize(sentence.lower()):
        word = clear_symbols(token)
        if not word:
            continue
        if word in remove_words or word in remove_symbols:
            continue
        if is_stopword_exp and word in stop_words:
            continue
        if is_wordlength_exp and (len(word) <= 2 or len(word) >= 9):
            continue
        add_to_vocabulary(word, local_vocabulary, 1)
        words.append(word)

    # A bi-gram needs at least two words.
    if len(words) < 2:
        return local_vocabulary

    # Tag every word once; each inner word takes part in two bi-grams and
    # would otherwise be tagged twice.
    noun_flags = [nltk.pos_tag([word])[0][1][0].upper() == "N" for word in words]
    tagged = list(zip(words, noun_flags))

    # NOTE: bi-grams are not length-filtered. Their member words already
    # passed the word-length filter above, and the previous bi-gram check
    # (length <= 2 and >= 9) could never be true.
    for (first, first_is_noun), (second, second_is_noun) in zip(tagged, tagged[1:]):
        if first_is_noun and second_is_noun:
            add_to_vocabulary(first + " " + second, local_vocabulary, 1)
            reduce_frequency(first, local_vocabulary)
            reduce_frequency(second, local_vocabulary)

    return local_vocabulary


def add_to_vocabulary(word, vocabulary, frequency):
    """
    Add ``frequency`` occurrences of ``word`` to ``vocabulary`` in place.

    Parameters
    ----------
    word : string
        It is a word extracted from post title.
    vocabulary : dictionary
        Word -> frequency mapping that is updated.
    frequency : integer
        It is a number of occurrences of given word
    """
    # A word that is not present yet starts from zero.
    vocabulary[word] = vocabulary.get(word, 0) + frequency


def reduce_frequency(word, vocabulary):
    """
    Lower the frequency of ``word`` in ``vocabulary`` by one, dropping the
    entry once its frequency reaches 0. Unknown words are ignored.

    Parameters
    ----------
    word : string
        It is a word extracted from post title.
    vocabulary : dictionary
        Word -> frequency mapping that is updated in place.
    """
    if word not in vocabulary:
        return

    remaining = vocabulary[word] - 1
    if remaining == 0:
        del vocabulary[word]
    else:
        vocabulary[word] = remaining


def get_lemmatized_words(words):
    """
    Lemmatize every word of ``words``, using the POS tag of each word.

    Parameters
    ----------
    words : string array
        It is a list of words extracted from post title.

    Returns
    -------
    lemmatized_words : string array
        The lemmatized words, in the same order as the input.
    """
    return [lemmatizer.lemmatize(word, get_pos_tag(word)) for word in words]


def create_vocabulary(posts, vocabulary, post_type_vocabulary):
    """
    Add the words (and bi-grams) of every post title to both vocabularies.

    Each title is turned into a per-title vocabulary, every entry of which is
    lemmatized and then counted in the complete vocabulary as well as in the
    vocabulary of the post type.

    Parameters
    ----------
    posts : DataFrame
        Posts extracted from training data; rows are read with ``iterrows``
        and must expose a ``Title`` field.
    vocabulary : dictionary
        It is a complete vocabulary created from training data.
    post_type_vocabulary : dictionary
        It is a vocabulary created from training data for specific type of post.
    """
    for _, post in posts.iterrows():
        title_vocabulary = get_words_and_frequncy(post.Title)
        for entry, occurrences in title_vocabulary.items():
            lemma = lemmatizer.lemmatize(entry, get_pos_tag(entry))
            for target in (vocabulary, post_type_vocabulary):
                add_to_vocabulary(lemma, target, occurrences)


def calculate_conditional_prob(values, word, post_type_vocab, post_type_total_words, vocabulary_size, delta):
    """
    This method calculates the smoothed conditional probability of given word
    for given post type and appends two items to ``values``: the frequency of
    the word in the post type, then the log10 of its conditional probability
    (rounded to 10 decimals).

    Parameters
    ----------
    values : float array
        Output list; the frequency and the log-probability are appended to it.
    word : string
        It is a word of which conditional probability is calculated.
    post_type_vocab : dictionary
        It is a vocabulary created from training data for specific type of post.
    post_type_total_words : integer
        Total number of words in given post type vocabulary.
    vocabulary_size : integer
        Total number of words in vocabulary.
    delta : float
        It is a smoothing value used while calculating conditional probability.
    """
    word_count = post_type_vocab.get(word, 0)
    denominator = post_type_total_words + (vocabulary_size * delta)
    # An empty post type with delta 0 has a zero denominator; treat the word
    # as impossible for that type instead of raising ZeroDivisionError.
    probability = (word_count + delta) / denominator if denominator else 0

    values.append(word_count)
    if probability > 0:
        log_probability = round(math.log10(probability), 10)
    else:
        # log10(0) is -infinity. Storing 0 here would mean "probability 1"
        # and would favour post types that never contained the word.
        log_probability = float("-inf")
    values.append(log_probability)


def create_line(line_no, title, values):
    """
    Build one output-file line: the line number, the title and every value,
    separated by two spaces and terminated by a newline.

    Parameters
    ----------
    line_no : integer
        It is a line number in output file.
    title : string
        It is post title.
    values : array
        Values written after the title; each one is converted with ``str``.

    Returns
    -------
    line : string
        A line created using given parameters to write into output file.
    """
    fields = [str(line_no), title]
    fields.extend(str(value) for value in values)
    return "  ".join(fields) + "\n"


def calculate_score(words, training_model, post_type_probability, index):
    """
    Score a title for one post type: the log10 of the post type probability
    (rounded to 10 decimals) plus, for every word found in the training
    model, the model value stored at ``index``.

    Parameters
    ----------
    words : string array
        It is a list of words extracted from post title of testing data.
    training_model : dictionary
        It is a training model created for classification.
    post_type_probability : float
        It is probability of given post type.
    index : integer
        Position of the wanted value inside each training model entry.

    Returns
    -------
    post_type_score : float
        A score of title for given post type.
    """
    score = round(math.log10(post_type_probability), 10)
    for word in words:
        entry = training_model.get(word)
        # Words unknown to the model do not change the score.
        if entry is None:
            continue
        score += entry[index]
    return score


def predict_post_type(story_score, ask_score, show_score, poll_score):
    """
    Pick the post type with the highest score. On a tie the first of
    story, ask_hn, show_hn, poll wins.

    The label is read from the module-level ``hackerNews`` object.

    Parameters
    ----------
    story_score : float
        A score of title for story post type.
    ask_score : float
        A score of title for ask_hn post type.
    show_score : float
        A score of title for show_hn post type.
    poll_score : float
        A score of title for poll post type.

    Returns
    -------
    post_type : string
        A post type predicted for a post title.
    """
    # Attribute names in the same order as the scores below.
    label_attributes = ("STORY", "ASK_HN", "SHOW_HN", "POLL")
    scores = [story_score, ask_score, show_score, poll_score]
    winner = scores.index(max(scores))
    return getattr(hackerNews, label_attributes[winner])


def get_stop_words():
    """
    This method reads "Stopwords.txt" from the current working directory
    and returns the list of stopwords.

    Returns
    -------
    stop_word_list : string array
        A list of stopwords (the file content split on whitespace).
    """
    # The context manager closes the file even if reading fails.
    with open("Stopwords.txt", "r", encoding="ISO-8859-1") as stop_words_file:
        return stop_words_file.read().split()


def get_remove_words():
    """
    This method reads "remove_words.txt" from the current working directory
    and returns the list of removewords.

    Returns
    -------
    remove_word_list : string array
        A list of removewords (the file content split on whitespace).
    """
    # The context manager closes the file even if reading fails.
    with open("remove_words.txt", "r", encoding="ISO-8859-1") as remove_words_file:
        return remove_words_file.read().split()


def get_remove_symbols():
    """
    This method reads "remove_symbols.txt" from the current working directory
    and returns the list of special characters to be removed.

    Returns
    -------
    remove_symbols_list : string array
        A list of special characters to be removed (the file content split
        on whitespace).
    """
    # The context manager closes the file even if reading fails.
    with open("remove_symbols.txt", "r", encoding="ISO-8859-1") as remove_symbols_file:
        return remove_symbols_file.read().split()


def get_binary_list(post_type_list, post_type):
    """
    Turn a list of post types into a one-vs-rest list of 0/1 flags.

    Parameters
    ----------
    post_type_list : string array
        It is a list of post type of posts.
    post_type : string
        The post type that maps to 1; every other type maps to 0.

    Returns
    -------
    binary_list : integer array
        A list of binary values for given list of post type.
    """
    return [1 if post_type == current else 0 for current in post_type_list]


def calculate_performance(test_post_types, predicted_post_types, post_type):
    """
    This method calculates the one-vs-rest performance of classification
    for given post type.

    Parameters
    ----------
    test_post_types : string array
        It is a list of actual post type of posts.
    predicted_post_types : string array
        It is a list of predicted post type of posts.
    post_type : string
        It is a type of post.

    Returns
    -------
    metrics : dictionary
        The confusion matrix ("confusion_matrix") together with "accuracy",
        "precision", "recall" and "f1_measure". A metric whose denominator
        is zero is reported as 0.0.
    """
    test_set = get_binary_list(test_post_types, post_type)
    prediction_set = get_binary_list(predicted_post_types, post_type)
    # Pin both labels so the matrix is always 2x2. Without them a post type
    # that appears in neither list yields a 1x1 matrix and cm[1][1] fails.
    cm = confusion_matrix(test_set, prediction_set, labels=[0, 1])
    TN, FP = int(cm[0][0]), int(cm[0][1])
    FN, TP = int(cm[1][0]), int(cm[1][1])

    total = TP + FP + FN + TN
    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return {
        "confusion_matrix": cm,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_measure": f1_measure,
    }


def check_performance(hackerNews, test_post_types, predicted_post_types):
    """
    Evaluate a classification run: compute the per post type performance,
    print the overall confusion matrix, accuracy and classification report,
    and record the accuracy in the module-level ``accuracy_list``.

    Parameters
    ----------
    hackerNews : HackerNews object
        An object of HackerNews class.
    test_post_types : string array
        It is a list of actual post type of posts.
    predicted_post_types : string array
        It is a list of predicted post type of posts.
    """
    post_types = [hackerNews.STORY, hackerNews.ASK_HN, hackerNews.SHOW_HN, hackerNews.POLL]
    for post_type in post_types:
        calculate_performance(test_post_types, predicted_post_types, post_type)

    print("\n===============================================")
    # Rows/columns follow the story, ask_hn, show_hn, poll order.
    matrix = confusion_matrix(test_post_types, predicted_post_types, labels=post_types)
    print("Confusion Matrix: ")
    print(matrix)

    experiment_accuracy = accuracy_score(test_post_types, predicted_post_types)
    accuracy_list.append(experiment_accuracy)
    print("Accuracy of experiment: ", experiment_accuracy)

    print("Report: ")
    print(classification_report(test_post_types, predicted_post_types))
    
    
def plot_performance(x_values, y_values, x_title):
    """
    This method plots the performance of an experiment using x and y values
    and shows the figure.
    
    Parameters
    ----------
    x_values : array
        A list of the number of words left in vocabulary, or a list of delta values.
    y_values : array
        A list of accuracy values of the experiment, one per x value.
    x_title : string
        It is a title of x axis.
    """
    plt.plot(x_values, y_values) 
    plt.xlabel(x_title) 
    # The y axis always shows accuracy.
    plt.ylabel("Accuracy") 
    plt.show() 



In [130]:
def read_training_data(hackerNews):
    """
    Load the 2018 posts from 'hn2018_2019.csv' and store them on
    ``hackerNews`` grouped by post type.

    ``total_post`` is set to the number of 2018 rows; each post type keeps
    at most its first 500 posts.

    Parameters
    ----------
    hackerNews : HackerNews object
        An object of HackerNews class.
    """
    frame = pd.read_csv('hn2018_2019.csv', delimiter=',', encoding='utf-8')

    # Keep only the rows created during 2018 (both bounds inclusive).
    created_at = frame["Created At"]
    from_start = created_at >= "2018-01-01 00:00:00"
    until_end = created_at <= "2018-12-31 23:59:59"
    posts_2018 = frame[from_start & until_end]

    hackerNews.total_post = posts_2018.shape[0]

    posts_by_type = posts_2018.groupby("Post Type")
    posts_per_type = 500
    hackerNews.story_posts = posts_by_type.get_group(hackerNews.STORY).head(posts_per_type)
    hackerNews.ask_posts = posts_by_type.get_group(hackerNews.ASK_HN).head(posts_per_type)
    hackerNews.show_posts = posts_by_type.get_group(hackerNews.SHOW_HN).head(posts_per_type)
    hackerNews.poll_posts = posts_by_type.get_group(hackerNews.POLL).head(posts_per_type)


In [131]:
def build_vocabulary(hackerNews):
    """
    This method creates the complete vocabulary and writes it into
    "vocabulary.txt" (one word per line, sorted alphabetically).
    Also, it creates post type wise vocabulary.

    When ``is_infrequentword_exp`` is set, words are filtered out using the
    module-level ``threshold``: with ``is_threshold_percent`` the most
    frequent ``threshold`` percent of the words are removed, otherwise every
    word whose frequency is <= ``threshold`` is removed. Removed words are
    dropped from the post type vocabularies as well.

    Parameters
    ----------
    hackerNews : HackerNews object
        An object of HackerNews class.
    """
    print("\nCreating Vocabulary....")
    post_type_vocabularies = (
        hackerNews.story_post_vocabulary,
        hackerNews.ask_post_vocabulary,
        hackerNews.show_post_vocabulary,
        hackerNews.poll_post_vocabulary,
    )
    hackerNews.vocabulary.clear()
    for post_type_vocabulary in post_type_vocabularies:
        post_type_vocabulary.clear()

    create_vocabulary(hackerNews.story_posts, hackerNews.vocabulary, hackerNews.story_post_vocabulary)
    create_vocabulary(hackerNews.ask_posts, hackerNews.vocabulary, hackerNews.ask_post_vocabulary)
    create_vocabulary(hackerNews.show_posts, hackerNews.vocabulary, hackerNews.show_post_vocabulary)
    create_vocabulary(hackerNews.poll_posts, hackerNews.vocabulary, hackerNews.poll_post_vocabulary)

    words_tobe_removed = []
    if is_infrequentword_exp and is_threshold_percent:
        # Most frequent words first; the leading slice is the top percentage.
        ranked = sorted(hackerNews.vocabulary.items(), key=lambda kv: kv[1], reverse=True)
        no_of_words_to_remove = int(np.ceil(len(ranked) * (threshold / 100)))
        words_tobe_removed = [word for word, _ in ranked[:no_of_words_to_remove]]
    elif is_infrequentword_exp:
        words_tobe_removed = [
            word for word, count in hackerNews.vocabulary.items() if count <= threshold
        ]

    for word in words_tobe_removed:
        del hackerNews.vocabulary[word]
        # Keep the post type vocabularies in sync: their totals are used as
        # denominators of the conditional probabilities and must only count
        # words that are still part of the vocabulary.
        for post_type_vocabulary in post_type_vocabularies:
            post_type_vocabulary.pop(word, None)

    # Sort vocabulary alphabetically
    hackerNews.vocabulary = collections.OrderedDict(
        sorted(hackerNews.vocabulary.items(), key=lambda kv: kv[0])
    )

    # The context manager closes the file even if writing fails.
    with open("vocabulary.txt", "w", encoding="utf-8") as vocabulary_file:
        vocabulary_file.writelines(word + "\n" for word in hackerNews.vocabulary)

    print("Vocabulary Created..!!!")


def create_training_model(hackerNews, model_file_name, delta):
    """
    This method creates the training model and writes it into a file.

    For every vocabulary word the model stores a list holding, for the
    story, ask_hn, show_hn and poll post types in that order, the frequency
    of the word followed by its conditional log-probability. The post type
    probabilities of ``hackerNews`` are updated as well.

    Parameters
    ----------
    hackerNews : HackerNews object
        An object of HackerNews class.
    model_file_name : string
        A name of output file of training model.
    delta : float
        It is a smoothing value used while calculating conditional probability.
    """
    print("\nCreating Training Model....")
    # Order matters: test_dataset reads the log-probabilities by position.
    post_type_vocabularies = (
        hackerNews.story_post_vocabulary,
        hackerNews.ask_post_vocabulary,
        hackerNews.show_post_vocabulary,
        hackerNews.poll_post_vocabulary,
    )
    post_type_totals = [sum(vocabulary.values()) for vocabulary in post_type_vocabularies]
    vocabulary_size = len(hackerNews.vocabulary)

    hackerNews.story_probability = hackerNews.story_posts.shape[0] / hackerNews.total_post
    hackerNews.ask_probability = hackerNews.ask_posts.shape[0] / hackerNews.total_post
    hackerNews.show_probability = hackerNews.show_posts.shape[0] / hackerNews.total_post
    hackerNews.poll_probability = hackerNews.poll_posts.shape[0] / hackerNews.total_post

    # The context manager closes the model file even if a write fails.
    with open(model_file_name, "w", encoding="utf-8") as model_file:
        hackerNews.training_model.clear()

        for line_no, word in enumerate(hackerNews.vocabulary, start=1):
            values = []
            for post_type_vocabulary, post_type_total in zip(post_type_vocabularies, post_type_totals):
                calculate_conditional_prob(
                    values, word, post_type_vocabulary, post_type_total, vocabulary_size, delta
                )
            hackerNews.training_model[word] = values

            model_file.write(create_line(line_no, word, values))

    print("Training Model Created..!!!")


In [132]:
def test_dataset(hackerNews, result_file_name):
    """
    This method classifies the 2019 posts of 'hn2018_2019.csv' (at most the
    first 1000 posts of each post type) using the training model and writes
    one result line per post into a file.

    Parameters
    ----------
    hackerNews : HackerNews object
        An object of HackerNews class.
    result_file_name : string
        A name of output file of testing.

    Returns
    -------
    test_post_types : string array
        The actual post type of every tested post.
    predicted_post_types : string array
        The predicted post type of every tested post, in the same order.
    """
    print("\nTesting Dataset using training model....")
    csvdf = pd.read_csv('hn2018_2019.csv', delimiter=',', encoding='utf-8')
    data_2019 = csvdf[(csvdf["Created At"] >= "2019-01-01 00:00:00") & (csvdf["Created At"] <= "2019-12-31 23:59:59")]
    data_2019 = data_2019.groupby("Post Type").head(1000)

    test_post_types = []
    predicted_post_types = []

    # The context manager closes the result file even if classification fails.
    with open(result_file_name, "w", encoding="utf-8") as baseline_result:
        for line_no, (_, post) in enumerate(data_2019.iterrows(), start=1):
            words = get_words_and_frequncy(post.Title)
            lemmatized_words = get_lemmatized_words(words.keys())

            # Model entries alternate frequency and log-probability per post
            # type, so the log-probabilities sit at the odd positions.
            story_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.story_probability, 1)
            ask_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.ask_probability, 3)
            show_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.show_probability, 5)
            poll_score = calculate_score(lemmatized_words, hackerNews.training_model, hackerNews.poll_probability, 7)

            predicted_post_type = predict_post_type(story_score, ask_score, show_score, poll_score)
            original_post_type = post["Post Type"]
            output = "right" if original_post_type == predicted_post_type else "wrong"
            values = [original_post_type, story_score, ask_score, show_score, poll_score, predicted_post_type, output]

            baseline_result.write(create_line(line_no, post.Title, values))
            test_post_types.append(original_post_type)
            predicted_post_types.append(predicted_post_type)

    print("Testing Dataset Completed..!!!")
    return test_post_types, predicted_post_types



In [133]:
# Interactive menu: each choice configures the module-level experiment flags,
# rebuilds the vocabulary/model and, for the experiments, evaluates it.
print("Welcome..!!!")

choice = -1
remove_words = get_remove_words()
remove_symbols = get_remove_symbols()
is_stopword_exp = is_wordlength_exp = is_infrequentword_exp = False
threshold = 0
is_threshold_percent = False

while choice != 0:
    # Default smoothing value; only the smoothing experiment changes it.
    DELTA = 0.5
    accuracy_list.clear()
    x_values.clear()
    print("\n1. Create Training Model")
    print("2. Basline Experiment")
    print("3. Stop-word Filtering Experiment")
    print("4. Word Length Filtering Experiment")
    print("5. Infrequent Word Filtering Experiment")
    print("6. Smoothing Experiment")
    print("0. Exit")
    try:
        choice = int(input("\nEnter your choice: "))
    except ValueError:
        # Non-numeric input must not abort the session; show the menu again.
        print("\nInvalid choice, please enter a number from 0 to 6.")
        choice = -1
        continue

    if choice == 0:
        print("\nThank You..!!!")
        break
    elif choice == 1:
        hackerNews = HackerNews()
        # Reset every filter, including the infrequent-word one, so a
        # previously run experiment cannot leak into the plain model.
        is_stopword_exp = is_wordlength_exp = is_infrequentword_exp = False
        read_training_data(hackerNews)
        build_vocabulary(hackerNews)
        create_training_model(hackerNews, "model-2018.txt", DELTA)

    elif choice == 2:
        hackerNews = HackerNews()
        is_stopword_exp = is_wordlength_exp = is_infrequentword_exp = False
        read_training_data(hackerNews)
        build_vocabulary(hackerNews)
        create_training_model(hackerNews, "model-2018.txt", DELTA)
        test_post_types, predicted_post_types = test_dataset(hackerNews, "baseline-result.txt")
        check_performance(hackerNews, test_post_types, predicted_post_types)

    elif choice == 3:
        hackerNews = HackerNews()
        is_stopword_exp = True
        is_wordlength_exp = is_infrequentword_exp = False
        read_training_data(hackerNews)
        stop_words = get_stop_words()
        build_vocabulary(hackerNews)
        create_training_model(hackerNews, "stopword-model.txt", DELTA)
        test_post_types, predicted_post_types = test_dataset(hackerNews, "stopword-result.txt")
        check_performance(hackerNews, test_post_types, predicted_post_types)

    elif choice == 4:
        hackerNews = HackerNews()
        is_stopword_exp = is_infrequentword_exp = False
        is_wordlength_exp = True
        read_training_data(hackerNews)
        build_vocabulary(hackerNews)
        create_training_model(hackerNews, "wordlength-model.txt", DELTA)
        test_post_types, predicted_post_types = test_dataset(hackerNews, "wordlength-result.txt")
        check_performance(hackerNews, test_post_types, predicted_post_types)

    elif choice == 5:
        hackerNews = HackerNews()
        is_stopword_exp = is_wordlength_exp = False
        is_infrequentword_exp = True
        read_training_data(hackerNews)

        # First pass: absolute frequency thresholds.
        # Second pass: percentage of the most frequent words.
        threshold_runs = (
            (False, [1, 5, 10, 15, 20]),
            (True, [5, 10, 15, 20, 25]),
        )
        for is_threshold_percent, thresholds in threshold_runs:
            x_values.clear()
            accuracy_list.clear()
            for threshold in thresholds:
                build_vocabulary(hackerNews)
                create_training_model(hackerNews, "model-2018.txt", DELTA)
                test_post_types, predicted_post_types = test_dataset(hackerNews, "baseline-result.txt")
                check_performance(hackerNews, test_post_types, predicted_post_types)
                x_values.append(str(len(hackerNews.vocabulary)))

            plot_performance(x_values, accuracy_list, "Word in Vocabulary")

    elif choice == 6:
        hackerNews = HackerNews()
        is_stopword_exp = is_wordlength_exp = is_infrequentword_exp = False
        read_training_data(hackerNews)
        build_vocabulary(hackerNews)
        # Derive each delta from an integer step: repeatedly adding 0.1
        # accumulates float error (ten additions give 0.9999999999999999)
        # and produces labels such as "0.30000000000000004".
        for step in range(11):
            DELTA = step / 10
            x_values.append(str(DELTA))
            create_training_model(hackerNews, "model-2018.txt", DELTA)
            test_post_types, predicted_post_types = test_dataset(hackerNews, "baseline-result.txt")
            check_performance(hackerNews, test_post_types, predicted_post_types)
        plot_performance(x_values, accuracy_list, "Delta Value")

    else:
        print("\nInvalid choice, please enter a number from 0 to 6.")
        

Welcome..!!!

1. Create Training Model
2. Basline Experiment
3. Stop-word Filtering Experiment
4. Word Length Filtering Experiment
5. Infrequent Word Filtering Experiment
6. Smoothing Experiment
0. Exit

Enter your choice: 0

Thank You..!!!
