In [1]:
""" This program uses the Naive Bayes Classifier to classify the validation set 
after training on the training set. """
import string
# Extract actual necessary words from the tweet
def extract_words(tweet_words):
    words = []
    alpha_lower = string.ascii_lowercase
    alpha_upper = string.ascii_uppercase
    numbers = [str(n) for n in range(10)]
    for word in tweet_words:
            cur_word = ''
            for c in word:
                if(c not in alpha_lower) and (c not in alpha_upper) and (c not in numbers):
                    if len(cur_word) >= 2:
                        words.append(cur_word.lower())
                    cur_word = ''
                    continue
                cur_word += c
            if len(cur_word) >= 2:
                words.append(cur_word.lower())
    return words

In [2]:
# Get Training Data from the input file
def get_training_data():
        f = open('E:/NB/training.txt', 'r')
        training_data = []
        for l in f.readlines():
                l = l.strip()
                tweet_details = l.split()
                tweet_id = tweet_details[0]
                tweet_label = tweet_details[1]
                tweet_words = extract_words(tweet_details[2:])
                training_data.append([tweet_id, tweet_label, tweet_words])

        f.close()

        return training_data


In [3]:
# Get Test Data from the input file
def get_test_data():
    f = open('E:/NB/test.txt', 'r')
    validation_data = []
    for l in f.readlines():
        l = l.strip()
        tweet_details = l.split(' ')
        tweet_id = tweet_details[0]
        tweet_words = extract_words(tweet_details[1:])
        validation_data.append([tweet_id, '', tweet_words])
    f.close()
    return validation_data

In [4]:
# Get list of words in the training data
def get_words(training_data):
    words = []
    for data in training_data:
        words.extend(data[2])
    return list(set(words))

In [5]:
# Get Probability of each word in the training data
# If label is specified, find the probability of each word in the corresponding labelled tweets only
def get_word_prob(training_data, label = None):
    words = get_words(training_data)
    freq = {}
    for word in words:
        freq[word] = 1
    total_count = 0
    for data in training_data:
        if data[1] == label or label == None:
            total_count += len(data[2])
            for word in data[2]:
                freq[word] += 1
    prob = {}
    for word in freq.keys():
        prob[word] = freq[word]*1.0/total_count
    return prob

In [6]:
# Get Probability of given label
def get_label_count(training_data, label):
    count = 0
    total_count = 0
    for data in training_data:
        total_count += 1
        if data[1] == label:
            count += 1
    return count*1.0/total_count


In [7]:
# Label the test data given the trained parameters Using Naive Bayes Model
def label_data(test_data, sports_word_prob, politics_word_prob, sports_prob, politics_prob):
    labels = []
    for data in test_data:
        data_prob_sports = sports_prob
        data_prob_politics = politics_prob

        for word in data[2]:
            if word in sports_word_prob:
                data_prob_sports *= sports_word_prob[word]
                data_prob_politics *= politics_word_prob[word]
            else:
                continue
        if data_prob_sports >= data_prob_politics:
            labels.append([data[0], 'Sports', data_prob_sports, data_prob_politics])
        else:
            labels.append([data[0], 'Politics', data_prob_sports, data_prob_politics])
    return labels


In [8]:
# Print the labelled test data
def print_labelled_data(labels):
    f_out = open('E:/NB/test_labelled.txt', 'w')
    for [tweet_id, label, prob_sports, prob_politics] in labels:
        f_out.write('%s %s\n' % (tweet_id, label))
    f_out.close()


In [9]:
# Get the training and test data
training_data = get_training_data()
test_data = get_test_data()


In [10]:
# Get the probabilities of each word overall and in the two labels
word_prob = get_word_prob(training_data)
sports_word_prob = get_word_prob(training_data, 'Sports')
politics_word_prob = get_word_prob(training_data, 'Politics')


In [11]:
# Get the probability of each label
sports_prob = get_label_count(training_data, 'Sports')
politics_prob = get_label_count(training_data, 'Politics')


In [12]:
# Normalise for stop words
for (word, prob) in word_prob.items():
    sports_word_prob[word] /= prob
    politics_word_prob[word] /= prob


In [13]:
# Label the test data and print it
test_labels = label_data(test_data, sports_word_prob, politics_word_prob, sports_prob, politics_prob)
print_labelled_data(test_labels)

In [14]:
print_labelled_data(test_labels)

In [15]:
test_labels 

[['301733794770190336', 'Politics', 5.886031896617191e-05, 145.53504552225718],
 ['301576909517619200', 'Politics', 1.5155765912253117e-07, 5.742245943730253],
 ['305052368305790976', 'Sports', 1.0615393100517754, 0.13425600676504174],
 ['293345092176064513',
  'Politics',
  2.4222144096209658e-05,
  19.191958766754308],
 ['305057161682227200', 'Sports', 1.8217052115735377, 0.19447646368689678],
 ['286543227178328066', 'Politics', 0.3106393456692338, 0.5351780501960383],
 ['306798825249583104', 'Sports', 5.452066619279484, 0.0019592880071467415],
 ['306552585316339712', 'Sports', 55.99884950440666, 0.0009161901442120648],
 ['305905054609702912', 'Sports', 0.9378040927927889, 0.673514635519341],
 ['305717928572108800', 'Sports', 23.799023705643272, 0.011487551670786843],
 ['305897838569730048', 'Sports', 3.804115069055057, 0.07631752595623936],
 ['301037462048874496', 'Sports', 1.7651001367842334, 9.158586466599365e-06],
 ['301691175616774144', 'Politics', 2.3969135063634025e-06, 56.594