In [1]:
import numpy as np 
import re
from collections import defaultdict, Counter
import math
import copy

In [12]:
def train_bayes(train_file):
    """Fit a naive-Bayes location model from a file of labelled tweets.

    Every non-blank line of ``train_file`` must start with a location label
    (its first whitespace-separated token) followed by the tweet text. The
    text is lower-cased and split on whitespace; no further tokenisation is
    applied. Blank or whitespace-only lines are ignored.

    Side effect: for each location, prints the (up to) five words whose
    occurrences are most concentrated in that location, considering only
    words seen at least five times in the whole training set. Locations are
    printed in the order they first appear in the file.

    Parameters
    ----------
    train_file : str
        Path of the training file.

    Returns
    -------
    loc_word_dict : dict[str, dict[str, float]]
        ``loc_word_dict[loc][word]`` is the fraction of ``loc``'s training
        tokens that are ``word``. A location whose tweets contain no words
        has no entry.
    p_L : collections.Counter
        Maps each location to the fraction of training tweets labelled
        with it.
    """
    # The context manager closes the handle even if parsing fails below.
    with open(train_file, 'r') as file:
        tweets = [re.sub('\n', '', tweet) for tweet in file.readlines()]

    # A blank line has no label token; skip it instead of failing on
    # ``split()[0]``.
    tweets = [tweet for tweet in tweets if tweet.split()]

    targets = [tweet.split()[0] for tweet in tweets]
    text = [tweet.lower().split()[1:] for tweet in tweets]

    # Single pass over the corpus fills both count tables.
    loc_word_counts = defaultdict(Counter)  # location -> word -> count
    word_loc_counts = defaultdict(Counter)  # word -> location -> count
    for loc, words in zip(targets, text):
        for word in words:
            loc_word_counts[loc][word] += 1
            word_loc_counts[word][loc] += 1

    # Class prior: share of tweets per location.
    p_L = Counter(targets)
    n_tweets = sum(p_L.values())
    for loc in p_L:
        p_L[loc] = p_L[loc] / n_tweets

    # Per-location word likelihoods, returned as plain dicts so that a lookup
    # of an unknown location by a caller never silently creates an entry.
    loc_word_dict = {}
    for loc, counts in loc_word_counts.items():
        n_tokens = sum(counts.values())
        loc_word_dict[loc] = {word: count / n_tokens
                              for word, count in counts.items()}

    # For every sufficiently frequent word, record (word, share, count) under
    # each location it occurs in. Words are visited in first-seen order, so
    # the stable sort below breaks ties the same way on every run.
    candidates = defaultdict(list)
    for word, counts in word_loc_counts.items():
        n_occurrences = sum(counts.values())
        if n_occurrences < 5:
            continue
        for loc, count in counts.items():
            candidates[loc].append((word, count / n_occurrences, count))

    # Iterate p_L (first-seen order) rather than set(targets): set order for
    # strings changes between interpreter runs, which made the printout
    # non-reproducible.
    top_words = {}
    for loc in p_L:
        ranked = sorted(candidates.get(loc, []),
                        key=lambda entry: (entry[1], entry[2]),
                        reverse=True)
        top_words[loc] = [entry[0] for entry in ranked[:5]]

    for loc, words in top_words.items():
        ws = ', '.join(words)
        print(f'The top 5 words for {loc} are: {ws}')

    return loc_word_dict, p_L

In [8]:
def read_test_file(test_file):
    """Read a labelled tweet file into parallel lists of labels and texts.

    Every non-blank line must start with a location label (its first
    whitespace-separated token) followed by the tweet text. Blank or
    whitespace-only lines are ignored.

    Parameters
    ----------
    test_file : str
        Path of the file to read.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(targets, text)`` where ``targets[i]`` is the label of the i-th
        tweet and ``text[i]`` is its remaining tokens re-joined with single
        spaces (original casing kept).
    """
    # The context manager closes the handle even if parsing fails below.
    with open(test_file, 'r') as file:
        tweets = [re.sub('\n', '', tweet) for tweet in file.readlines()]

    targets = []
    text = []
    for tweet in tweets:
        tokens = tweet.split()
        # A blank line has no label token; skip it instead of raising
        # IndexError on tokens[0].
        if not tokens:
            continue
        targets.append(tokens[0])
        text.append(' '.join(tokens[1:]))

    return (targets, text)

In [9]:

def test_one_target(test_tweet, target, loc_word_dict, p_L):
    tokenized_tweet = test_tweet.lower().split()
    score = math.log(p_L[target])
    for token in tokenized_tweet:
        if token in loc_word_dict[target].keys():
            score += math.log(loc_word_dict[target][token])
        else:
            score += math.log(1 / 100000)
    return (target, score)

In [10]:
def bayes_test(test_tweet, targets, loc_word_dict, p_L):
    """Return the candidate location with the highest naive-Bayes score.

    Each element of ``targets`` is scored with ``test_one_target``. The first
    candidate (in iteration order) that attains the strictly highest score
    wins. If ``targets`` is empty, or no score exceeds the very low initial
    threshold, the empty string is returned.
    """
    winner, winning_score = '', -1000000000000000
    for candidate in targets:
        label, log_score = test_one_target(test_tweet, candidate, loc_word_dict, p_L)
        # Only a strictly better score replaces the current winner, so ties
        # keep the earlier candidate.
        if not (log_score > winning_score):
            continue
        winner, winning_score = label, log_score
    return winner

In [13]:
# Fit the model on the training tweets. train_bayes returns the per-location
# word probabilities and the location priors, and as a side effect prints the
# top 5 words for each location.
loc_word_dict, p_L = train_bayes('tweets.train.clean.txt')

The top 5 words for Manhattan,_NY are: ny), #newyork,, (#newyork,, ny?, cleared:
The top 5 words for Atlanta,_GA are: #atlanta,, atlanta,, georgia, (#atlanta,, ga?
The top 5 words for Los_Angeles,_CA are: angeles,, #losangeles,, dodger, (#losangeles,, #dodgers
The top 5 words for Toronto,_Ontario are: toronto,, trucks), #toronto, #toronto,, b/w
The top 5 words for Philadelphia,_PA are: philadelphia,, #philadelphia,, pa), philadelphia, phillies
The top 5 words for Chicago,_IL are: chicago,, #chicago,, illinois, (#chicago,, il?
The top 5 words for Boston,_MA are: #boston,, ma), massachusetts, ma?, (#boston,
The top 5 words for Orlando,_FL are: #orlpol, #opd, #orlando,, fl, orlando,
The top 5 words for San_Francisco,_CA are: francisco,, #sanfrancisco,, (#sanfrancisco,, #sf, fran
The top 5 words for Washington,_DC are: washington,, #washington,, dc), d.c., (#washington,
The top 5 words for San_Diego,_CA are: diego,, (#sandiego,, petco, jolla, #seaworld
The top 5 words for Houston,_TX are: 

In [14]:
# Load the evaluation tweets as two parallel lists: the true location label of
# each tweet and its text with the label token removed.
test_targets, test_text = read_test_file('tweets.test1.clean.txt')

In [15]:
# The candidate labels are the same for every tweet, so build the set once
# instead of rebuilding it on each iteration.
# NOTE(review): candidates are taken from the test labels rather than from the
# locations seen in training -- confirm this is intended.
candidate_targets = set(test_targets)

correct = 0
total = 0
predictions = []  # (predicted label, true label, tweet text) per test tweet
for expected, tweet in zip(test_targets, test_text):
    prediction = bayes_test(tweet, candidate_targets, loc_word_dict, p_L)
    predictions.append((prediction, expected, tweet))
    total += 1
    if prediction == expected:
        correct += 1

# Accuracy = fraction of tweets whose predicted location matches the label.
# An empty test set yields 0.0 instead of a ZeroDivisionError.
score = correct / total if total else 0.0
print(score)

0.632
