# Instruction
You can simply run the following cell.

Before running it, please make sure that this file is located in a directory that contains *train_set_y.csv* and *train_set_x.csv*.

Also, please set the variable *path_to_test_set_x* to the path of the test set CSV file.
By default, it is set to *test_set_x.csv*.

The final result file is named *test_set_y_final.csv*.

Note: we found that adjusting the encoding of the text does not improve the accuracy; it actually lowers it.
Thus, we use only the orthography-filtering preprocessing.

In [8]:
import math
import sys

import pandas as pd

def learn_y(train_set_xy):
    """Estimate the prior probability of every category.

    Args:
        train_set_xy: DataFrame with a 'Category' column holding the label
            of each training row.

    Returns:
        dict mapping each label that occurs in 'Category' to its relative
        frequency among the non-missing labels. Keys are inserted in
        ascending label order.
    """
    labels = train_set_xy['Category']
    # count() ignores missing labels, matching value_counts() which also
    # drops them by default.
    n = labels.count()
    # Take the label set from the data instead of assuming exactly the
    # labels 0..4: a missing label no longer raises KeyError and extra
    # labels are no longer dropped. Sorting keeps the 0, 1, 2, ... key order
    # that the fixed range produced.
    y_counts = labels.value_counts().sort_index()
    return {y: count / n for y, count in y_counts.items()}

def learn_xy(train_set_xy):
    """Estimate per-category character probabilities with add-one smoothing.

    Args:
        train_set_xy: DataFrame with 'Category' and 'Text' columns.

    Returns:
        dict mapping each category to a dict ``{char: probability}`` where
        ``probability = (occurrences + 1) / (total_chars + distinct_chars)``,
        both totals being taken over that category's texts only.
    """
    # category -> {character: number of occurrences in that category}
    tallies = {}
    for _, record in train_set_xy.iterrows():
        tally = tallies.setdefault(record['Category'], {})
        for symbol in str(record['Text']):
            tally[symbol] = tally.get(symbol, 0) + 1

    est_xy = {}
    for label, tally in tallies.items():
        # Total characters seen for the label plus the number of distinct
        # characters seen for it.
        denominator = sum(tally.values()) + len(tally)
        est_xy[label] = {
            symbol: (hits + 1) / denominator for symbol, hits in tally.items()
        }
    return est_xy

def classifier_multinomial_bayes_net(est_y, est_xy, test_set_x):
    """Predict a category for every row of *test_set_x*.

    Each row's text is lower-cased and stripped of spaces, then every
    category is scored with ``log P(y) + sum(freq(c) * log P(c | y))``.
    Scoring in log space is the fix: the raw product of probabilities
    underflows to 0.0 on long texts, which made every category tie and the
    first one win regardless of the evidence.

    Args:
        est_y: dict mapping category -> prior probability.
        est_xy: dict mapping category -> {character: probability}.
        test_set_x: DataFrame with a 'Text' column.

    Returns:
        DataFrame with columns 'Id' (the row's index label in *test_set_x*)
        and 'Category' (the best-scoring category, or -1 when *est_y* is
        empty).
    """
    negative_infinity = float('-inf')
    result = {}
    for index, row in test_set_x.iterrows():
        chars = str(row['Text']).lower().replace(" ", "")
        # compute frequencies
        c_frequencies = {}
        for c in chars:
            c_frequencies[c] = c_frequencies.get(c, 0) + 1

        # compute log-probabilities
        log_p_yx = {}
        for y, p_y in est_y.items():
            if p_y <= 0:
                # log(0) is undefined; a zero prior can never win.
                log_p_yx[y] = negative_infinity
                continue
            score = math.log(p_y)
            likelihoods = est_xy[y]
            for c, frequency in c_frequencies.items():
                # Characters never seen for this category are skipped, as
                # before (they contribute a neutral factor).
                if c in likelihoods:
                    p_c = likelihoods[c]
                    if p_c <= 0:
                        score = negative_infinity
                        break
                    score += frequency * math.log(p_c)
            log_p_yx[y] = score

        # max() returns the first maximal key, so ties still go to the
        # category that appears first in est_y.
        result[index] = max(log_p_yx, key=log_p_yx.get, default=-1)
    return pd.DataFrame(list(result.items()), columns=['Id', 'Category'])

# Maps each Unicode character listed below to the character whose code point
# equals the byte value it is paired with (all in the 0x80-0x9F range).
# Table taken from http://www.microsoft.com/typography/unicode/1252.htm
cp1252 = {
    chr(code_point): chr(byte_value)
    for code_point, byte_value in (
        (0x20AC, 0x80),  # EURO SIGN
        (0x201A, 0x82),  # SINGLE LOW-9 QUOTATION MARK
        (0x0192, 0x83),  # LATIN SMALL LETTER F WITH HOOK
        (0x201E, 0x84),  # DOUBLE LOW-9 QUOTATION MARK
        (0x2026, 0x85),  # HORIZONTAL ELLIPSIS
        (0x2020, 0x86),  # DAGGER
        (0x2021, 0x87),  # DOUBLE DAGGER
        (0x02C6, 0x88),  # MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x2030, 0x89),  # PER MILLE SIGN
        (0x0160, 0x8A),  # LATIN CAPITAL LETTER S WITH CARON
        (0x2039, 0x8B),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x0152, 0x8C),  # LATIN CAPITAL LIGATURE OE
        (0x017D, 0x8E),  # LATIN CAPITAL LETTER Z WITH CARON
        (0x2018, 0x91),  # LEFT SINGLE QUOTATION MARK
        (0x2019, 0x92),  # RIGHT SINGLE QUOTATION MARK
        (0x201C, 0x93),  # LEFT DOUBLE QUOTATION MARK
        (0x201D, 0x94),  # RIGHT DOUBLE QUOTATION MARK
        (0x2022, 0x95),  # BULLET
        (0x2013, 0x96),  # EN DASH
        (0x2014, 0x97),  # EM DASH
        (0x02DC, 0x98),  # SMALL TILDE
        (0x2122, 0x99),  # TRADE MARK SIGN
        (0x0161, 0x9A),  # LATIN SMALL LETTER S WITH CARON
        (0x203A, 0x9B),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x0153, 0x9C),  # LATIN SMALL LIGATURE OE
        (0x017E, 0x9E),  # LATIN SMALL LETTER Z WITH CARON
        (0x0178, 0x9F),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
    )
}

# Per-language sets of accented letters and punctuation marks, each built
# from a string holding one character per set member.
slovak = set("áäčďéíĺľňóôŕšťúýž")
german = set("ü»‚…“‹ö‘–ß—«ä›„")
spanish = set("ñáéíóúü¿¡«»“”‘’—–…")
french = set("àâçéèêëîïôœùûüÿ«“”—»–’…")
polish = set("ąćęłńóśźż„“‚‘»«―–…")
# Lower-case ASCII letters, a few punctuation marks and the decimal digits.
alphabet = set('abcdefghijklmnopqrstuvwxyz.,!?"0123456789')
# Every character accepted by the orthography filter.
orthographies = slovak | german | spanish | french | polish | alphabet

def is_valid_char(c):
    """Return True when *c* is a member of the module-level ``orthographies`` set."""
    accepted = orthographies
    return c in accepted

def doubledecode(s, as_unicode=True):
    """Decode *s* as UTF-8, map the ``cp1252`` characters back, and re-decode.

    NOTE(review): presumably meant to repair text that was UTF-8 encoded
    twice; nothing in this file calls it, so confirm before relying on it.

    Args:
        s: object with a ``decode`` method (e.g. ``bytes``) holding UTF-8 data.
        as_unicode: when true, return the result decoded as UTF-8 with
            undecodable bytes ignored; otherwise return the intermediate
            ``raw_unicode_escape`` encoded value.
    """
    text = s.decode('utf8')
    # Swap every character listed in cp1252 for its mapped replacement.
    for gremlin, replacement in cp1252.items():
        text = text.replace(gremlin, replacement)
    raw = text.encode('raw_unicode_escape')
    if not as_unicode:
        return raw
    return raw.decode('utf8', 'ignore')

def pre_process_train_set_of_only():
    """Load the training data and keep only orthography characters in 'Text'.

    Reads 'train_set_x.csv' and 'train_set_y.csv' from the current
    directory, joins them on 'Id', and replaces every 'Text' value with the
    subsequence of its characters accepted by ``is_valid_char``.

    Returns:
        DataFrame indexed by 'Id' with the filtered 'Text' column and the
        columns joined in from 'train_set_y.csv'.
    """
    train_set_y = pd.read_csv('train_set_y.csv')
    train_set_x = pd.read_csv('train_set_x.csv')
    train_set_xy = train_set_x.set_index('Id').join(train_set_y.set_index('Id'))
    # Assign the whole column at once: DataFrame.set_value was removed in
    # pandas 1.0, and positional assignment also copes with repeated Ids.
    # A missing cell becomes '' rather than the literal string "nan" that
    # str() would produce for it.
    train_set_xy['Text'] = [
        '' if pd.isnull(text) else ''.join(filter(is_valid_char, str(text)))
        for text in train_set_xy['Text']
    ]
    return train_set_xy

def pre_process_test_set_of_only(filepath):
    """Load a test set and keep only orthography characters in 'Text'.

    Args:
        filepath: path of a CSV file that has a 'Text' column.

    Returns:
        DataFrame read from *filepath* in which every 'Text' value has been
        replaced by the subsequence of its characters accepted by
        ``is_valid_char``.
    """
    test_set_x = pd.read_csv(filepath)
    # Assign the whole column at once: DataFrame.set_value was removed in
    # pandas 1.0. A missing cell becomes '' rather than the literal string
    # "nan" that str() would produce for it.
    test_set_x['Text'] = [
        '' if pd.isnull(text) else ''.join(filter(is_valid_char, str(text)))
        for text in test_set_x['Text']
    ]
    return test_set_x

# End-to-end run: load and filter the training data, fit the estimates,
# filter the test set, classify it and write the predictions to disk.
path_to_test_set_x = 'test_set_x.csv' # Give it a path to your test set file
# Training data with 'Text' reduced to orthography characters.
train_set_xy = pre_process_train_set_of_only()
# Category probabilities, then per-category character probabilities.
est_y = learn_y(train_set_xy)
est_xy = learn_xy(train_set_xy)
# The test set goes through the same character filter as the training set.
test_set_x = pre_process_test_set_of_only(path_to_test_set_x)
result = classifier_multinomial_bayes_net(est_y, est_xy, test_set_x)
# Write the 'Id'/'Category' predictions without the DataFrame index column.
result.to_csv('test_set_y_final.csv',index=False)