In [None]:
import time, random, re, pprint, string
import sys
import pandas as pd 
import nltk
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from itertools import islice, chain

In [103]:
# Training data: train_input.csv supplies the "conversation" text column that
# generate_features() reads; train_output.csv supplies the "category" labels.
train_input = pd.read_csv('train_input.csv')
train_output = pd.read_csv('train_output.csv')

In [104]:
# Every non-alphabetic character in the first 256 code points, except the
# space. textClean() deletes all of these from the text.
delchars = ''.join(
    c for c in map(chr, range(256)) if not c.isalpha() and c != ' '
)


def textClean(text):
    """Strip markup, @mentions and non-letters from one conversation string.

    Steps, in order:
      1. remove anything that looks like an HTML tag (``<...>``),
      2. remove ``@mention`` tokens together with the whitespace ending them,
      3. turn every whitespace character (newline, tab, ...) into a space,
      4. delete every character listed in ``delchars``.

    Parameters
    ----------
    text : str
        Raw conversation text.

    Returns
    -------
    str
        Text containing only letters and spaces.
    """
    # Raw strings: '\<', '\@' and '\s' are invalid escapes in a normal literal.
    output = re.sub(r'<.*?>', '', text)
    output = re.sub(r'@.*?\s', '', output)
    # Whitespace separates words. Replacing it with '' (or letting translate()
    # delete tabs) would glue the words on either side into one token.
    output = re.sub(r'\s', ' ', output)
    output = output.translate(str.maketrans('', '', delchars))
    return output

def removeStopwords(word_list):
    """Return the distinct words of ``word_list`` that are not English stopwords.

    The result comes from a set difference, so duplicate words are collapsed
    and the order of the returned list is arbitrary.
    """
    # The NLTK stopword file id is lowercase; 'English' only resolves on
    # case-insensitive filesystems.
    return list(set(word_list) - set(stopwords.words('english')))

# Counter keyed by every English stopword (each with count 100). The name and
# contents are kept for existing users; only the corpus file id is corrected
# to NLTK's lowercase spelling.
count_stops = Counter(stopwords.words("english")*100)

def removeStopwordsCount(words_list):
    """Count the words in ``words_list``, dropping every stopword.

    Subtracting ``count_stops`` only removed up to 100 occurrences of each
    stopword, so a stopword seen more often survived with a reduced count.
    Filtering on membership removes stopwords whatever their frequency.

    Returns
    -------
    Counter
        Non-stopword -> positive count.
    """
    return Counter({
        word: count
        for word, count in Counter(words_list).items()
        # `count > 0` mirrors Counter subtraction, which keeps positive counts only.
        if count > 0 and word not in count_stops
    })

def findNGrams(input_list, n, sep=''):
    """Return every run of ``n`` consecutive items of ``input_list`` as one string.

    Parameters
    ----------
    input_list : sliceable sequence of str
        Tokens in document order.
    n : int
        Size of each n-gram. ``n`` larger than the input (or ``n <= 0``)
        yields an empty list.
    sep : str, optional
        String placed between the tokens of an n-gram. The default ``''``
        keeps the historical output ("new" + "york" -> "newyork"). Pass e.g.
        ``'_'`` to keep n-grams distinguishable from single words and from
        each other.

    Returns
    -------
    list of str
    """
    # zip over the n progressively shifted views; it stops at the shortest one,
    # which is exactly the last position where a full n-gram fits.
    shifted = (input_list[offset:] for offset in range(n))
    return [sep.join(gram) for gram in zip(*shifted)]

# English stopwords plus the 26 individual lowercase letters.
# The NLTK stopword file id is lowercase; 'English' only resolves on
# case-insensitive filesystems.
stopword_set = set(stopwords.words("english") + list(string.ascii_lowercase))

def removeStopwordsList(word_list):
    """Return ``word_list`` without stopwords, keeping order and duplicates.

    Every entry of ``stopword_set`` is lowercase, while the cleaned text keeps
    its original casing. Matching is therefore done on the lowercased word so
    that "The" or "I" are removed as well. Surviving words are returned
    unchanged.
    """
    return [word for word in word_list if word.lower() not in stopword_set]

def _words_with_ngrams(text):
    """Split cleaned text into non-stopword tokens and append their 2- and 3-grams."""
    words = removeStopwordsList(text.split())
    return words + findNGrams(words, 2) + findNGrams(words, 3)


def generate_features(dataframe):
    """Add the ``text``, ``words`` and ``words_count`` columns to ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "conversation" column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place), with:
        ``text``        - the conversation after textClean(),
        ``words``       - stopword-free tokens followed by bigrams and trigrams,
        ``words_count`` - a Counter of ``words``.
    """
    dataframe["text"] = dataframe["conversation"].apply(textClean)
    # One pass builds the final token list instead of rewriting "words" three times.
    dataframe["words"] = dataframe["text"].apply(_words_with_ngrams)
    dataframe["words_count"] = dataframe["words"].apply(lambda words: Counter(words))
    return dataframe

In [105]:
# Build the text / words / words_count feature columns for the training set
# (cleaning, stopword removal and n-gram generation all happen in here).
train_input = generate_features(train_input)
# Attach the labels; the cells below read them from the "output" column.
train_input["output"] = train_output["category"]

Regex done! : 3.5700278282165527
Stopwords done! : 96.16917991638184
ngrams done! : 188.8405728340149


In [None]:
def mergeSum(list):
    """Add up all elements of a sequence with ``+`` using pairwise merging.

    The elements are combined as a balanced tree (left half + right half)
    rather than left to right. For list elements this keeps the amount of
    copying lower than a running ``total = total + item`` loop.

    Parameters
    ----------
    list : pandas.Series or any iterable
        Elements that support ``+`` with each other (the notebook passes a
        Series of token lists). The parameter name shadows the builtin and is
        kept only so existing keyword callers do not break.

    Returns
    -------
    The combined value, the single element itself when there is only one, or
    an empty list when the input is empty.
    """
    # Materialise once so plain lists work too and no pandas slices are created.
    items = list.tolist() if hasattr(list, "tolist") else [*list]
    if not items:
        # Previously this raised IndexError via .iloc[0].
        return []

    def _merge(lo, hi):
        """Combine items[lo:hi]; the range is never empty."""
        if hi - lo == 1:
            return items[lo]
        mid = lo + (hi - lo) // 2
        return _merge(lo, mid) + _merge(mid, hi)

    return _merge(0, len(items))

In [None]:
# Corpus-wide token counts over every training row. This cell used to read
# `total_counter` without any earlier cell defining it, so it raised NameError
# on a fresh kernel. Building it here makes the cell self-contained and
# idempotent.
total_counter = Counter(chain.from_iterable(train_input["words"]))
# Keep only tokens that occur more than once.
total_counter = Counter({k: v for k, v in total_counter.items() if v > 1})

In [107]:
# Per-category token counts: concatenate the token lists of every row that
# shares an "output" label, then count the tokens of each concatenation.
groups_counter = (
    train_input.groupby("output")["words"]
    .apply(lambda group_words: mergeSum(group_words))
    .apply(lambda merged_words: Counter(merged_words))
)

In [108]:
# Relative frequency of every vocabulary token over the whole training corpus.
total_wordcount = sum(total_counter.values())
total_word_freq = Counter(
    {word: count / total_wordcount for word, count in total_counter.items()}
)

In [109]:
# Relative token frequency inside each category: label -> Counter(token -> share).
group_word_freq = {}
groups_freq = groups_counter  # alias; both names refer to the same Series
for label in train_input["output"].unique():
    group_wordcount = sum(groups_freq[label].values())
    group_word_freq[label] = Counter(
        {word: count / group_wordcount for word, count in groups_freq[label].items()}
    )

In [110]:
def GetValDict(my_dict,key):
    """Look up ``key`` in ``my_dict``; a missing key yields 0 instead of an error."""
    return my_dict[key] if key in my_dict else 0
    
# Add-one smoothed score of every vocabulary token for each category:
# label -> Counter(token -> score).
group_word_laplace = {}
total_words = len(total_counter)  # vocabulary size

for label in train_input["output"].unique():
    temp_group = groups_counter[label]
    group_wordcount = sum(temp_group.values())
    # NOTE(review): the denominator adds the token's corpus-wide count to the
    # vocabulary size; group_wordcount is computed above but not used in this
    # cell. Confirm that this is the intended smoothing.
    group_word_laplace[label] = Counter({
        word: (GetValDict(temp_group, word) + 1) / (corpus_count + total_words)
        for word, corpus_count in total_counter.items()
    })

In [111]:
# Inverse document frequency of every vocabulary token:
# log(number of documents / number of documents containing the token).
words_doc_list = [list(doc_counter) for doc_counter in train_input["words_count"]]
words_doc_counter = Counter(chain.from_iterable(map(set, words_doc_list)))
num_docs = len(train_input)
total_words_idf = Counter(
    {word: np.log(num_docs / words_doc_counter[word]) for word in total_counter}
)

In [112]:
# IDF-weighted, add-one smoothed score of every vocabulary token per category:
# label -> Counter(token -> score). getCondIdf() reads this table.
group_word_idf = {}
total_words = len(total_counter)  # vocabulary size
for label in train_input["output"].unique():
    temp_group = groups_counter[label]
    group_wordcount = sum(temp_group.values())
    # The IDF weight is floored at 0.3 so very common tokens still contribute.
    # NOTE(review): the denominator uses the token's corpus-wide count, and
    # group_wordcount is not used here. predictClassIdf multiplies these raw
    # scores together, so confirm the scale before changing the formula.
    group_word_idf[label] = Counter({
        word: (temp_group[word] + 1) / (corpus_count + total_words) * max(total_words_idf[word], 0.3)
        for word, corpus_count in total_counter.items()
    })

In [113]:
# Inputs for the prediction functions below: the class labels and their prior
# probabilities (share of training rows carrying each label).
categories = train_input["output"].unique()
# The per-label row counts do not change inside the loop, so compute them once
# instead of re-running the groupby for every category.
category_sizes = train_input.groupby("output").size()
class_priors = {}
for category in categories:
    class_priors[category] = category_sizes[category] / len(train_input)

total_words = len(total_counter)

    
def getCondIdf(word, category):
    """Score of ``word`` for ``category`` taken from the global ``group_word_idf``.

    A word missing from that category's table falls back to
    ``1 / total_words``.
    """
    category_scores = group_word_idf[category]
    if word not in category_scores:
        # Unseen word: use the fallback rather than a lookup.
        return 1 / total_words
    return category_scores[word]
    
def predictClassIdf(word_counter):
    """Return the most likely category for one document.

    Parameters
    ----------
    word_counter : mapping of token -> count
        Token counts of the document (a row of the "words_count" column).

    Returns
    -------
    The entry of the global ``categories`` with the highest score, where
    score = log(prior) + sum(count * log(getCondIdf(token, category))).

    Notes
    -----
    Scores are accumulated in log space. Multiplying the raw per-token scores
    (even with the former ``* 1e5`` rescaling) underflows to 0 or overflows to
    inf on long documents, which collapses every class to the same value. The
    rescaling factor was identical for every category, so dropping it does not
    change which category wins.
    """
    classes_score = {}
    for category in categories:
        # Start from the prior class probability.
        score = np.log(class_priors[category])
        for k, v in word_counter.items():
            score += v * np.log(getCondIdf(k, category))
        classes_score[category] = score
    return max(classes_score, key = classes_score.get)

In [114]:
def predict(train_df, test_df, freq_cutoff):
    """Fit the IDF-weighted word model on ``train_df`` and classify ``test_df``.

    Everything is computed from the arguments. Earlier, the tables fitted here
    were discarded and the module-level model (fitted on all of
    ``train_input``) did the scoring, so ``freq_cutoff`` had no effect and
    held-out rows leaked into training.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs "words" (token list per row), "words_count" (Counter of those
        tokens) and "output" (class label), as produced by generate_features
        plus the label column.
    test_df : pandas.DataFrame
        Needs a "words_count" column.
    freq_cutoff : int
        Tokens whose corpus-wide count in ``train_df`` is not strictly greater
        than this value are dropped from the vocabulary.

    Returns
    -------
    pandas.Series
        Predicted label for each row of ``test_df``, with the same index.
    """
    labels = train_df["output"]
    categories = labels.unique()
    num_docs = len(train_df)  # was len(train_input): wrong for any subset

    # Vocabulary: corpus-wide token counts above the frequency cutoff.
    total_counter = Counter(chain.from_iterable(train_df["words"]))
    total_counter = Counter({k: v for k, v in total_counter.items() if v > freq_cutoff})
    total_words = len(total_counter)

    # Token counts inside each category.
    groups_counter = {
        label: Counter(chain.from_iterable(train_df.loc[labels == label, "words"]))
        for label in categories
    }

    # Inverse document frequency; iterating a Counter yields each distinct
    # token of that document once.
    words_doc_counter = Counter(chain.from_iterable(train_df["words_count"]))
    total_words_idf = {
        k: np.log(num_docs / words_doc_counter[k]) for k in total_counter
    }

    # log of the smoothed, IDF-weighted score per (category, token). The IDF
    # weight is floored at 0.3 so very common tokens still contribute.
    # NOTE(review): the denominator uses the token's corpus-wide count, which
    # is the notebook's existing formula - confirm it is the intended smoothing.
    group_log_weight = {}
    for label in categories:
        temp_group = groups_counter[label]
        group_log_weight[label] = {
            k: np.log((temp_group[k] + 1) / (v + total_words) * max(total_words_idf[k], 0.3))
            for k, v in total_counter.items()
        }

    label_sizes = labels.value_counts()
    log_priors = {label: np.log(label_sizes[label] / num_docs) for label in categories}

    def _classify(word_counter):
        """Pick the category with the highest log-score for one document."""
        scores = {}
        for label in categories:
            weights = group_log_weight[label]
            score = log_priors[label]
            for word, count in word_counter.items():
                # Out-of-vocabulary tokens would add the same constant to every
                # category, so skipping them cannot change the winner (and it
                # avoids dividing by an empty vocabulary).
                if word in weights:
                    score += count * weights[word]
            scores[label] = score
        return max(scores, key=scores.get)

    return test_df["words_count"].apply(_classify)

In [115]:
# Single hold-out evaluation: the first 80% of rows train the model, the
# remaining 20% are predicted with a frequency cutoff of 5.
k_stop = round(len(train_input) * 0.8)
train_data_k = train_input.iloc[:k_stop]
test_data_k = train_input.iloc[k_stop:]
true_test_k = test_data_k["output"]
my_prediction_k = predict(train_data_k, test_data_k, 5)

In [None]:
def k_cross(train_df, k, freq_cutoff):
    """Estimate accuracy with k-fold cross-validation.

    The rows of ``train_df`` are cut into ``k`` contiguous folds. Each fold is
    predicted once by a model fitted (via ``predict``) on the other folds.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled feature frame; needs whatever ``predict`` needs plus "output".
    k : int
        Number of folds, 1 <= k <= len(train_df).
    freq_cutoff : int
        Passed through to ``predict``.

    Returns
    -------
    list of float
        One accuracy per fold, in fold order.

    Raises
    ------
    ValueError
        If ``k`` is outside 1..len(train_df).
    """
    n_rows = len(train_df)
    if k < 1 or k > n_rows:
        raise ValueError(
            "k must be between 1 and the number of rows (%d), got %r" % (n_rows, k)
        )

    accuracies = []
    for i in range(k):
        # Integer boundaries cover every row exactly once and never produce an
        # empty fold. round(len / k) could skip trailing rows or run past the
        # end when len is not a multiple of k.
        fold_start = i * n_rows // k
        fold_end = (i + 1) * n_rows // k
        test_data_k = train_df.iloc[fold_start:fold_end]
        # Positional split, so duplicate index labels cannot drop training rows.
        train_data_k = pd.concat([train_df.iloc[:fold_start], train_df.iloc[fold_end:]])
        my_prediction_k = predict(train_data_k, test_data_k, freq_cutoff)
        accuracy_k = (my_prediction_k == test_data_k["output"]).mean()
        accuracies.append(accuracy_k)
    return accuracies

In [None]:
# Sweep the frequency cutoff (0 through 4) with 5-fold cross-validation on the
# first 100000 training rows: cutoff -> list of per-fold accuracies.
validated_accuracies = {}
for freq_hyper in range(5):
    validated_accuracies[freq_hyper] = k_cross(train_input.iloc[:100000], 5, freq_hyper)

In [None]:
# Hold-out accuracy: the fraction of held-out rows whose prediction equals the
# true label. Both Series come from the 80/20 split cell
# (my_prediction_k, true_test_k).
raw_accuracy = sum(my_prediction_k == true_test_k) / len(true_test_k)

In [None]:
# test_data_k is a row slice of train_input, so adding a column to it in place
# is a chained assignment (pandas emits SettingWithCopyWarning). Build a new
# frame that carries the extra column instead.
test_data_k = test_data_k.assign(predicted_out=my_prediction_k)

In [None]:
# Unlabelled test set; later cells read its "conversation" and "id" columns.
test_input = pd.read_csv('test_input.csv')

In [None]:
# Run the test set through the same feature pipeline as the training data
# (adds the text, words and words_count columns).
test_input = generate_features(test_input)

In [None]:
# Classify every test document from its token counts using the global model.
test_output_3gram1 = test_input["words_count"].apply(predictClassIdf)

In [None]:
# Submission file: one row per test document with its id and predicted category.
test_output = test_input[["id"]].assign(category=test_output_3gram1)
test_output.to_csv("prediction.csv", index=False)