In [2]:
# based on tutorial from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup


df = pd.read_csv('reddit-comment-classification-comp-551/reddit_train.csv')
df = df[pd.notnull(df['comments'])]
print(df.head(20))
print(df['comments'].apply(lambda x: len(x.split(' '))).sum())



df.subreddits.value_counts().plot(kind='bar');


    id                                           comments       subreddits
0    0  Honestly, Buffalo is the correct answer. I rem...           hockey
1    1  Ah yes way could have been :( remember when he...              nba
2    2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends
3    3  He wouldn't have been a bad signing if we woul...           soccer
4    4  Easy. You use the piss and dry technique. Let ...            funny
5    5  The joke is on YOU!\n\nI've only seen it twice...            funny
6    6  His role in MI3 is one of the best villians I'...           movies
7    7  Akagi is still Alpha as fuck and Sugawara is s...            anime
8    8  I think that they had each other's detonator. ...           movies
9    9  Right! He was a disruptor tank! Pull the dps o...        Overwatch
10  10  The flying the Eagles to Mordor thing is incre...           movies
11  11  "Oh man I can't wait to vote."\n\n*opens link*...            anime
12  12  omg i was thinkin

The classes are balanced, but the text needs cleaning. Here's some cleaning (we should customize **TODO: we should make something to remove links... that is words starting with `http://` at least, should be ignored**):

In [3]:
delimiters = re.compile('[/(){}\[\]\|@,;]')
ignored_symbols = re.compile('[^0-9a-z #+_]')
# nltk.download('stopwords')
stopwords = set(stopwords.words('english'))


def print_plot(index):
    example = df[df.index == index][['comments', 'subreddits']].values[0]
    if len(example) > 0:
        print(example[0])
        print('subreddit:', example[1])


def clean_text(text):
    """
        text: a string (one comment)
        return: modified string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = delimiters.sub(' ', text) # replace delimiters symbols by space in text
    text = ignored_symbols.sub('', text) # delete symbols which are in ignored_symbols from text
    text = ' '.join(word for word in text.split() if word not in stopwords) # delete stopwords from text
    return text
    
df['comments'] = df['comments'].apply(clean_text)


Now to lemmatize the corpus.  This might not help in the short term, but will be useful to play with. 


In [4]:
from nltk.tokenize import word_tokenize

from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

def lemmatize_sentence(sen):
    """ lemmatizes every word in space separated sentence sen""" 
    token_list = word_tokenize(sen)
    lemma_sen = []
    for w in token_list:
        lemma_sen.append(stemmer.stem(w))
    return " ".join(lemma_sen)

df['comments'] = df['comments'].apply(lambda x: lemmatize_sentence(x))

# print_plot(1234)
print(df.head(10))

   id                                           comments       subreddits
0   0  honest buffalo correct answ rememb peopl somew...           hockey
1   1  ah ye way could rememb draft thought gon na gr...              nba
2   2  https youtub 6xxbbr8isz0t40m49sif didnt find a...  leagueoflegends
3   3  wouldnt bad sign wouldnt paid 18m euro right p...           soccer
4   4  easy us piss dry techn let drop let dry rins r...            funny
5   5                                  jok you seen twic            funny
6   6  rol mi3 on best vil iv seen movy genuin felt l...           movies
7   7  akag stil alph fuck sugawar suff definit two f...            anime
8   8  think oth deton wouldnt prov jok right blew bo...           movies
9   9  right disrupt tank pul dps frey pick get point...        Overwatch


We could set the train test split here, or we could do some more processing

In [5]:
# X_train, X_test, y_train, y_test = train_test_split(df.comments, df.subreddits, test_size=0.3, random_state = 12)

In [6]:
values_array = np.unique(df.subreddits.values)

Defining a function to get **a list of words in the entire corpus of comments** (that is, `tokens`), and also **a list unique words** (that is `types`).

In [7]:
# import itertools

# tokens_list = df['comments'].apply(lambda x: word_tokenize(x)).values

# tokens = np.array(list(itertools.chain.from_iterable(tokens_list)))

# types, type_counts = np.unique(tokens, return_counts=True)

# print("number of tokens",len(tokens))
# print("number of types",len(types))

number of tokens 1601210
number of types 75440


Some words are more common than others.  We might want to do something with this. But for now it's just useful to have the information.  I'll make an uncommon-word list. We can remove them just like we did the stopwords. We'll do this later, with scikitlearn's CountVectorizer.

In [8]:
common_words = types[type_counts > 20]
len(common_words)

5379

In [9]:
# def downsize_vocab(text):
#     text = ' '.join(word for word in text.split() if word in common_words) # keep only common words
#     return text
    
# df['comments'] = df['comments'].apply(downsize_vocab)

In [10]:
# tokens_list = df['comments'].apply(lambda x: word_tokenize(x)).values

# tokens = np.array(list(itertools.chain.from_iterable(tokens_list)))

# types, type_counts = np.unique(tokens, return_counts=True)

# print("number of tokens",len(tokens))
# print("number of types",len(types))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.comments, df.subreddits, test_size=0.3, random_state = 12)

In [12]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf = TfidfTransformer()
#X_tfidf = tfidf.fit_transform(X_tf)

In [13]:
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df.subreddits, test_size=0.3, random_state = 12)

## Running sklearn classifier model(s)

In [182]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

from sklearn.naive_bayes import BernoulliNB
bernoulli_nb = Pipeline([
                         ('ct_vect', CountVectorizer()),
#                          ('tfidf', TfidfTransformer()),
                         ('clf', BernoulliNB()),
                        ])

from sklearn.linear_model import SGDClassifier
sgd = Pipeline([
                 ('ct_vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-3, random_state=27,
                                       max_iter=5, tol=None)),
                ])


def paramsearch(modelpipeline):
    '''
    modelpipeline: sklearn.pipeline.Pipeline object 
    does gridsearch on the given pipeline on test split and prints classification report
    '''
    from sklearn.model_selection import GridSearchCV
    parameters = {
        'ct_vect__ngram_range': [(1,1)],
        'ct_vect__max_features': [30000],
#         'tfidf__use_idf': (False),
#         'clf__alpha': (1e-4, 1e-5),
    }

    gridsearch = GridSearchCV(modelpipeline, parameters, cv=5, iid=False, n_jobs=-1)
    gridsearch = gridsearch.fit(X_train, y_train)
    y_pred = gridsearch.predict(X_test)
#     print(gridsearch)
    print(gridsearch.best_params_)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=values_array))

def runmodel(modelpipeline):
    ''' 
    modelpipeline: sklearn.pipeline.Pipeline object 
    runs the given pipeline on test split and prints classification report
    '''
    modelpipeline.fit(X_train, y_train)
    y_pred = modelpipeline.predict(X_test)
    print(modelpipeline)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=values_array))

paramsearch(bernoulli_nb)

#runmodel(sgd)

{'ct_vect__max_features': 30000, 'ct_vect__ngram_range': (1, 1)}
accuracy 0.47828571428571426
                 precision    recall  f1-score   support

      AskReddit       0.29      0.19      0.23      1039
GlobalOffensive       0.44      0.64      0.52      1028
          Music       0.71      0.30      0.42      1011
      Overwatch       0.81      0.54      0.64      1085
          anime       0.69      0.44      0.54      1067
       baseball       0.60      0.56      0.58      1091
         canada       0.46      0.37      0.41      1012
     conspiracy       0.48      0.32      0.39      1019
         europe       0.59      0.38      0.46      1084
          funny       0.13      0.57      0.21      1053
  gameofthrones       0.89      0.56      0.69      1050
         hockey       0.55      0.59      0.57      1013
leagueoflegends       0.72      0.57      0.63      1013
         movies       0.60      0.51      0.55      1054
            nba       0.58      0.63      0.60    

## Our own NB

In [101]:
def fit_naive_bayes(observations, y, num_features):

    #Initialize marginal probability for each class
    count_class = np.array(20*[[0]])
    marg_prob = np.array(20*[[1]]) #Laplace smoothing, starting counts with 1

    #Initialize matrix of probabilities of observed features given k
    cond_prob_matrix = np.empty((20,num_features))

    
    #compute marginal probability of each class
    total_comments = len(y)
    for i in range(20):
        for j in range(total_comments):
            if y[j] == i:
                count_class[i] += 1
    
    #Marginal probability for each class
    marg_prob = np.true_divide(count_class, total_comments)

    for i in range(observations.shape[0]):
        feature_no = observations.nonzero()[1][i]
        comment_no = observations.nonzero()[0][i]
       
        comment_class = y[comment_no]
        cond_prob_matrix[comment_class][feature_no] += 1

    print("First step youhou")

    #divide each row of cond_prob_matrix by the count of comments per class
    for i in range(20):
        cond_prob_matrix[i] = np.true_divide(cond_prob_matrix[i], count_class[i])

    print("Second step youhou")


    cond_prob_matrix = cond_prob_matrix.transpose()
    marg_prob = np.log(marg_prob)

    return marg_prob, cond_prob_matrix

In [102]:
classes = {
        "anime": 1,
        "AskReddit": 2,
        "baseball": 3,
        "canada": 4, 
        "conspiracy": 5, 
        "europe": 6, 
        "funny": 7, 
        "gameofthrones": 8, 
        "GlobalOffensive": 9,
        "hockey" :10, 
        "leagueoflegends": 11, 
        "movies": 12, 
        "Music": 13, 
        "nba":14, 
        "nfl":15, 
        "Overwatch":16, 
        "soccer":17, 
        "trees":18, 
        "worldnews":19, 
        "wow":0
    }

y_traindf = pd.DataFrame(y_train)
y_traindf['subreddits']= y_traindf['subreddits'].map(classes)
y_train_array = np.array(y_traindf['subreddits'])

In [109]:
cv = CountVectorizer(max_features=500)
X_train_tf = cv.fit_transform(X_train)
X_test_tf = cv.transform(X_test)

In [110]:
# from naive_bayes import fit_naive_bayes
prior, conditional = fit_naive_bayes(X_train_tf, y_train_array, X_train_tf.shape[1])

First step youhou
Second step youhou


In [113]:
ID_list = X_test.index.tolist()

In [141]:
from scipy import sparse

def predict_naive_bayes(id_list, observations, marg_prob, cond_prob_matrix):

    #log of inverse conditional probability matrix
    inv_cond_prob_matrix = np.ones((cond_prob_matrix.shape[0], cond_prob_matrix.shape[1]))
    inv_cond_prob_matrix = inv_cond_prob_matrix - cond_prob_matrix
    inv_cond_prob_matrix = sparse.csr_matrix(np.log(inv_cond_prob_matrix))

    #log of conditional probability matrix
    cond_prob_matrix = sparse.csr_matrix(np.log(cond_prob_matrix))
    
    # 0s become 1s, 1s become 0s
    sparse_ones = sparse.csr_matrix(np.ones((observations.shape[0], observations.shape[1])))
    complement_obs = sparse_ones - observations

    prob_per_class = np.dot(observations,cond_prob_matrix) + np.dot(complement_obs,inv_cond_prob_matrix)

    y = []
    for i in range(observations.shape[0]):
        prob_per_class[i] += marg_prob.transpose()
        y.append(np.argmax(prob_per_class[i]))

    id_list = np.array(id_list).transpose()

    matrix = np.stack((id_list, y)).transpose()
    df_pred = pd.DataFrame(matrix)

    return df_pred

In [142]:
predictions = predict_naive_bayes(ID_list, X_test_tf, prior, conditional)

  # This is added back by InteractiveShellApp.init_path()


In [143]:
predictions

Unnamed: 0,0,1
0,26505,13
1,16099,1
2,35596,19
3,62735,1
4,67323,6
...,...,...
20995,23873,16
20996,15156,13
20997,1645,1
20998,4919,11


In [146]:
y_testdf = pd.DataFrame(y_test)
y_testdf['subreddits']= y_testdf['subreddits'].map(classes)
y_test_array = np.array(y_testdf['subreddits'])

In [174]:
predictions[1]

0        13
1         1
2        19
3         1
4         6
         ..
20995    16
20996    13
20997     1
20998    11
20999    13
Name: 1, Length: 21000, dtype: int64

In [175]:
def accuracy(df_pred, df_true_y):

    pred = np.array(df_pred[1])
    true_y = np.array(df_true_y['subreddits'])

    count = 0
    total = len(true_y)
    for i in range(total):
        if pred[i] == true_y[i]:
            count +=1
            
    return float(count)/total

In [176]:
accuracy(predictions, y_testdf)

0.18342857142857144