In [50]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup


df = pd.read_csv('reddit-comment-classification-comp-551/reddit_train.csv')
df = df[pd.notnull(df['comments'])]
print(df.head(20))
print(df['comments'].apply(lambda x: len(x.split(' '))).sum())
plt.figure(figsize=(10,4))


df.subreddits.value_counts().plot(kind='bar');


    id                                           comments       subreddits
0    0  Honestly, Buffalo is the correct answer. I rem...           hockey
1    1  Ah yes way could have been :( remember when he...              nba
2    2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends
3    3  He wouldn't have been a bad signing if we woul...           soccer
4    4  Easy. You use the piss and dry technique. Let ...            funny
5    5  The joke is on YOU!\n\nI've only seen it twice...            funny
6    6  His role in MI3 is one of the best villians I'...           movies
7    7  Akagi is still Alpha as fuck and Sugawara is s...            anime
8    8  I think that they had each other's detonator. ...           movies
9    9  Right! He was a disruptor tank! Pull the dps o...        Overwatch
10  10  The flying the Eagles to Mordor thing is incre...           movies
11  11  "Oh man I can't wait to vote."\n\n*opens link*...            anime
12  12  omg i was thinkin

conspiracy         3500
worldnews          3500
wow                3500
hockey             3500
funny              3500
anime              3500
GlobalOffensive    3500
baseball           3500
nba                3500
AskReddit          3500
Music              3500
leagueoflegends    3500
europe             3500
canada             3500
gameofthrones      3500
Overwatch          3500
movies             3500
soccer             3500
nfl                3500
trees              3500
Name: subreddits, dtype: int64

<Figure size 720x288 with 0 Axes>

In [37]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def print_plot(index):
    example = df[df.index == index][['comments', 'subreddits']].values[0]
    if len(example) > 0:
        print(example[0])
        print('subreddit:', example[1])


def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df['comments'] = df['comments'].apply(clean_text)
df['subreddits'].apply(lambda x: len(x.split(' '))).sum()


[nltk_data] Downloading package stopwords to /Users/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


70000

In [53]:
print_plot(1234)

I guess all those finals and top 4 dont count anymore because of one bad tournament. Sick logic you got there
subreddit: GlobalOffensive


In [31]:
X = df.comments
y = df.subreddits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [43]:
values_array = np.unique(df.subreddits.values)

In [51]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

bernoulli_nb = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', BernoulliNB()),
                        ])
bernoulli_nb.fit(X_train, y_train)


multinomial_nb = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', BernoulliNB()),
                        ])
multinomial_nb.fit(X_train, y_train)

from sklearn.metrics import classification_report
y_pred1 = bernoulli_nb.predict(X_test)
y_pred2 = multinomial_nb.predict(X_test)

print('bernoulli accuracy %s' % accuracy_score(y_pred1, y_test))
print(classification_report(y_test, y_pred1,target_names=values_array))

print('multinomial accuracy %s' % accuracy_score(y_pred2, y_test))
print(classification_report(y_test, y_pred2,target_names=values_array))

bernoulli accuracy 0.47828571428571426
                 precision    recall  f1-score   support

      AskReddit       0.27      0.20      0.23      1023
GlobalOffensive       0.40      0.67      0.50      1051
          Music       0.82      0.29      0.43      1055
      Overwatch       0.72      0.60      0.65      1002
          anime       0.74      0.42      0.54      1075
       baseball       0.52      0.59      0.55      1067
         canada       0.48      0.38      0.42       995
     conspiracy       0.50      0.29      0.37      1036
         europe       0.57      0.35      0.44      1018
          funny       0.13      0.48      0.21      1087
  gameofthrones       0.84      0.61      0.71      1024
         hockey       0.58      0.60      0.59      1043
leagueoflegends       0.86      0.51      0.64      1084
         movies       0.70      0.46      0.56      1095
            nba       0.50      0.65      0.56      1055
            nfl       0.71      0.55      0.62  