In [1]:
import numpy as np
from imblearn.pipeline import make_pipeline
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import json

# SKlearn imports:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.stem import PorterStemmer
from collections import Counter
import pandas as pd
from nltk import word_tokenize, pos_tag
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from nltk.tokenize import word_tokenize 

In [2]:
# nltk.download('averaged_perceptron_tagger')

In [3]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [4]:
# https://stackoverflow.com/questions/44088038/create-sentence-row-to-pos-tags-counts-column-matrix-from-a-dataframe

In [5]:
tok_and_tag = lambda x: pos_tag(word_tokenize(x))

In [6]:
df['lower_sent'] = df['title'].apply(str.lower)
df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag)

In [7]:
possible_tags = sorted(set(list(zip(*chain(*df['tagged_sent'])))[1]))

def add_pos_with_zero_counts(counter, keys_to_add):
    for k in keys_to_add:
        counter[k] = counter.get(k, 0)
    return counter


# Detailed steps.
df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1]))
df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])

# All in one.
df['sent_vector'] = df['tagged_sent'].apply(lambda x:
    [count for tag, count in sorted(
        add_pos_with_zero_counts(
            Counter(list(zip(*x))[1]), 
                    possible_tags).most_common()
         )
    ]
)

df2 = pd.DataFrame(df['sent_vector'].tolist())
df2.columns = possible_tags

In [8]:
final_df = pd.concat([df, df2], axis = 1)

In [9]:
final_df

Unnamed: 0.1,Unnamed: 0,selftext,subreddit,title,polarity,subjectivity,lower_sent,tagged_sent,pos_counts,pos_counts_with_zero,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
0,0,"I found that asking myself the questions, ""whi...",0,"Higher Rated Players, as you go through the ga...",-0.075000,0.450000,"higher rated players, as you go through the ga...","[(higher, RB), (rated, VBN), (players, NNS), (...","{'RB': 1, 'VBN': 1, 'NNS': 2, ',': 1, 'IN': 3,...","{'RB': 1, 'VBN': 1, 'NNS': 2, ',': 1, 'IN': 3,...",...,1,0,1,1,2,0,0,1,0,0
1,1,Found that I often have to recalculate a line ...,0,How to develop calculations/memory? Do higher ...,0.125000,0.312500,how to develop calculations/memory? do higher ...,"[(how, WRB), (to, TO), (develop, VB), (calcula...","{'WRB': 1, 'TO': 1, 'VB': 2, 'NN': 2, '.': 2, ...","{'WRB': 1, 'TO': 1, 'VB': 2, 'NN': 2, '.': 2, ...",...,2,0,0,0,1,0,0,0,1,0
2,2,,0,"When doing tactics, should I take time to coun...",0.261905,0.378571,"when doing tactics, should i take time to coun...","[(when, WRB), (doing, VBG), (tactics, NNS), (,...","{'WRB': 1, 'VBG': 1, 'NNS': 1, ',': 3, 'MD': 1...","{'WRB': 1, 'VBG': 1, 'NNS': 1, ',': 3, 'MD': 1...",...,5,0,1,0,1,1,1,0,1,0
3,3,Hi guys! Long time member of this sub. I've be...,0,"/r/chess -- who should I be rooting for, and w...",0.000000,0.000000,"/r/chess -- who should i be rooting for, and w...","[(/r/chess, NN), (--, :), (who, WP), (should, ...","{'NN': 2, ':': 1, 'WP': 1, 'MD': 1, 'VB': 2, '...","{'NN': 2, ':': 1, 'WP': 1, 'MD': 1, 'VB': 2, '...",...,2,0,1,0,1,0,0,1,1,0
4,4,,0,"After winning the Rapid WCC, Ivanchuk is looki...",0.250000,0.375000,"after winning the rapid wcc, ivanchuk is looki...","[(after, IN), (winning, VBG), (the, DT), (rapi...","{'IN': 1, 'VBG': 3, 'DT': 2, 'JJ': 2, 'NN': 2,...","{'IN': 1, 'VBG': 3, 'DT': 2, 'JJ': 2, 'NN': 2,...",...,0,0,3,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,,1,"Good luck in the final, Teimour Radjabov!",0.350000,0.800000,"good luck in the final, teimour radjabov!","[(good, JJ), (luck, NN), (in, IN), (the, DT), ...","{'JJ': 3, 'NN': 2, 'IN': 1, 'DT': 1, ',': 1, '...","{'JJ': 3, 'NN': 2, 'IN': 1, 'DT': 1, ',': 1, '...",...,0,0,0,0,0,0,0,0,0,0
996,996,,1,Chess themed cracker joke,0.000000,0.000000,chess themed cracker joke,"[(chess, NN), (themed, VBD), (cracker, NN), (j...","{'NN': 3, 'VBD': 1, '#': 0, '$': 0, '''': 0, '...","{'NN': 3, 'VBD': 1, '#': 0, '$': 0, '''': 0, '...",...,0,1,0,0,0,0,0,0,0,0
997,997,,1,Checkers sucks.,-0.300000,0.300000,checkers sucks.,"[(checkers, NNS), (sucks, NNS), (., .)]","{'NNS': 2, '.': 1, '#': 0, '$': 0, '''': 0, '(...","{'NNS': 2, '.': 1, '#': 0, '$': 0, '''': 0, '(...",...,0,0,0,0,0,0,0,0,0,0
998,998,,1,Met Magnus today! 😄,0.000000,0.000000,met magnus today! 😄,"[(met, VBN), (magnus, NNS), (today, NN), (!, ....","{'VBN': 1, 'NNS': 1, 'NN': 2, '.': 1, '#': 0, ...","{'VBN': 1, 'NNS': 1, 'NN': 2, '.': 1, '#': 0, ...",...,0,0,0,1,0,0,0,0,0,0


In [10]:
pos_df = final_df.drop(final_df.columns[[0, 1, 3, 7, 8, 9, 10]], axis=1)

In [11]:
pos_df[pos_df['subreddit'] == 0].describe()

Unnamed: 0,subreddit,polarity,subjectivity,#,$,'',(,),",",.,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.0,0.083961,0.301683,0.004,0.0,0.028,0.082,0.086,0.216,0.784,...,0.736,0.222,0.28,0.162,0.294,0.314,0.04,0.102,0.174,0.022
std,0.0,0.283099,0.295944,0.063182,0.0,0.176857,0.281842,0.287697,0.503837,0.806626,...,1.012103,0.522745,0.546074,0.41485,0.613447,0.569256,0.196155,0.315904,0.405033,0.159897
min,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.2,0.489236,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,0.0,1.0,1.0,1.0,0.0,2.0,2.0,2.0,3.0,4.0,...,8.0,4.0,3.0,3.0,4.0,3.0,1.0,2.0,2.0,2.0


In [12]:
pos_df[pos_df['subreddit'] == 1].describe()

Unnamed: 0,subreddit,polarity,subjectivity,#,$,'',(,),",",.,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.0,0.05951,0.274991,0.004,0.006,0.026,0.056,0.064,0.178,0.562,...,0.426,0.22,0.218,0.156,0.278,0.284,0.022,0.052,0.15,0.02
std,0.0,0.28786,0.325447,0.063182,0.077304,0.159295,0.311542,0.322664,0.557618,0.892047,...,0.738658,0.565898,0.51673,0.39998,0.570409,0.509768,0.14683,0.23109,0.373871,0.14014
min,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.514236,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0,7.0,6.0,...,4.0,5.0,4.0,2.0,4.0,2.0,1.0,2.0,2.0,1.0


In [13]:
X = pos_df.drop(columns = ['subreddit','polarity', 'subjectivity'])
y = pos_df['subreddit']

In [14]:
X

Unnamed: 0,lower_sent,#,$,'',(,),",",.,:,CC,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
0,"higher rated players, as you go through the ga...",0,0,0,1,1,1,2,0,0,...,1,0,1,1,2,0,0,1,0,0
1,how to develop calculations/memory? do higher ...,0,0,0,0,0,0,2,0,0,...,2,0,0,0,1,0,0,0,1,0
2,"when doing tactics, should i take time to coun...",0,0,0,0,0,3,1,0,2,...,5,0,1,0,1,1,1,0,1,0
3,"/r/chess -- who should i be rooting for, and w...",0,0,0,0,0,1,2,1,1,...,2,0,1,0,1,0,0,1,1,0
4,"after winning the rapid wcc, ivanchuk is looki...",0,0,0,0,0,1,0,0,0,...,0,0,3,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"good luck in the final, teimour radjabov!",0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
996,chess themed cracker joke,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
997,checkers sucks.,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
998,met magnus today! 😄,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, random_state=123)

In [16]:
preprocess = make_column_transformer(
    (CountVectorizer(stop_words = 'english', max_features = 10_000, ngram_range=(1, 3)), 
                             'lower_sent'), 
                                     remainder='passthrough')

In [17]:
forest_pipe = make_pipeline(
    preprocess,
   BaggingClassifier())

In [18]:
forest_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(max_features=10000,
                                                                  ngram_range=(1,
                                                                               3),
                                                                  stop_words='english'),
                                                  'lower_sent')])),
                ('baggingclassifier', BaggingClassifier())])

In [19]:
forest_pipe.score(X_train, y_train)

0.964

In [20]:
forest_pipe.score(X_test, y_test)

0.608

# Naive Bayes

In [21]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [23]:
X = df[[
    'title', 
        'polarity', 'subjectivity']]
y = df['subreddit']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123)

In [25]:
preprocess = make_column_transformer(
    (CountVectorizer(stop_words = 'english', max_features = 1000), 'title'), 
                                     remainder='passthrough')

In [27]:
cvect_pipe2 = make_pipeline(preprocess,
                           MultinomialNB())

cvect_pipe2.fit(X_train,y_train)

ValueError: Negative values in data passed to MultinomialNB (input X)