In [9]:
import numpy as np
from imblearn.pipeline import make_pipeline
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import json

# SKlearn imports:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.stem import PorterStemmer
from collections import Counter
from itertools import chain
import pandas as pd
from nltk import word_tokenize, pos_tag
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from nltk.tokenize import word_tokenize 

In [10]:
# nltk.download('averaged_perceptron_tagger')

In [11]:
df = pd.read_csv('./data/titles_sentiment.csv')

# POS Tagging

In [43]:
# https://stackoverflow.com/questions/44088038/create-sentence-row-to-pos-tags-counts-column-matrix-from-a-dataframe

In [13]:
tok_and_tag = lambda x: pos_tag(word_tokenize(x))

In [14]:
df['lower_sent'] = df['title'].apply(str.lower)
df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag)

In [15]:
possible_tags = sorted(set(list(zip(*chain(*df['tagged_sent'])))[1]))

def add_pos_with_zero_counts(counter, keys_to_add):
    for k in keys_to_add:
        counter[k] = counter.get(k, 0)
    return counter


# Detailed steps.
df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1]))
df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])

# All in one.
df['sent_vector'] = df['tagged_sent'].apply(lambda x:
    [count for tag, count in sorted(
        add_pos_with_zero_counts(
            Counter(list(zip(*x))[1]), 
                    possible_tags).most_common()
         )
    ]
)

df2 = pd.DataFrame(df['sent_vector'].tolist())
df2.columns = possible_tags

In [16]:
final_df = pd.concat([df, df2], axis = 1)

In [18]:
pos_df = final_df.drop(final_df.columns[[0, 1, 3, 7, 8, 9, 10]], axis=1)

In [21]:
pos_df[pos_df['subreddit'] == 0].describe()

Unnamed: 0,subreddit,polarity,subjectivity,#,$,'',(,),",",.,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.0,0.083961,0.301683,0.004,0.0,0.028,0.082,0.086,0.216,0.784,...,0.736,0.222,0.28,0.162,0.294,0.314,0.04,0.102,0.174,0.022
std,0.0,0.283099,0.295944,0.063182,0.0,0.176857,0.281842,0.287697,0.503837,0.806626,...,1.012103,0.522745,0.546074,0.41485,0.613447,0.569256,0.196155,0.315904,0.405033,0.159897
min,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.2,0.489236,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,0.0,1.0,1.0,1.0,0.0,2.0,2.0,2.0,3.0,4.0,...,8.0,4.0,3.0,3.0,4.0,3.0,1.0,2.0,2.0,2.0


In [22]:
pos_df[pos_df['subreddit'] == 1].describe()

Unnamed: 0,subreddit,polarity,subjectivity,#,$,'',(,),",",.,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,``
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.0,0.05951,0.274991,0.004,0.006,0.026,0.056,0.064,0.178,0.562,...,0.426,0.22,0.218,0.156,0.278,0.284,0.022,0.052,0.15,0.02
std,0.0,0.28786,0.325447,0.063182,0.077304,0.159295,0.311542,0.322664,0.557618,0.892047,...,0.738658,0.565898,0.51673,0.39998,0.570409,0.509768,0.14683,0.23109,0.373871,0.14014
min,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.514236,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0,7.0,6.0,...,4.0,5.0,4.0,2.0,4.0,2.0,1.0,2.0,2.0,1.0


## Bagging Classifier

In [23]:
X = pos_df.drop(columns = ['subreddit','polarity', 'subjectivity'])
y = pos_df['subreddit']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, random_state=123)

In [25]:
preprocess = make_column_transformer(
    (CountVectorizer(stop_words = 'english', max_features = 10_000, ngram_range=(1, 3)), 
                             'lower_sent'), 
                                     remainder='passthrough')

In [26]:
forest_pipe = make_pipeline(
    preprocess,
   BaggingClassifier())

In [27]:
forest_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(max_features=10000,
                                                                  ngram_range=(1,
                                                                               3),
                                                                  stop_words='english'),
                                                  'lower_sent')])),
                ('baggingclassifier', BaggingClassifier())])

In [28]:
forest_pipe.score(X_train, y_train)

0.976

In [29]:
forest_pipe.score(X_test, y_test)

0.616

# Naive Bayes

In [30]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [35]:
X = pos_df.drop(columns = ['subreddit','polarity', 'subjectivity'])
y = pos_df['subreddit']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123)

In [40]:
preprocess = make_column_transformer(
    (CountVectorizer(stop_words = 'english', max_features = 1000), 'lower_sent'), 
                                     remainder='passthrough')

In [41]:
cvect_pipe2 = make_pipeline(preprocess,
                           MultinomialNB())

cvect_pipe2.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(max_features=1000,
                                                                  stop_words='english'),
                                                  'lower_sent')])),
                ('multinomialnb', MultinomialNB())])

In [42]:
cvect_pipe2.score(X_test, y_test)

0.648