In [50]:
# ALL The Imports:

import pandas as pd
import numpy as np
from imblearn.pipeline import make_pipeline
import time
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import json

# SKlearn imports:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, balanced_accuracy_score, RocCurveDisplay, recall_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, roc_auc_score, precision_score, confusion_matrix
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [51]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [52]:
# get title as only feature and subreddit as target (Chess or AnarchyChess)
X = df['title']
y = df['subreddit']

# Lemmatize words Logistic Regression

In [53]:
# function to lemmatize the titles
def lem_sentence(sentence):
    lemmatizer = WordNetLemmatizer()

    split_sentence = sentence.split(' ')

    return [lemmatizer.lemmatize(word) for word in split_sentence]

In [54]:
X_lem = X.map(lem_sentence)
X_lem = [' '.join(map(str, l)) for l in X_lem]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_lem,y, random_state=123, train_size = 0.75)

In [56]:
cvect_pipe = make_pipeline(
    CountVectorizer(stop_words = 'english', max_features = 1000),
                           LogisticRegression(max_iter=10_000, penalty="none"))

In [57]:
cvect_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, stop_words='english')),
                ('logisticregression',
                 LogisticRegression(max_iter=10000, penalty='none'))])

In [58]:
cvect_pipe.score(X_train, y_train)

0.9866666666666667

In [59]:
cvect_pipe.score(X_test, y_test)

0.632

# Lemmatized Random Forest

In [60]:
forest_pipe = make_pipeline(
    CountVectorizer(stop_words = 'english', max_features = 20_000),
                           RandomForestClassifier(n_estimators = 1000))

In [61]:
# forest_pipe.score(X_test, y_test)

In [62]:
# forest_pipe.params_

In [63]:
params = {'randomforestclassifier__n_estimators' : [100],
#           'countvectorizer__ngram_range' : [(1,1), (1,2), (1,3)]
}

In [64]:
gs = GridSearchCV(forest_pipe, params, n_jobs =-1)

In [65]:
gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(max_features=20000,
                                                        stop_words='english')),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_estimators=1000))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__n_estimators': [100]})

In [66]:
gs.score(X_train, y_train)

0.9946666666666667

In [67]:
gs.score(X_test, y_test)

0.656

# Stemmed Logistic Regression

In [97]:
# function to stem the titles
def stem_sentence(sentence):
    p_stemmer = PorterStemmer()

    split_sentence = sentence.split(' ')

    return [p_stemmer.stem(word) for word in split_sentence]

In [69]:
X_stem = X.map(stem_sentence)
X_stem = [' '.join(map(str, l)) for l in X_stem]

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_stem,y, random_state=123)

In [71]:
cvect_pipe = make_pipeline(CountVectorizer(stop_words = 'english', max_features = 50_000,  ngram_range=(1, 3)),
                           LogisticRegression(max_iter=10_000, penalty="none"))


cvect_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50000, ngram_range=(1, 3),
                                 stop_words='english')),
                ('logisticregression',
                 LogisticRegression(max_iter=10000, penalty='none'))])

In [72]:
cvect_pipe.score(X_train, y_train)

0.9946666666666667

In [73]:
cvect_pipe.score(X_test, y_test)

0.66

# Random Forest with subjectivity, polarity and title

In [74]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [75]:
X = df[[
    'title', 
        'polarity', 'subjectivity']]
y = df['subreddit']

In [76]:
y.shape

(1000,)

In [77]:
vectorizer = CountVectorizer( max_features = 20_000)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123)

In [79]:
preprocess = make_column_transformer(
    (TfidfVectorizer(stop_words = 'english', max_features = 5_000), 'title'), 
                                     remainder='passthrough')

In [80]:
forest_pipe = make_pipeline(preprocess,
    RandomForestClassifier(n_estimators = 1000, random_state=123))

In [81]:
forest_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidfvectorizer',
                                                  TfidfVectorizer(max_features=5000,
                                                                  stop_words='english'),
                                                  'title')])),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=1000, random_state=123))])

In [82]:
forest_pipe.score(X_test, y_test)

0.676

# Stem using just title

In [83]:
def stem_sentence(sentence):
    p_stemmer = PorterStemmer()

    split_sentence = sentence.split(' ')

    return [p_stemmer.stem(word) for word in split_sentence]

In [84]:
df = pd.read_csv('./data/titles_len.csv')

In [85]:
X = df[
    'title'
        ]
y = df['subreddit']

In [86]:
X_lem = X.map(lem_sentence)
X_lem = [' '.join(map(str, l)) for l in X_lem]

In [87]:
X_lem_df = pd.DataFrame(X_lem)

In [88]:
X_extra = df[['polarity', 'subjectivity', 'title_len']]

In [89]:
X_new = pd.concat([X_extra, X_lem_df], axis = 1)

In [90]:
X_new.rename(columns={0:'title'}, inplace = True)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X_new,y, random_state=123)

In [92]:
preprocess = make_column_transformer(
    (TfidfVectorizer(max_features = 5_000, ngram_range=(1, 1)), 'title'),
                                     remainder='passthrough')

In [93]:
forest_pipe = make_pipeline(preprocess,
    LogisticRegression())

In [94]:
forest_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidfvectorizer',
                                                  TfidfVectorizer(max_features=5000),
                                                  'title')])),
                ('logisticregression', LogisticRegression())])

In [95]:
forest_pipe.score(X_train, y_train)

0.8773333333333333

In [96]:
forest_pipe.score(X_test, y_test)

0.616