In [1]:
# ALL The Imports:

import pandas as pd
import numpy as np
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SMOTENC
import time
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import json

# SKlearn imports:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, balanced_accuracy_score, RocCurveDisplay, recall_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, roc_auc_score, precision_score, confusion_matrix
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier
from sklearn import set_config
from sklearn.naive_bayes import GaussianNB, MultinomialNB
set_config(display = 'diagram')
from nltk.stem import PorterStemmer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, BaggingRegressor,  RandomForestRegressor

In [2]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [3]:
X = df['title']
y = df['subreddit']

# Lemmatize words Logistic Regression

In [4]:
def lem_sentence(sentence):
    lemmatizer = WordNetLemmatizer()

    split_sentence = sentence.split(' ')

    return [lemmatizer.lemmatize(word) for word in split_sentence]

In [5]:
X_lem = X.map(lem_sentence)
X_lem = [' '.join(map(str, l)) for l in X_lem]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_lem,y, random_state=123, train_size = 0.75)

In [7]:
cvect_pipe = make_pipeline(
    CountVectorizer(stop_words = 'english', max_features = 1000),
                           LogisticRegression(max_iter=10_000, penalty="none"))

In [8]:
cvect_pipe.fit(X_train, y_train)

In [9]:
cvect_pipe.score(X_train, y_train)

0.9866666666666667

In [10]:
cvect_pipe.score(X_test, y_test)

0.632

# Lemmatized Random Forest

In [11]:
forest_pipe = make_pipeline(
    CountVectorizer(stop_words = 'english', max_features = 20_000),
                           RandomForestClassifier(n_estimators = 1000))

In [12]:
# forest_pipe.score(X_test, y_test)

In [13]:
# forest_pipe.params_

In [14]:
params = {'randomforestclassifier__n_estimators' : [100],
#           'countvectorizer__ngram_range' : [(1,1), (1,2), (1,3)]
}

In [15]:
gs = GridSearchCV(forest_pipe, params, n_jobs =-1)

In [16]:
gs.fit(X_train, y_train)

In [17]:
gs.score(X_train, y_train)

0.9946666666666667

In [18]:
gs.score(X_test, y_test)

0.652

# Stemmed Logistic Regression

In [20]:
def stem_sentence(sentence):
    p_stemmer = PorterStemmer()

    split_sentence = sentence.split(' ')

    return [p_stemmer.stem(word) for word in split_sentence]

In [21]:
X_stem = X.map(stem_sentence)
X_stem = [' '.join(map(str, l)) for l in X_stem]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_stem,y, random_state=123)

In [23]:
cvect_pipe = make_pipeline(CountVectorizer(stop_words = 'english', max_features = 50_000,  ngram_range=(1, 3)),
                           LogisticRegression(max_iter=10_000, penalty="none"))


cvect_pipe.fit(X_train, y_train)

In [24]:
cvect_pipe.score(X_train, y_train)

0.9946666666666667

In [25]:
cvect_pipe.score(X_test, y_test)

0.66

# Random Forest with subjectivity and polarity

In [26]:
df = pd.read_csv('./data/titles_sentiment.csv')

In [27]:
X = df[[
    'title', 
        'polarity', 'subjectivity']]
y = df['subreddit']

In [28]:
y.shape

(1000,)

In [29]:
vectorizer = CountVectorizer( max_features = 20_000)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123)

In [31]:
preprocess = make_column_transformer(
    (TfidfVectorizer(stop_words = 'english', max_features = 5_000), 'title'), 
                                     remainder='passthrough')

In [32]:
forest_pipe = make_pipeline(preprocess,
    RandomForestClassifier(n_estimators = 1000, random_state=123))

In [33]:
forest_pipe.fit(X_train, y_train)

In [34]:
forest_pipe.score(X_test, y_test)

0.676

# Stem with polarity and subjectivity  

In [35]:
def stem_sentence(sentence):
    p_stemmer = PorterStemmer()

    split_sentence = sentence.split(' ')

    return [p_stemmer.stem(word) for word in split_sentence]

In [36]:
df = pd.read_csv('./data/titles_len.csv')

In [37]:
X = df[
    'title'
        ]
y = df['subreddit']

In [38]:
X_lem = X.map(lem_sentence)
X_lem = [' '.join(map(str, l)) for l in X_lem]

In [39]:
X_lem_df = pd.DataFrame(X_lem)

In [40]:
X_extra = df[['polarity', 'subjectivity', 'title_len']]

In [41]:
X_new = pd.concat([X_extra, X_lem_df], axis = 1)

In [42]:
X_new.rename(columns={0:'title'}, inplace = True)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_new,y, random_state=123)

In [44]:
preprocess = make_column_transformer(
    (TfidfVectorizer(max_features = 5_000, ngram_range=(1, 1)), 'title'),
                                     remainder='passthrough')

In [45]:
forest_pipe = make_pipeline(preprocess,
    LogisticRegression())

In [46]:
forest_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [47]:
forest_pipe.score(X_train, y_train)

0.864

In [48]:
forest_pipe.score(X_test, y_test)

0.62