In [293]:
from dotenv import load_dotenv
import os
import praw
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load every timestamped CSV snapshot. The same ten variable names are bound
# as before (news1..news5, onion1..onion5), in the order the files are listed.
news_paths = [
    'data/2024-06-27 15:03:20.894281_news.csv',
    'data/2024-06-28 15:21:52.863539_news.csv',
    'data/2024-07-02 09:23:01.740295_news.csv',
    'data/2024-07-06 10:57:06.699284_news.csv',
    'data/2024-07-08 17:02:59.738032_news.csv',
]
onion_paths = [
    'data/2024-06-26 16:40:52.902750_onion.csv',
    'data/2024-06-28 15:21:52.863539_onion.csv',
    'data/2024-07-02 09:23:01.740295_onion.csv',
    'data/2024-07-06 10:57:06.699284_onion.csv',
    'data/2024-07-08 17:02:59.738032_onion.csv',
]

news1, news2, news3, news4, news5 = map(pd.read_csv, news_paths)
onion1, onion2, onion3, onion4, onion5 = map(pd.read_csv, onion_paths)

In [11]:
# Combine all onion snapshots into a single frame.
#
# Only the first snapshot has the stray 'Unnamed: 0' column removed
# (presumably a saved index column - verify against the scraper).
# errors='ignore' plus reassignment instead of inplace=True keeps this cell
# idempotent: running it a second time no longer raises KeyError.
onion1 = onion1.drop(columns=['Unnamed: 0'], errors='ignore')

onion = pd.concat([onion1, onion2, onion3, onion4, onion5])

# Keep the first row for each created_utc value and drop the rest.
# NOTE(review): this assumes created_utc identifies a post uniquely; two
# different posts created in the same second would be collapsed - confirm.
onion = onion.drop_duplicates('created_utc')

In [21]:
# Combine all news snapshots into a single frame.
#
# Only the first snapshot has the stray 'Unnamed: 0' column removed
# (presumably a saved index column - verify against the scraper).
# errors='ignore' plus reassignment instead of inplace=True keeps this cell
# idempotent: running it a second time no longer raises KeyError.
news1 = news1.drop(columns=['Unnamed: 0'], errors='ignore')

news = pd.concat([news1, news2, news3, news4, news5])

# Keep the first row for each created_utc value and drop the rest.
# NOTE(review): this assumes created_utc identifies a post uniquely; two
# different posts created in the same second would be collapsed - confirm.
news = news.drop_duplicates('created_utc')

In [25]:
# Stack both subreddits and derive the binary target from the subreddit name:
# 1 = TheOnion, 0 = worldnews. Any other subreddit value maps to NaN.
subreddit_to_label = {'worldnews': 0, 'TheOnion': 1}
posts = pd.concat([onion, news]).assign(
    is_onion=lambda frame: frame['subreddit'].map(subreddit_to_label)
)

In [168]:
# Persist the combined, labelled posts; the index is not written to the file.
posts.to_csv(path_or_buf='data/posts.csv', index=False)

In [31]:
# Model input is the raw post title; the target is the is_onion label.
X, y = posts['title'], posts['is_onion']

In [33]:
# Checking class imbalance: y is the 0/1 is_onion label, so 1 - mean(y) is the
# share of rows labelled 0 (worldnews), i.e. the accuracy of always predicting 0.
1-y.mean()

0.591283863368669

In [35]:
# Split titles and labels into train and test parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [184]:
# Stop-word list = NLTK's English list plus a few extra tokens chosen for
# these two subreddits.
sw_nltk = stopwords.words('english')
subreddit_stop_words = ['thread', 'worldnews', 'live', 'onion']
sw = [*sw_nltk, *subreddit_stop_words]

In [297]:
# Baseline text pipeline: TF-IDF features from the titles feeding a random
# forest, fit on the training split.
#
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap and feature sampling are random; without a seed every
# re-run of the notebook produces a slightly different model and score.
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=sw)),
    ('rt', RandomForestClassifier(n_estimators=300, oob_score=True, random_state=42)),
])

pipe_tvec.fit(X_train, y_train)

In [301]:
# Grid searching over various parameters to find the best model for the training data
%%time
pgrid = {
    'tvec__ngram_range': [(1, 1), (2, 2)],
    'tvec__stop_words': [None, 'english', sw],
    'rt__max_features': np.arange(1, 21),
    'rt__max_depth': [None, 1, 2, 3, 4]
}

gs2 = GridSearchCV(pipe_tvec, param_grid=pgrid, cv=5, n_jobs=4)
gs2.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


CPU times: user 4min 39s, sys: 2min 27s, total: 7min 6s
Wall time: 23min 47s


In [304]:
# Score the grid-searched TF-IDF + random-forest pipeline on the held-out test split
gs2.score(X_test, y_test)

0.9340866290018832

In [341]:
# Confusion matrix for the tuned random forest, to see which kinds of errors
# occurred (rows = actual class, columns = predicted class)
y_gs2_pred = gs2.predict(X_test)
cm3 = metrics.confusion_matrix(y_test, y_gs2_pred)
cm3_labelled = pd.DataFrame(
    cm3,
    index=['Actual World News', 'Actual Onion'],
    columns=['Pred World News', 'Pred Onion'],
)
cm3_labelled

Unnamed: 0,Pred World News,Pred Onion
Actual World News,569,33
Actual Onion,37,423


In [335]:
# Per-class precision / recall / F1 for the random-forest predictions
rt_report = metrics.classification_report(y_test, y_gs2_pred)
print(rt_report)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       602
           1       0.93      0.92      0.92       460

    accuracy                           0.93      1062
   macro avg       0.93      0.93      0.93      1062
weighted avg       0.93      0.93      0.93      1062



In [337]:
# Put each test title next to its predicted and true label, then pull out the
# two kinds of mistakes the random forest made.
post_rt = pd.DataFrame(X_test.copy()).assign(predict=y_gs2_pred, actual=y_test)

# Predicted Onion (1) but actually world news (0)
rt_false_pos_mask = post_rt['predict'].eq(1) & post_rt['actual'].eq(0)
FalsePos_rt = post_rt.loc[rt_false_pos_mask]

# Predicted world news (0) but actually Onion (1)
rt_false_neg_mask = post_rt['predict'].eq(0) & post_rt['actual'].eq(1)
FalseNeg_rt = post_rt.loc[rt_false_neg_mask]

In [239]:
# Work on an independent copy so the sentiment column is not added to `posts`.
posts_sent = posts.copy(deep=True)

In [241]:
# Score every title with NLTK's VADER analyzer and keep the 'compound' value
# as a new numeric feature.
sa = SentimentIntensityAnalyzer()


def _compound_sentiment(title):
    """Return the 'compound' entry of the analyzer's polarity scores for one title."""
    return sa.polarity_scores(title)['compound']


posts_sent['sentiment'] = posts_sent['title'].apply(_compound_sentiment)

In [243]:
# Rebuild X and y so the features now hold the title plus its sentiment score.
# NOTE: this rebinds the X / y names used for the first model; the earlier
# X_train / X_test / y_train / y_test splits are not affected by it.
feature_columns = ['title', 'sentiment']
X = posts_sent[feature_columns]
y = posts_sent['is_onion']

In [245]:
# Split the title + sentiment features, using the same seed as the first split.
X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(
    X,
    y,
    random_state=42,
)

In [281]:
# Column-wise preprocessing: the 'title' column is count-vectorized, and every
# other column (here the sentiment score) is passed through unchanged.
title_vectorizer = CountVectorizer(stop_words=sw)
preprocessor = ColumnTransformer(
    transformers=[('cvec', title_vectorizer, 'title')],
    remainder='passthrough',
)

In [283]:
# Chain the preprocessor with a logistic regression and fit it on the
# title + sentiment training split.
logreg_steps = [
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000)),
]
pipe = Pipeline(steps=logreg_steps)
pipe.fit(X_train_sent, y_train_sent)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [287]:
# Grid searching through various parameters to find the best model
# Split into two dictionaries because a logistic regression of None will ignore C score and cause the fit to fail
%%time
c_grid = np.logspace(-2, 1, 100)
pgrid = [
    {
        'preprocessor__cvec__stop_words':[None, 'english', sw],
        'preprocessor__cvec__ngram_range': [(1,1), (2,2)],
        'preprocessor__cvec__min_df': [1, 2, 3, 4],
        'logreg__penalty': ['l2'],
        'logreg__C': c_grid
    },
    {
        'preprocessor__cvec__stop_words':[None, 'english', sw],
        'preprocessor__cvec__ngram_range': [(1,1), (2,2)],
        'preprocessor__cvec__min_df': [1, 2, 3, 4],
        'logreg__penalty': [None]
    }
]

gs_sent = GridSearchCV(pipe, param_grid=pgrid, cv=5, n_jobs=4)

gs_sent.fit(X_train_sent, y_train_sent)

CPU times: user 39.8 s, sys: 3.2 s, total: 43 s
Wall time: 5min 17s


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [289]:
# Score the grid-searched title + sentiment logistic-regression pipeline on its held-out test split
gs_sent.score(X_test_sent, y_test_sent)

0.9416195856873822

In [291]:
# Confusion matrix for the title + sentiment model, to see which errors it
# made (rows = actual class, columns = predicted class)
y_sent_pred = gs_sent.predict(X_test_sent)
cm_sent = metrics.confusion_matrix(y_test_sent, y_sent_pred)
cm_sent_labelled = pd.DataFrame(
    cm_sent,
    index=['Actual World News', 'Actual Onion'],
    columns=['Pred World News', 'Pred Onion'],
)
cm_sent_labelled

Unnamed: 0,Pred World News,Pred Onion
Actual World News,569,33
Actual Onion,29,431


In [339]:
# More scores for analysis: per-class precision / recall / F1 of the
# title + sentiment model. The predictions come from X_test_sent, so they are
# compared with y_test_sent (the labels of that same split) rather than with
# y_test from the first model's split.
print(metrics.classification_report(y_test_sent, y_sent_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       602
           1       0.93      0.94      0.93       460

    accuracy                           0.94      1062
   macro avg       0.94      0.94      0.94      1062
weighted avg       0.94      0.94      0.94      1062



In [326]:
# Looking at the specific posts that were misclassified to better understand
# what happened.
#
# The titles and true labels are taken from the sentiment model's own test
# split (X_test_sent / y_test_sent), the split y_sent_pred was predicted on,
# instead of borrowing X_test / y_test from the first model's split.
post_sent = X_test_sent[['title']].copy()
post_sent['predict'] = y_sent_pred
post_sent['actual'] = y_test_sent

# Predicted Onion (1) but actually world news (0)
FalsePos_sent = post_sent.loc[(post_sent['predict'] == 1) & (post_sent['actual'] == 0)]

# Predicted world news (0) but actually Onion (1)
FalseNeg_sent = post_sent.loc[(post_sent['predict'] == 0) & (post_sent['actual'] == 1)]