In [1]:
from dotenv import load_dotenv
import os
import praw
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load the four scrape snapshots for each subreddit. Each tuple is unpacked in
# listing order, so <name>N is the N-th path in the corresponding tuple.
news_csv_paths = (
    'data/2024-06-27 15:03:20.894281_news.csv',
    'data/2024-06-28 15:21:52.863539_news.csv',
    'data/2024-07-02 09:23:01.740295_news.csv',
    'data/2024-07-06 10:57:06.699284_news.csv',
)
onion_csv_paths = (
    'data/2024-06-26 16:40:52.902750_onion.csv',
    'data/2024-06-28 15:21:52.863539_onion.csv',
    'data/2024-07-02 09:23:01.740295_onion.csv',
    'data/2024-07-06 10:57:06.699284_onion.csv',
)
news1, news2, news3, news4 = (pd.read_csv(path) for path in news_csv_paths)
onion1, onion2, onion3, onion4 = (pd.read_csv(path) for path in onion_csv_paths)

In [5]:
# Remove the 'Unnamed: 0' column from the first onion scrape.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is already gone no longer raises
# a KeyError.
onion1 = onion1.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Show (rows, columns) of the fourth onion scrape.
onion4.shape

(999, 4)

In [9]:
# Stack every onion snapshot into a single frame (row labels are kept as-is).
onion_snapshots = [onion1, onion2, onion3, onion4]
onion = pd.concat(onion_snapshots)

In [11]:
# Number of distinct creation timestamps across the stacked onion snapshots.
onion['created_utc'].unique().size

999

In [13]:
# Keep only the first row seen for each creation timestamp.
# NOTE(review): this treats 'created_utc' as a post identifier — confirm that
# two different posts can never share a timestamp.
onion = onion.drop_duplicates(subset='created_utc')

In [15]:
# Distinct creation timestamps remaining after de-duplication.
onion['created_utc'].unique().size

999

In [17]:
# Remove the 'Unnamed: 0' column from the first news scrape.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is already gone no longer raises
# a KeyError.
news1 = news1.drop(columns=['Unnamed: 0'], errors='ignore')

In [19]:
# Show (rows, columns) of the fourth news scrape.
news4.shape

(866, 4)

In [21]:
# Stack every news snapshot into a single frame (row labels are kept as-is).
news_snapshots = [news1, news2, news3, news4]
news = pd.concat(news_snapshots)

In [23]:
# Number of distinct creation timestamps across the stacked news snapshots.
news['created_utc'].unique().size

1725

In [25]:
# Keep only the first row seen for each creation timestamp.
# NOTE(review): this treats 'created_utc' as a post identifier — confirm that
# two different posts can never share a timestamp.
news = news.drop_duplicates(subset='created_utc')

In [27]:
# Distinct creation timestamps remaining after de-duplication.
news['created_utc'].unique().size

1725

In [29]:
# Stack both subreddits into one frame. ignore_index=True gives every post a
# unique row label; without it the labels carried over from `onion` and `news`
# repeat, so a label-based lookup such as posts.loc[i] can match several rows.
posts = pd.concat([onion, news], ignore_index=True)

# Binary target: 1 for r/TheOnion, 0 for r/worldnews.
subreddit_to_label = {'worldnews': 0, 'TheOnion': 1}
posts['is_onion'] = posts['subreddit'].map(subreddit_to_label)

# .map() yields NaN for any subreddit value missing from the mapping. Fail
# here with a clear message instead of handing unlabelled rows to the models.
unlabelled_rows = posts['is_onion'].isna()
if unlabelled_rows.any():
    unexpected = posts.loc[unlabelled_rows, 'subreddit'].unique().tolist()
    raise ValueError(f"Unexpected subreddit values with no label: {unexpected}")

In [31]:
# Show (rows, columns) of the combined, labelled frame.
posts.shape

(2724, 5)

In [33]:
# Persist the combined, labelled posts to disk.
# NOTE(review): index=False is not passed, so the row labels are written as an
# extra unnamed leading column; readers of this file will see it as
# 'Unnamed: 0' unless they load it with index_col=0.
posts.to_csv('data/posts.csv')

In [173]:
# Features are the raw post titles; the target is the 0/1 onion flag.
X, y = posts['title'], posts['is_onion']

In [37]:
# Baseline accuracy: y is 0/1, so 1 - mean is the share of rows labelled 0
# (worldnews), i.e. the score of always predicting class 0.
1-y.mean()

0.6332599118942731

In [175]:
# Hold out a test split; random_state=42 makes the shuffle repeatable.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [177]:
# NLTK's English stop words plus a few extra tokens to exclude.
# NOTE(review): the extras look like subreddit-specific giveaways
# (e.g. 'worldnews' live threads) — confirm the intent.
extra_stop_words = ['thread', 'worldnews', 'live']
sw_nltk = stopwords.words('english')
sw = sw_nltk + extra_stop_words

In [179]:
# First model: TF-IDF features feeding a multinomial Naive Bayes classifier,
# both with default settings.
tvnb_steps = [
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
]
pipe_tvnb = Pipeline(steps=tvnb_steps)

pipe_tvnb.fit(X_train, y_train)

In [45]:
# Exhaustive grid over the vectorizer settings of the TF-IDF + NB pipeline:
# n-gram span, stop-word list, and vocabulary size cap.
params = dict(
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tvec__stop_words=[None, 'english', sw],
    tvec__max_features=[None, 2000, 5000, 10000, 20000],
)

# 5-fold cross-validation, evaluated on 4 parallel workers.
gs = GridSearchCV(estimator=pipe_tvnb, param_grid=params, cv=5, n_jobs=4)

In [47]:
%%time
# Run the grid search on the training split only.
gs.fit(X_train, y_train)

CPU times: user 488 ms, sys: 128 ms, total: 616 ms
Wall time: 6.55 s


In [49]:
# Score the grid search's selected model on the held-out test split.
gs.score(X_test, y_test)

0.922173274596182

In [51]:
# Test-set predictions and the resulting confusion matrix
# (rows = actual class, columns = predicted class).
y_hat = gs.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
cm

array([[424,  11],
       [ 42, 204]])

In [53]:
# Labelled view of the Naive Bayes confusion matrix.
cm_row_labels = ['Actual World News', 'Actual Onion']
cm_col_labels = ['Pred World News', 'Pred Onion']
pd.DataFrame(data=cm, index=cm_row_labels, columns=cm_col_labels)

Unnamed: 0,Pred World News,Pred Onion
Actual World News,424,11
Actual Onion,42,204


In [309]:
# Put each test title next to its Naive Bayes prediction and its true label.
post_nb = X_test.copy().to_frame().assign(predict=y_hat, actual=y_test)

# Titles the model got wrong (prediction differs from the true label).
is_misclassified = post_nb['predict'] != post_nb['actual']
wrong = post_nb.loc[is_misclassified, 'title']

In [339]:
# Rebuild the features/target and the train/test split (same seed as before).
X, y = posts['title'], posts['is_onion']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Second model: TF-IDF (English stop words removed) feeding an unpenalised
# logistic regression with a raised iteration cap.
logr_steps = [
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('logr', LogisticRegression(max_iter=1000, penalty=None)),
]
pipe_logr = Pipeline(steps=logr_steps)

In [341]:
# Fit the TF-IDF + logistic-regression pipeline on the training split.
pipe_logr.fit(X_train, y_train)

In [205]:
# Score the fitted pipeline on the held-out test split.
pipe_logr.score(X_test, y_test)

0.9295154185022027

In [327]:
# List every tunable parameter name (step__param) available for the search.
pipe_logr.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer(stop_words='english')),
  ('logr', LogisticRegression(max_iter=1000, penalty=None))],
 'verbose': False,
 'tvec': TfidfVectorizer(stop_words='english'),
 'logr': LogisticRegression(max_iter=1000, penalty=None),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': 'english',
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'logr__C': 1.0,
 'logr__class_weight': None,
 'logr__dual': False,
 'logr__fit_intercept': True,
 'logr__intercept_scaling': 1,
 'logr__l1_ratio': 

In [329]:
%%time
# Search space for the TF-IDF + logistic-regression pipeline.
# NOTE: scikit-learn ignores C when penalty is None, so a C value reported in
# best_params_ alongside penalty=None has no effect on the fitted model.
params = {
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (2,2)],
    'tvec__min_df': [1, 2, 3],
    'logr__penalty': [None, 'l2'],
    'logr__C': np.logspace(-2, 1, 100)
}

# Evaluate 11 randomly drawn candidates with 5-fold cross-validation.
# random_state pins which candidates are drawn, so best_params_ and the
# reported scores are reproducible across kernel restarts (same seed as the
# train/test split).
rcv = RandomizedSearchCV(pipe_logr, params, n_iter=11, cv=5, random_state=42)
rcv.fit(X_train, y_train)



CPU times: user 5.77 s, sys: 1.21 s, total: 6.98 s
Wall time: 3.73 s




In [331]:
# Parameter combination selected by the randomized search.
rcv.best_params_

{'tvec__stop_words': None,
 'tvec__ngram_range': (1, 1),
 'tvec__min_df': 2,
 'logr__penalty': None,
 'logr__C': 1.4174741629268048}

In [333]:
# Score the randomized search's selected model on the held-out test split.
rcv.score(X_test, y_test)

0.9295154185022027

In [343]:
# Test-set predictions from the randomized search and their confusion matrix
# (rows = actual class, columns = predicted class).
y_predict = rcv.predict(X_test)
cm_logr = metrics.confusion_matrix(y_true=y_test, y_pred=y_predict)
cm_logr

array([[410,  25],
       [ 23, 223]])

In [345]:
# Labelled view of the logistic-regression confusion matrix.
cm_row_labels = ['Actual World News', 'Actual Onion']
cm_col_labels = ['Pred World News', 'Pred Onion']
pd.DataFrame(data=cm_logr, index=cm_row_labels, columns=cm_col_labels)

Unnamed: 0,Pred World News,Pred Onion
Actual World News,410,25
Actual Onion,23,223


In [347]:
# Put each test title next to its logistic-regression prediction and true label.
post_logr = X_test.copy().to_frame().assign(predict=y_predict, actual=y_test)

In [321]:
# False positives: predicted Onion (1) but actually World News (0).
is_false_positive = post_logr['predict'].eq(1) & post_logr['actual'].eq(0)
FalsePos_logr = post_logr.loc[is_false_positive]

In [325]:
# False negatives: predicted World News (0) but actually Onion (1).
is_false_negative = post_logr['predict'].eq(0) & post_logr['actual'].eq(1)
FalseNeg_logr = post_logr.loc[is_false_negative]
FalseNeg_logr

Unnamed: 0,title,predict,actual
192,This War Will Destabilize The Entire Mideast R...,0,1
763,ChatGPT Keeps Claiming Its Aunt Is Britney Spears,0,1
526,Archaeologists Uncover First Caves Gentrified ...,0,1
411,U.S. Aid To Israel By The Numbers,0,1
35,Smithsonian Under Fire For Collection Of Nazi ...,0,1
106,Is Pop Music Pop or Poop?,0,1
765,Colorado Pastor Claims The Lord Told Him To De...,0,1
211,Police Department Defends Decision To Buy Enti...,0,1
32,A Day In The Life Of Samuel And Martha-Ann Alito,0,1
862,NRA Narrows Search For New Leadership With Rou...,0,1


In [123]:
# Third model: raw token counts feeding a 300-tree random forest with
# out-of-bag scoring enabled.
# random_state fixes the forest's bootstrap samples and feature draws so the
# scores and confusion matrices below are reproducible on re-run (same seed as
# the train/test split).
pipe_cvrt = Pipeline([
    ('cvec', CountVectorizer()),
    ('rt', RandomForestClassifier(n_estimators=300, oob_score=True, random_state=42))
])

In [125]:
# Fit the count-vectorizer + random-forest pipeline on the training split.
pipe_cvrt.fit(X_train, y_train)

In [127]:
# Score the fitted random-forest pipeline on the held-out test split.
pipe_cvrt.score(X_test, y_test)

0.9177679882525698

In [283]:
%%time
# Exhaustive grid for the count-vectorizer + random-forest pipeline:
# n-gram span, stop-word list, features considered per split, and tree depth.
pgrid = dict(
    cvec__ngram_range=[(1, 1), (2, 2)],
    cvec__stop_words=['english', sw],
    rt__max_features=np.arange(1, 21),
    rt__max_depth=[None, 1, 2, 3, 4],
)

# 5-fold cross-validation on 4 parallel workers; fitting starts immediately.
gs2 = GridSearchCV(estimator=pipe_cvrt, param_grid=pgrid, cv=5, n_jobs=4)
gs2.fit(X_train, y_train)

CPU times: user 7.99 s, sys: 513 ms, total: 8.5 s
Wall time: 9min 6s


In [285]:
# Score the random-forest grid search's selected model on the test split.
gs2.score(X_test, y_test)

0.9295154185022027

In [287]:
# Test-set predictions from the random-forest search and their confusion
# matrix (rows = actual class, columns = predicted class).
y_gs2_pred = gs2.predict(X_test)
cm3 = metrics.confusion_matrix(y_true=y_test, y_pred=y_gs2_pred)
cm3

array([[397,  38],
       [ 10, 236]])

In [289]:
# Labelled view of the random-forest confusion matrix.
cm_row_labels = ['Actual World News', 'Actual Onion']
cm_col_labels = ['Pred World News', 'Pred Onion']
pd.DataFrame(data=cm3, index=cm_row_labels, columns=cm_col_labels)

Unnamed: 0,Pred World News,Pred Onion
Actual World News,397,38
Actual Onion,10,236


In [251]:
# Fourth model: TF-IDF (English stop words removed) feeding a 500-tree
# extra-trees classifier.
# random_state fixes the randomised split selection so the scores and
# confusion matrix below are reproducible on re-run (same seed as the
# train/test split).
pipe_et = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('et', ExtraTreesClassifier(n_estimators=500, random_state=42))
])
pipe_et.fit(X_train, y_train)

In [253]:
# Score the fitted extra-trees pipeline on the held-out test split.
pipe_et.score(X_test, y_test)

0.9133627019089574

In [295]:
%%time
# Exhaustive grid for the TF-IDF + extra-trees pipeline: stop-word handling,
# n-gram span, minimum document frequency, and tree depth.
pgrid = dict(
    tvec__stop_words=[None, 'english'],
    tvec__ngram_range=[(1,1), (2,2)],
    tvec__min_df=[1, 2, 3],
    et__max_depth=[None, 1, 2, 3, 4, 5],
)

# 5-fold cross-validation on 4 parallel workers.
gs_et = GridSearchCV(estimator=pipe_et, param_grid=pgrid, cv=5, n_jobs=4)

gs_et.fit(X_train, y_train)

CPU times: user 5.79 s, sys: 223 ms, total: 6.02 s
Wall time: 2min 5s


In [297]:
# Parameter combination selected by the extra-trees grid search.
gs_et.best_params_

{'et__max_depth': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [299]:
# Score the extra-trees grid search's selected model on the test split.
gs_et.score(X_test, y_test)

0.9162995594713657

In [301]:
# Test-set predictions from the extra-trees search and their confusion matrix
# (rows = actual class, columns = predicted class).
y_gset_pred = gs_et.predict(X_test)
cm_et = metrics.confusion_matrix(y_true=y_test, y_pred=y_gset_pred)
cm_et

array([[402,  33],
       [ 24, 222]])

In [303]:
# Labelled view of the extra-trees confusion matrix.
cm_row_labels = ['Actual World News', 'Actual Onion']
cm_col_labels = ['Pred World News', 'Pred Onion']
pd.DataFrame(data=cm_et, index=cm_row_labels, columns=cm_col_labels)

Unnamed: 0,Pred World News,Pred Onion
Actual World News,402,33
Actual Onion,24,222
