In [1]:
from dotenv import load_dotenv
import os
import praw
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
news1 = pd.read_csv('data/2024-06-27 15:03:20.894281_news.csv')
onion1 = pd.read_csv('data/2024-06-26 16:40:52.902750_onion.csv')
news2 = pd.read_csv('data/2024-06-28 15:21:52.863539_news.csv')
onion2 = pd.read_csv('data/2024-06-28 15:21:52.863539_onion.csv')
news3 = pd.read_csv('data/2024-07-02 09:23:01.740295_news.csv')
onion3 = pd.read_csv('data/2024-07-02 09:23:01.740295_onion.csv')
news4 = pd.read_csv('data/2024-07-06 10:57:06.699284_news.csv')
onion4 = pd.read_csv('data/2024-07-06 10:57:06.699284_onion.csv')
news5 = pd.read_csv('data/2024-07-08 17:02:59.738032_news.csv')
onion5 = pd.read_csv('data/2024-07-08 17:02:59.738032_onion.csv')

In [3]:
onion1.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
onion5.shape

(1000, 4)

In [9]:
onion = pd.concat([onion1, onion2, onion3, onion4, onion5])

In [11]:
onion.drop_duplicates('created_utc', inplace=True)

In [13]:
len(onion['created_utc'].unique())

1735

In [15]:
news1.drop(columns=['Unnamed: 0'], inplace=True)

In [17]:
news5.shape

(785, 4)

In [19]:
news = pd.concat([news1, news2, news3, news4, news5])

In [21]:
news.drop_duplicates('created_utc', inplace=True)

In [23]:
len(news['created_utc'].unique())

2510

In [25]:
posts = pd.concat([onion, news])
posts['is_onion'] = posts['subreddit'].map({'worldnews': 0, 'TheOnion': 1})

In [27]:
posts.shape

(4245, 5)

In [29]:
posts.to_csv('data/posts.csv')

In [31]:
X = posts['title']
y = posts['is_onion']

In [33]:
1-y.mean()

0.591283863368669

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [37]:
sw_nltk = stopwords.words('english')
sw = sw_nltk + ['thread', 'worldnews', 'live']

In [39]:
pipe_logr = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('logr', LogisticRegression(max_iter=1000, penalty=None))
])

In [41]:
pipe_logr.fit(X_train, y_train)

In [43]:
pipe_logr.score(X_test, y_test)

0.9293785310734464

In [49]:
%%time
c_grid = np.logspace(-2, 1, 100)
params = [
    {
        'tvec__stop_words': [None, 'english', sw],
        'tvec__ngram_range': [(1,1), (2,2)],
        'tvec__min_df': [1, 2, 3],
        'logr__penalty': ['l1', 'l2'],
        'logr__C': c_grid
    },
    {
        'tvec__stop_words': [None, 'english', sw],
        'tvec__ngram_range': [(1,1), (2,2)],
        'tvec__min_df': [1, 2, 3],
        'logr__penalty': [None]
    }
]
gs_logr = GridSearchCV(pipe_logr, param_grid=params, cv=5, n_jobs=-1)
gs_logr.fit(X_train, y_train)

9000 fits failed out of a total of 18090.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9000 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/b

CPU times: user 59.3 s, sys: 4.81 s, total: 1min 4s
Wall time: 4min 28s


In [53]:
gs_logr.best_params_

{'logr__C': 8.697490026177835,
 'logr__penalty': 'l2',
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [55]:
gs_logr.score(X_test, y_test)

0.9331450094161958

In [57]:
y_predict = gs_logr.predict(X_test)
cm_logr = metrics.confusion_matrix(y_test, y_predict)
cm_logr

array([[580,  22],
       [ 49, 411]])

In [59]:
pd.DataFrame(cm_logr, columns = ['Pred World News', 'Pred Onion'], index = ['Actual World News', 'Actual Onion'])

Unnamed: 0,Pred World News,Pred Onion
Actual World News,580,22
Actual Onion,49,411


In [61]:
post_logr = pd.DataFrame(X_test.copy())
post_logr['predict'] = y_predict
post_logr['actual'] = y_test

In [63]:
FalsePos_logr = post_logr.loc[(post_logr['predict'] == 1) & (post_logr['actual'] == 0)]

In [65]:
FalseNeg_logr = post_logr.loc[(post_logr['predict'] == 0) & (post_logr['actual'] == 1)]

In [109]:
pipe_cvrt = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rt', RandomForestClassifier(n_estimators = 300, oob_score=True))
])

In [111]:
pipe_cvrt.fit(X_train, y_train)

In [113]:
pipe_cvrt.score(X_test, y_test)

0.8926553672316384

In [115]:
%%time
pgrid = {
    'tvec__ngram_range': [(1, 1), (2, 2)],
    'tvec__stop_words': [None, 'english', sw],
    'rt__max_features': np.arange(1, 21),
    'rt__max_depth': [None, 1, 2, 3, 4]
}

gs2 = GridSearchCV(pipe_cvrt, param_grid=pgrid, cv=5, n_jobs=4)
gs2.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


CPU times: user 4min 45s, sys: 2min 28s, total: 7min 13s
Wall time: 22min 36s


In [117]:
gs2.score(X_test, y_test)

0.935969868173258

In [119]:
gs2.best_params_

{'rt__max_depth': None,
 'rt__max_features': 3,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'u

In [121]:
y_gs2_pred = gs2.predict(X_test)
cm3 = metrics.confusion_matrix(y_test, y_gs2_pred)

In [123]:
pd.DataFrame(cm3, columns = ['Pred World News', 'Pred Onion'], index = ['Actual World News', 'Actual Onion'])

Unnamed: 0,Pred World News,Pred Onion
Actual World News,573,29
Actual Onion,39,421


In [67]:
post_rt = pd.DataFrame(X_test.copy())
post_rt['predict'] = y_gs2_pred
post_rt['actual'] = y_test

In [69]:
FalsePos_rt = post_rt.loc[(post_rt['predict'] == 1) & (post_rt['actual'] == 0)]
FalsePos_rt

Unnamed: 0,title,predict,actual
25,Croatia’s tourist hotspots hit by storms with ...,1,0
90,Houthis ban music and singing in weddings,1,0
621,"Japan top court rules transgender woman ""fathe...",1,0
294,Bird flu spreads to tenth Australian poultry farm,1,0
703,White House baffled by Netanyahu's claim Biden...,1,0
65,Whistleblower warned Boeing of flaws in 787 pl...,1,0
109,"'Critical' vulnerability in OpenSSH uncovered,...",1,0
61,Milei’s reforms set to cross finish line in Co...,1,0
316,Dagestan’s muftiate issues temporary fatwa ban...,1,0
89,Over 400 migrants brought to the U.S. by an IS...,1,0


In [71]:
FalseNeg_rt = post_rt.loc[(post_rt['predict'] == 0) & (post_rt['actual'] == 1)]
FalseNeg_rt

Unnamed: 0,title,predict,actual
192,This War Will Destabilize The Entire Mideast R...,0,1
411,U.S. Aid To Israel By The Numbers,0,1
862,NRA Narrows Search For New Leadership With Rou...,0,1
8,Hurricane Ashley,0,1
636,World Leaders On Navalny Death: 'Putin Will Go...,0,1
706,Oglala Sioux Tribe Bans South Dakota Gov. From...,0,1
87,Failed Government Oversight of Paranoid Schizo...,0,1
987,New Law Requires Flight Passengers To Go At Le...,0,1


In [91]:
pipe_et = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('et', ExtraTreesClassifier(n_estimators=500))
])
pipe_et.fit(X_train, y_train)

In [93]:
pipe_et.score(X_test, y_test)

0.9209039548022598

In [95]:
%%time
pgrid = {
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (2,2)],
    'tvec__min_df': [1, 2, 3],
    'et__max_depth': [None, 1, 2, 3, 4, 5],
}

gs_et = GridSearchCV(pipe_et, param_grid=pgrid, cv=5, n_jobs=4)

gs_et.fit(X_train, y_train)

CPU times: user 1min 5s, sys: 48.1 s, total: 1min 53s
Wall time: 4min 13s


In [97]:
gs_et.best_params_

{'et__max_depth': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [99]:
gs_et.score(X_test, y_test)

0.9124293785310734

In [101]:
y_gset_pred = gs_et.predict(X_test)
cm_et = metrics.confusion_matrix(y_test, y_gset_pred)
cm_et

array([[562,  40],
       [ 53, 407]])

In [103]:
pd.DataFrame(cm_et, columns = ['Pred World News', 'Pred Onion'], index = ['Actual World News', 'Actual Onion'])

Unnamed: 0,Pred World News,Pred Onion
Actual World News,562,40
Actual Onion,53,407


In [105]:
post_et = pd.DataFrame(X_test.copy())
post_et['predict'] = y_gset_pred
post_et['actual'] = y_test

In [91]:
FalsePos_et = post_et.loc[(post_et['predict'] == 1) & (post_et['actual'] == 0)]
FalsePos_et

Unnamed: 0,title,predict,actual
103,"Saudi Arabia announces discovery of seven oil,...",1,0
294,Bird flu spreads to tenth Australian poultry farm,1,0
703,White House baffled by Netanyahu's claim Biden...,1,0
96,\nBritish Columbia investigates claims that en...,1,0
52,Albanian authorities strip ethnic Greek mayor ...,1,0
836,Sierra Leone bets on $150 million digital hub ...,1,0
183,"Vatican is going solar, Pope to transition Cit...",1,0
333,Amsterdam overtourism: City moves to ban cruis...,1,0
98,Head of Canada's spy agency announces he's ste...,1,0
80,Czech Republic lifts ban on gay men donating b...,1,0


In [93]:
FalseNeg_et = post_et.loc[(post_et['predict'] == 0) & (post_et['actual'] == 1)]
FalseNeg_et

Unnamed: 0,title,predict,actual
192,This War Will Destabilize The Entire Mideast R...,0,1
763,ChatGPT Keeps Claiming Its Aunt Is Britney Spears,0,1
526,Archaeologists Uncover First Caves Gentrified ...,0,1
390,Cake Left Out In Break Room With No Instructions,0,1
549,Apartment Listing Counts Toilet As Storage,0,1
411,U.S. Aid To Israel By The Numbers,0,1
432,Irresponsible Millennial Wasting Money On Coff...,0,1
765,Colorado Pastor Claims The Lord Told Him To De...,0,1
211,Police Department Defends Decision To Buy Enti...,0,1
559,Authorities Called In Glasgow ‘Willy Wonka’ Ex...,0,1
