In [68]:
from dotenv import load_dotenv
import os
import praw
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [70]:
news1 = pd.read_csv('data/2024-06-27 15:03:20.894281_news.csv')
onion1 = pd.read_csv('data/2024-06-26 16:40:52.902750_onion.csv')
news2 = pd.read_csv('data/2024-06-28 15:21:52.863539_news.csv')
onion2 = pd.read_csv('data/2024-06-28 15:21:52.863539_onion.csv')
news3 = pd.read_csv('data/2024-07-02 09:23:01.740295_news.csv')
onion3 = pd.read_csv('data/2024-07-02 09:23:01.740295_onion.csv')
news4 = pd.read_csv('data/2024-07-06 10:57:06.699284_news.csv')
onion4 = pd.read_csv('data/2024-07-06 10:57:06.699284_onion.csv')

In [72]:
onion1.drop(columns=['Unnamed: 0'], inplace=True)

In [82]:
onion4.shape

(999, 4)

In [84]:
onion = pd.concat([onion1, onion2, onion3, onion4])

In [86]:
len(onion['created_utc'].unique())

999

In [88]:
onion.drop_duplicates('created_utc', inplace=True)

In [90]:
len(onion['created_utc'].unique())

999

In [92]:
news1.drop(columns=['Unnamed: 0'], inplace=True)

In [94]:
news4.shape

(866, 4)

In [96]:
news = pd.concat([news1, news2, news3, news4])

In [98]:
len(news['created_utc'].unique())

1725

In [100]:
news.drop_duplicates('created_utc', inplace=True)

In [102]:
len(news['created_utc'].unique())

1725

In [104]:
posts = pd.concat([onion, news])
posts['is_onion'] = posts['subreddit'].map({'worldnews': 0, 'TheOnion': 1})

In [106]:
posts.shape

(2724, 5)

In [108]:
posts.to_csv('data/posts.csv')

In [110]:
X = posts['title']
y = posts['is_onion']

In [112]:
1-y.mean()

0.6332599118942731

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [114]:
sw_nltk = stopwords.words('english')
sw = sw_nltk + ['thread', 'worldnews', 'live']

In [116]:
pipe_tvnb = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

pipe_tvnb.fit(X_train, y_train)

In [118]:
params = {
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tvec__stop_words': [None, 'english', sw],
    'tvec__max_features': [None, 2000, 5000, 10000, 20000]
}

gs = GridSearchCV(pipe_tvnb, param_grid=params, cv=5, n_jobs=4)

In [120]:
%%time
gs.fit(X_train, y_train)

CPU times: user 320 ms, sys: 77.7 ms, total: 398 ms
Wall time: 5.86 s


In [122]:
gs.score(X_test, y_test)

0.90625

In [124]:
y_hat = gs.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_hat)
cm

array([[322,  12],
       [ 42, 200]])

In [126]:
pd.DataFrame(cm, columns = ['Pred World News', 'Pred Onion'], index = ['Actual World News', 'Actual Onion'])

Unnamed: 0,Pred World News,Pred Onion
Actual World News,322,12
Actual Onion,42,200


In [128]:
pipe_cvrt = Pipeline([
    ('cvec', CountVectorizer()),
    ('rt', RandomForestClassifier(n_estimators = 300, oob_score=True))
])

In [130]:
pipe_cvrt.fit(X_train, y_train)

In [132]:
pipe_cvrt.score(X_test, y_test)

0.8767361111111112

In [134]:
y_p = pipe_cvrt.predict(X_test)
cm3 = metrics.confusion_matrix(y_test, y_p)
cm3

array([[291,  43],
       [ 28, 214]])

In [136]:
pipe_cvrt.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('rt', RandomForestClassifier(n_estimators=300, oob_score=True))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'rt': RandomForestClassifier(n_estimators=300, oob_score=True),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'rt__bootstrap': True,
 'rt__ccp_alpha': 0.0,
 'rt__class_weight': None,
 'rt__criterion': 'gini',
 'rt__max_depth': None,
 'rt__max_features': 'sqrt',
 'rt__max_leaf_nodes': None,
 'rt__max_samples': None,
 'rt__min_impurity_decrease': 0.0,
 'rt__min_samples_leaf': 1,
 'rt__min_sa

In [138]:
pgrid = {
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__stop_words': [None, 'english', sw],
    'rt__max_features': np.arange(1, 31),
    'rt__max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8],
    'rt__min_samples_leaf': np.arange(1, 31)
}

gs2 = GridSearchCV(pipe_cvrt, param_grid=pgrid, cv=5, n_jobs=2)

In [140]:
gs2.fit(X_train, y_train)

KeyboardInterrupt: 

In [91]:
gs2s.score(X_test, y_test)

0.6180555555555556

In [49]:
pipe_et = Pipeline([
    ('cvec', CountVectorizer()),
    ('et', ExtraTreesClassifier(n_estimators=500))
])
pipe_et.fit(X_train, y_train)

In [57]:
pipe_et.score(X_test, y_test)

0.9166666666666666

In [59]:
y_pred = pipe_et.predict(X_test)
cm2 = metrics.confusion_matrix(y_test, y_pred)
cm2

array([[299,  35],
       [ 13, 229]])

In [65]:
pd.DataFrame(cm2, columns = ['Pred World News', 'Pred Onion'], index = ['Actual World News', 'Actual Onion'])

Unnamed: 0,Pred World News,Pred Onion
Actual World News,299,35
Actual Onion,13,229


In [53]:
pgrid = {
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__stop_words': [None, 'english', sw],
    'et__max_features': np.arange(1, 31),
    'et__max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8],
    'et__min_samples_leaf': np.arange(1, 31)
}

gs_et = GridSearchCV(pipe_et, param_grid=pgrid, cv=5, n_jobs=2)

In [63]:
%%time
#gs_et.fit(X_train, y_train)

CPU times: user 6 µs, sys: 103 µs, total: 109 µs
Wall time: 61.8 µs
