# Fake News or Real News

---

## Load in our data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
# Load the real-news articles from ./data/true.csv and preview the first rows.
true = pd.read_csv('./data/true.csv')
true.head()

In [None]:
# Count missing values per column.
true.isnull().sum()

## Finding distribution of title word counts

In [None]:
true['title_word_count'] = true['title'].map(lambda x: len(x.split(' ')))

In [None]:
plt.hist(true['title_word_count'], bins = 15, color = 'g')
plt.title('Distribution of Title Word Counts for Real News')
plt.xlabel('Word Count')
plt.ylabel('Number of Titles');

In [None]:
# Load the fake-news articles from ./data/fake.csv and preview the first rows.
fake = pd.read_csv('./data/fake.csv')
fake.head()

In [None]:
# Count missing values per column.
fake.isnull().sum()

In [None]:
fake['title_word_count'] = fake['title'].map(lambda x: len(x.split(' ')))

In [None]:
plt.hist(fake['title_word_count'], bins = 15, color = 'salmon')
plt.title('Distribution of Title Word Counts for Fake News')
plt.xlabel('Word Count')
plt.ylabel('Number of Titles');

In [None]:
def most_freq(df):
    """Return the ten most frequent non-stop-words across ``df['title']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'title'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        At most ten rows sorted by descending frequency. Column ``0`` holds
        the word and column ``1`` its total count over all titles.
    """
    cvec = CountVectorizer(stop_words='english')
    counts = cvec.fit_transform(df['title'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement and fall back only on older releases.
    if hasattr(cvec, 'get_feature_names_out'):
        vocab = cvec.get_feature_names_out()
    else:
        vocab = cvec.get_feature_names()

    # Sum each vocabulary column directly on the sparse matrix instead of
    # materialising a dense (documents x vocabulary) frame first.
    totals = np.asarray(counts.sum(axis=0)).ravel()

    # sorted() is stable, so words with equal counts keep vocabulary order.
    ranked = sorted(zip(vocab, totals), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked).head(10)

In [None]:
# Ten most frequent title words in the real-news set (displayed below the cell).
common_true = most_freq(true)
common_true

In [None]:
# Same ranking for the fake-news set.
common_fake = most_freq(fake)

In [None]:
# code inspired by 4.05 classification metrics

# Overlay the most frequent title words of both classes on one bar chart.
plt.figure(figsize=(10, 7))

# (top-word frame, bar colour, legend label) for each class, drawn in this order.
series = (
    (common_true, 'g', 'Real news'),
    (common_fake, 'salmon', 'Fake news'),
)
for top_words, colour, legend_label in series:
    # Column 0 holds the words, column 1 their counts.
    plt.bar(x=top_words[0],
            height=top_words[1],
            color=colour,
            alpha=0.6,
            label=legend_label)

plt.xlabel('Words')
plt.ylabel('Word Count')
plt.xticks(rotation=45)
plt.title('Common Words Used in Real and Fake News', fontsize=18)

plt.legend(fontsize=14);

## Concatenate the two DataFrames

In [None]:
# Label each source before merging: 1 marks real news, 0 marks fake news.
true['category'] = 1

In [None]:
fake['category'] = 0

In [None]:
# Stack the real and fake articles into a single frame.
# NOTE(review): ignore_index is not passed, so each frame presumably keeps its
# own row labels and the combined index may contain duplicates — confirm
# before relying on label-based operations on df.
df = pd.concat([true, fake])

In [None]:
# (rows, columns) of the combined frame.
df.shape

## Data cleaning

In [None]:
df = df.loc[df['date']!= 'https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/',]
df = df.loc[df['date']!= 'https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/']
df = df.loc[df['date']!= 'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/']
df = df.loc[df['date']!= 'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg']
df = df.loc[df['date']!= 'https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg']

In [None]:
# Dropped a row with a 'date' url
df.drop([18933], inplace=True)

In [None]:
# Converted 'date' to a datetime pandas format
# Parse the cleaned 'date' strings into pandas datetimes (overwrites the column).
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Created another column for weekday
# (integer day-of-week taken from the parsed dates).
df['weekday'] = df['date'].dt.weekday

## Train/test split

In [None]:
# The article title is the only feature; the real/fake label is the target.
X = df['title']
y = df['category']

In [None]:
# Stratify on y so both splits keep the same class balance; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [None]:
# Sanity check: training features and labels have matching lengths.
X_train.shape, y_train.shape

In [None]:
# Sanity check: test features and labels have matching lengths.
X_test.shape, y_test.shape

In [None]:
# Class proportions in the test set.
y_test.value_counts(normalize=True)

## Model #1

- `PorterStemmer()` and `CountVectorizer()`
- `LogisticRegression()`

The `PorterStemmer` code is based on [this StackOverflow question](https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn).

In [None]:
# Stemmer applied to every token, plus the analyzer of a CountVectorizer built
# with no arguments, which `porter` uses to produce those tokens.
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

In [None]:
def porter(text):
    """Tokenize ``text`` and stem every token.

    Tokens come from the module-level ``analyzer`` and each one is passed
    through the module-level ``stemmer``. The result is a lazy generator of
    stemmed tokens, suitable as a callable ``analyzer`` for a vectorizer.
    """
    tokens = analyzer(text)
    return (stemmer.stem(token) for token in tokens)

In [None]:
# Model 1: stemmed bag-of-words counts fed into a logistic regression.
# NOTE(review): `porter` is passed as a callable analyzer. Per the scikit-learn
# docs, `stop_words` only applies when analyzer == 'word', so it is presumably
# ignored here and no stop words are removed — confirm.
pipe = Pipeline([
    ('cvec', CountVectorizer(analyzer=porter, stop_words='english')),
    ('logreg', LogisticRegression(max_iter=1000, solver='liblinear'))
])

In [None]:
# Fit the vectorizer and classifier on the training titles.
pipe.fit(X_train, y_train)

In [None]:
# Score on the training split.
pipe.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
pipe.score(X_test, y_test)

## Model #2

- `CountVectorizer()` without `PorterStemmer()`
- `LogisticRegression()`

In [None]:
# Model 2: plain (unstemmed) word counts with English stop words removed,
# fed into a logistic regression with the default solver.
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')),
    ('logreg', LogisticRegression(max_iter=1000))
])

In [None]:
# Fit on the training titles; the trailing semicolon suppresses the cell output.
pipe.fit(X_train, y_train);

In [None]:
# Score on the training split.
pipe.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
pipe.score(X_test, y_test)

## Model #3

Lucas's code

- `TfidfVectorizer()`
- `LogisticRegression()`

In [None]:
# Model 3: TF-IDF features fed into a liblinear logistic regression.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver = 'liblinear', random_state=42))
])

In [None]:
# Every entry holds a single value, so the grid contains exactly one
# configuration; GridSearchCV is used here for its 5-fold cross-validation.
pipe_params = {
    'tfidf__ngram_range': [(1,2)],
    'tfidf__stop_words': ['english'],    
    'logreg__penalty': [ 'l2'],
    'logreg__C': [ 10],
    'logreg__max_iter' : [ 1000]
    
}

gs = GridSearchCV(pipe,
                  param_grid = pipe_params,
                  cv=5,
                  scoring = 'accuracy',
                  verbose = 1)

gs.fit(X_train, y_train)

# Report the cross-validated accuracy, the selected parameters and the
# accuracy on the held-out test split.
print(f'Best cross validation score: {gs.best_score_}')
print(f'Best parameters to use: {gs.best_params_}')
print(f'Testing score: {gs.score(X_test, y_test)}')

## Model #4

- `PorterStemmer()`
- `TfidfVectorizer()`
- `LogisticRegression()`

In [None]:
# Rebind the globals read by `porter`: a fresh stemmer, and the analyzer of a
# TfidfVectorizer built with no arguments.
stemmer = PorterStemmer()
analyzer = TfidfVectorizer().build_analyzer()

In [None]:
def porter(text):
    """Tokenize ``text`` and stem every token.

    Tokens come from the module-level ``analyzer`` and each one is passed
    through the module-level ``stemmer``. The result is a lazy generator of
    stemmed tokens, suitable as a callable ``analyzer`` for a vectorizer.
    """
    tokens = analyzer(text)
    return (stemmer.stem(token) for token in tokens)

In [None]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer=porter)),
    ('logreg', LogisticRegression(solver = 'liblinear'))
])

In [None]:
pipe_params = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [12_000],
    'tfidf__ngram_range': [(1, 2)],
    'logreg__penalty': ['l2'],
    'logreg__C': [15],
    'logreg__max_iter' : [1000]
    
}

gs = GridSearchCV(pipe,
                  param_grid = pipe_params,
                  cv=5,
                  scoring = 'accuracy',
                  verbose = 1)

gs.fit(X_train, y_train)

print(f'Best cross validation score: {gs.best_score_}')
print(f'Best parameters to use: {gs.best_params_}')
print(f'Testing score: {gs.score(X_test, y_test)}')

## Model #5

- `PorterStemmer()`
- `TfidfVectorizer()`
- `RandomForestClassifier()`

In [None]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer=porter)),
    ('rf', RandomForestClassifier(random_state = 42))
])

In [None]:
params = {
    'rf__n_estimators': [100],
    'rf__max_depth': [None, 1, 2],
    'rf__max_features': ['auto', 'log2']
}

gs = GridSearchCV(pipe,
                  param_grid=params,
                  cv=2,
                  scoring='accuracy',
                  verbose=1)

gs.fit(X_train, y_train)

print(f'Best cross validation score: {gs.best_score_}')
print(f'Best parameters to use: {gs.best_params_}')
print(f'Testing score: {gs.score(X_test, y_test)}')

## Results

| Estimators/Classifiers | Model 1 | Model 2 | Model 3 | Model 4 | Model 5 |
|-|:-:|:-:|:-:|:-:|:-:|
| `PorterStemmer()` | X |  |  | X | X |
| `CountVectorizer()` | X | X |  |  |  |
| `TfidfVectorizer()` |  |  | X | X | X |
| `LogisticRegression()` | X | X | X | X |  |
| `RandomForestClassifier()` |  |  |  |  | X |
| Train Score: | 0.9849 | 0.9840 | 0.9505 | 0.9563 | 0.9504 |
| Test Score: | **0.9660** | 0.9529 | 0.9588 | 0.9643 | 0.9623 |

**Hyperparameters used in best score:**

| Estimator/Transformer | Hyperparameter | Set to: |
|-|-|-|
| `CountVectorizer()` | `stop_words` | `english` |
| `LogisticRegression()` | `max_iter` | 1000 |
| `LogisticRegression()` | `solver` | `liblinear` |