# INLS 613: Fake News Detection

## Outline:

1. Import Data
2. Extract Features
3. Split Train and Test
4. Train Models
5. Evaluation

## 1: Data

### 1.1: Read in Data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
# Load the news dataset; the CSV file sits inside a directory that has the same name as the file.
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [4]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [5]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [7]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [8]:
# Fit the encoder on the two known class names, then encode the label column.
data_class_y = ['FAKE', 'REAL']
le = preprocessing.LabelEncoder().fit(data_class_y)
# y should now be an array of labels where 0 is FAKE and 1 is REAL
y = le.transform(df['label'])

#### 1.2.2: Downcase text and title

In [9]:
# lower takes in an iterable of strings and converts every string to all lower case
def lower(arr):
    """Return a new list holding the lower-cased version of every string in `arr`.

    The elements are visited by iteration rather than by `arr[i]`, so a pandas
    Series is handled correctly whatever its index labels are (positional
    `arr[i]` lookups are label-based on a Series and fail once the index is no
    longer 0..n-1, e.g. after filtering rows).

    in: arr (list, array or Series of str)
    out: list of str, same length and order as `arr`
    """
    return [text.lower() for text in arr]

In [10]:
# Lower-case the article bodies and the headlines, in that order.
lower_text, lower_title = (lower(df[column]) for column in ('text', 'title'))


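Remove English stop words from the lower-cased titles (the article text keeps its stop words at this stage; `TfidfVectorizer` filters them later).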

In [11]:
import nltk
from nltk.corpus import stopwords
def remove_stops(s):
    """Return `s` with English stop words removed.

    `s` is split on single spaces; every token that exactly matches an entry of
    NLTK's English stop-word list is dropped and the remaining tokens are
    re-joined with single spaces.

    in: s (str)
    out: str
    """
    # Fetch the stop-word list once per call and keep it in a set: the list
    # used to be re-fetched and scanned linearly for every single token.
    english_stops = set(stopwords.words('english'))
    return " ".join(word for word in s.split(" ") if word not in english_stops)

def remove_all_stops(a):
    """Run remove_stops on each element of `a` and return the results as a list."""
    return [remove_stops(a[position]) for position in range(len(a))]


In [12]:
# Strip English stop words from every lower-cased title.
title_no_stops= remove_all_stops(lower_title)

In [13]:
# title_no_stops[0:5]

#### 1.2.3 Convert and Combine Title and Text

In [14]:
def title_convert(s):
    """Prefix every space-separated token of `s` with "title_" and re-join with spaces."""
    return " ".join("title_" + token for token in s.split(" "))

def mult_title_convert(titles):
    """Apply title_convert to each entry of `titles` and return the results as a list."""
    return [title_convert(titles[k]) for k in range(len(titles))]

def combine_title_text(title, text):
    """Join title[i] and text[i] with a single space for every index of `text`.

    The length of `text` drives the loop, so `title` needs at least as many entries.
    """
    return [title[k] + " " + text[k] for k in range(len(text))]

In [15]:
# Tag every title token with "title_" so it stays distinct from the same word in the body,
# then put each prefixed title in front of its lower-cased article text.
prefixed_titles=mult_title_convert(title_no_stops)
combined_text_title=combine_title_text(prefixed_titles, lower_text)
# combined_text_title[0]

## 2: Extract Features

### 2.0: Feature Sets
1. TFIDF of Text and Title
2. TFIDF of Text
3. TFIDF of Title
4. TFIDF of Text and Title + Bigrams
5. TFIDF of Text + Bigrams
6. TFIDF of Title + Bigrams

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer 

### 2.1: TFIDF Title

In [17]:
# TF-IDF over the stop-word-free titles, with the vocabulary capped at 100 terms.
# NOTE(review): the vectorizer is fit on every row before cross-validation, so its
# vocabulary/IDF statistics also see the rows later used as validation folds.
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=100, lowercase=True)
title_x_tfidf = tf_title.fit_transform(title_no_stops)
# Dense copy of the sparse TF-IDF matrix.
title_x_tfidf_array = title_x_tfidf.toarray()
# tf_title.get_feature_names()

### 2.2: TFIDF of Text

In [18]:
# TF-IDF over the lower-cased article text, with the vocabulary capped at 1500 terms.
# NOTE(review): the vectorizer is fit on every row before cross-validation, so its
# vocabulary/IDF statistics also see the rows later used as validation folds.
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(lower_text)
# Dense copy of the sparse TF-IDF matrix.
text_x_tfidf_array = text_x_tfidf.toarray()


### 2.3 TFIDF of Title and Text 

In [19]:
# TF-IDF over "prefixed title + text" documents, with the vocabulary capped at 2000 terms.
# NOTE(review): the vectorizer is fit on every row before cross-validation, so its
# vocabulary/IDF statistics also see the rows later used as validation folds.
tf_combined = TfidfVectorizer(min_df=1,stop_words='english',max_features=2000, lowercase=True)
combined_tfidf = tf_combined.fit_transform(combined_text_title)
# Dense copy of the sparse TF-IDF matrix.
combined_tfidf_array = combined_tfidf.toarray()

### 2.4: TFIDF Combined Vectors (Title and Text Stacked)

In [20]:
# Stack the separately fitted title and text TF-IDF arrays horizontally (title columns first).
combined= np.hstack((title_x_tfidf_array, text_x_tfidf_array))

## 3: Split Train and Test Data

### 3.1: Splitting using train_test_split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
#split_data splits up the data into training and test sets
#it will return: train_x, test_x, train_y, test_y
#example usage: train_x, test_x, train_y, test_y = split_data(df['text'],y)
def split_data(x_features, y_class, train_size=.8, random_state=5):
    """Split features and labels into train and test partitions.

    in:
        x_features (features)
        y_class (labels, aligned with x_features)
        train_size (fraction of rows put in the training set; defaults to .8)
        random_state (seed forwarded to train_test_split; defaults to 5)
    out: train_x, test_x, train_y, test_y, exactly as returned by train_test_split
    """
    return train_test_split(
        x_features,
        y_class,
        train_size=train_size,
        random_state=random_state,
    )

In [23]:
# Use the split_data helper (80% train, random_state=5) instead of repeating its body inline.
train_x, test_x, train_y, test_y = split_data(df['text'], y)

In [24]:
#print("fake: "+train_y.tolist().count("0")+"real: "+ train_y.tolist().count(1))

In [25]:
unique, counts = np.unique(train_y, return_counts=True)

In [26]:
##counting distribution of classes in train
dict(zip(unique, counts))

{0: 2527, 1: 2541}

In [27]:
##counting distribution of classes in test
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

{0: 637, 1: 630}

### 3.2: Splitting data using Stratified Shuffle Split

In [28]:
from sklearn.model_selection import StratifiedShuffleSplit
def sss_split(X,Y):
    """Produce 5 stratified shuffle splits of (X, Y), each holding out half the rows.

    The labels now come from the `Y` argument. Previously the function ignored
    `Y`, read the notebook-level `y` instead, overwrote each split with the next
    one, and returned nothing.

    in:
        X (features; must support indexing with an integer index array, e.g. a NumPy array)
        Y (labels; same indexing requirement, aligned with X)
    out: list of (X_train, X_test, y_train, y_test) tuples, one per split
    """
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    splits = []
    for train_index, test_index in sss.split(X, Y):
        splits.append(
            (X[train_index], X[test_index], Y[train_index], Y[test_index])
        )
    return splits

## 4: Models and Evaluation

In [29]:
from sklearn.model_selection import cross_val_score 

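Reminder of variables:

- `y`: converted labels (0 = FAKE, 1 = REAL)
- `combined_tfidf_array`: TFIDF features of the prefixed titles and text, vectorized together
- `title_x_tfidf_array`: TFIDF features of the titles only
- `text_x_tfidf_array`: TFIDF features of the text only
- `combined`: `title_x_tfidf_array` and `text_x_tfidf_array` stacked side by side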

### 4.0: Cross Validate Method

In [30]:
def cross_val(classifier, x, y, cv):
    """
    name: cross_val
    in:
        classifier (object that implements fit)
        x (features)
        y (labels)
        cv (either number of desired folds or cross validation object)
    out: returns average score from array of scores from cross_val_score
    """
    # The description lives inside the function so help(cross_val) can show it.
    return np.mean(cross_val_score(classifier, x, y, cv=cv))

    

### 4.1: Naive Bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB

#### 4.1.1: TFIDF of Titles

In [32]:
# Multinomial Naive Bayes on the title TFIDF features; the value shown is the mean score for cv=10.
mnb1= MultinomialNB(alpha=1)
cross_val(mnb1, title_x_tfidf_array, y, 10)

0.6632931649088738

#### 4.1.2: TFIDF of Text

In [33]:
# Multinomial Naive Bayes on the text TFIDF features; the value shown is the mean score for cv=10.
mnb2= MultinomialNB(alpha=1)
cross_val(mnb2, text_x_tfidf_array, y, 10)

0.85619072351704395

#### 4.1.3 TFIDF of Text and Titles Combined

In [34]:
# Multinomial Naive Bayes on the jointly vectorized title+text features; mean score for cv=10.
mnb3= MultinomialNB(alpha=1)
cross_val(mnb3, combined_tfidf_array, y, 10)

0.86376967665173687

#### 4.1.4: TFIDF of Title and Text as Stacked Vectors

In [35]:
# Multinomial Naive Bayes on the horizontally stacked title and text TFIDF arrays; mean score for cv=10.
mnb4= MultinomialNB(alpha=1)
cross_val(mnb4, combined, y, 10)

0.85256146742103966

### 4.2: Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

#### 4.2.1: TFIDF of Titles

In [37]:
# Random forest on the title TFIDF features; mean score for cv=10.
# random_state is fixed so the score is reproducible when the notebook is re-run
# (the recorded output came from an unseeded run and may differ slightly).
forest1 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2,random_state=0)
cross_val(forest1, title_x_tfidf_array, y, 10)

0.6956599434930546

#### 4.2.2: TFIDF of Text

In [38]:
# Random forest on the text TFIDF features; mean score for cv=10.
# random_state is fixed so the score is reproducible when the notebook is re-run
# (the recorded output came from an unseeded run and may differ slightly).
forest2 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2,random_state=0)
cross_val(forest2, text_x_tfidf_array, y, 10)

0.87245024210731559

#### 4.2.3 TFIDF of Text and Titles Combined

In [39]:
# Random forest on the jointly vectorized title+text features; mean score for cv=10.
# random_state is fixed so the score is reproducible when the notebook is re-run
# (the recorded output came from an unseeded run and may differ slightly).
forest3 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2,random_state=0)
cross_val(forest3, combined_tfidf_array, y, 10)

0.87024353291897838

#### 4.2.4: TFIDF of Title and Text as Stacked Vectors

In [40]:
# Random forest on the horizontally stacked title and text TFIDF arrays; mean score for cv=10.
# random_state is fixed so the score is reproducible when the notebook is re-run
# (the recorded output came from an unseeded run and may differ slightly).
forest4 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2,random_state=0)
cross_val(forest4, combined, y, 10)

0.86803084663454988

In [41]:
"""
IDEAS:
put all feature sets in an array
make a features method that takes in diff tuning parameters
make a method that takes in a model and tests it on different featuresets and diff parameters and prints results in chart
""" 

'\nIDEAS:\nput all feature sets in an array\nmake a features method that takes in diff tuning parameters\nmake a method that takes in a model and tests it on different featuresets and diff parameters and prints results in chart\n'