# INLS 613: Fake News Detection

## Outline:

1. Import Data
2. Extract Features
3. Split Train and Test
4. Train Models
5. Evaluation

## 1: Data

### 1.1: Read in Data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [4]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [5]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [7]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [8]:
data_class_y= [ 'FAKE', 'REAL']
le= preprocessing.LabelEncoder()
le.fit(data_class_y)
#y should now be an array of labels where 0 is FAKE and 1 is REAL
y=le.transform(df['label']);

#### 1.2.2: Downcase text and title

In [9]:
# lower takes in an array of strings and converts every string to all lower case
def lower(arr):
    out=[]
    for i in range(len(arr)):
        out.append(arr[i].lower())
    return out;

In [10]:
lower_text=lower(df['text'])
lower_title=lower(df['title'])


In [15]:
def title_convert(s):
    words = s.split(" ");
    for i in range(len(words)):
        words[i]="title_"+words[i]
    return " ".join(words)

def mult_title_convert(titles):
    new_titles=[]
    for i in range(len(titles)):
        new_titles.append(title_convert(titles[i]))
    return new_titles

def combine_title_text(title, text):
    out=[]
    for i in range(len(text)):
        out.append(title[i]+text[i])
    return out

In [16]:
prefixed_titles=mult_title_convert(lower_title)

In [17]:
prefixed_titles[0:1]

['title_you title_can title_smell title_hillary’s title_fear']

In [18]:
combined_text_title=combine_title_text(prefixed_titles, lower_text)

In [19]:
combined_text_title[0:2]

 'title_watch title_the title_exact title_moment title_paul title_ryan title_committed title_political title_suicide title_at title_a title_trump title_rally title_(video)google pinterest digg linkedin reddit stumbleupon print delicious pocket tumblr \nthere are two fundamental truths in this world: paul ryan desperately wants to be president. and paul ryan will never be president. today proved it. \nin a particularly staggering example of political cowardice, paul ryan re-re-re-reversed course and announced that he was back on the trump train after all. this was an aboutface from where he was a few weeks ago. he had previously declared he would not be supporting or defending trump after a tape was made public in which trump bragged about assaulting women. suddenly, ryan was appearing at a pro-trump rally and boldly declaring that he already sent in his vote to make him president of the united states. it was a surreal moment. the figurehead of the republican party dosed himself in gaso

# 2: Extract Features

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer 

### 2.1: TFIDF of Text and Title

In [31]:
tf_combined = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
combined_tfidf = tf_combined.fit_transform(combined_text_title)
combined_tfidf_array = combined_tfidf.toarray()

### 2.2: TFIDF of Text

In [32]:
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(lower_text)
text_x_tfidf_array = text_x_tfidf.toarray()


### 2.3 TFIDF of Title

In [33]:
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
title_x_tfidf = tf_title.fit_transform(lower_title)
title_x_tfidf_array = title_x_tfidf.toarray()

## 3: Split Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#split_data splits up the data into training and test sets
#it will return: train_x, test_x, train_y, test_y
#example usage: train_x, test_x, train_y, test_y = split_data(df['text'],y)
def split_data(x_features, y_class):
    return train_test_split(x_features, y_class, train_size=.8, random_state=5)

In [None]:
train_x, test_x, train_y, test_y = train_test_split(df['text'], y, train_size=.8, random_state=5)

In [None]:
#print("fake: "+train_y.tolist().count("0")+"real: "+ train_y.tolist().count(1))

In [None]:
unique, counts = np.unique(train_y, return_counts=True)

In [None]:
##counting distribution of classes in train
dict(zip(unique, counts))

In [None]:
##counting distribution of classes in test
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

## 4: Models

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
mnb = MultinomialNB(alpha=1.0) # Check what this alpha value is. You have already learnt most of the math to understand this. 
mnb.fit(train_x_tfidf_array,train_y)

In [None]:
## Test Set Data

## 5: Evaluation