# INLS 613: Fake News Detection

## Outline:

1. Import Data
2. Extract Features
3. Split Train and Test
4. Train Models
5. Evaluation

## 1: Data

### 1.1: Read in Data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
# Load the news dataset (title, text and FAKE/REAL label per article).
DATA_PATH = "fake_or_real_news.csv/fake_or_real_news.csv"
df = pd.read_csv(DATA_PATH, encoding='utf-8')

In [3]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [4]:
# List the distinct values of the target column.
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [5]:
# Show the column names available in the DataFrame.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [7]:
# Count the rows per label to check how balanced the two classes are.
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [8]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [9]:
# Encode the string labels as integers with a LabelEncoder fitted on the two
# known classes; y is then an array of labels where 0 is FAKE and 1 is REAL.
data_class_y = ['FAKE', 'REAL']
le = preprocessing.LabelEncoder().fit(data_class_y)
y = le.transform(df['label'])

#### 1.2.2: Lowercase text and title

In [56]:
def lower(arr):
    """Return a new list with every string in ``arr`` converted to lower case.

    Parameters
    ----------
    arr : iterable of str
        Strings to convert, e.g. a list or a pandas Series.

    Returns
    -------
    list of str
        The lower-cased strings, in the same order as ``arr``.
    """
    # Iterate over the values directly instead of indexing with range(len(arr)):
    # arr[i] on a pandas Series is a label lookup, which raises KeyError when
    # the index is not 0..n-1 (e.g. after filtering or a train/test split).
    return [text.lower() for text in arr]

In [58]:
# Lower-case the title and body columns, then preview the first five titles.
lower_title = lower(df['title'])
lower_text = lower(df['text'])
lower_title[:5]

## 2: Extract Features

### 2.1: Bag of Words (TF-IDF)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [None]:
def tfidf(train_x, test_x=None, max_features=5000):
    """Turn raw documents into dense TF-IDF feature arrays.

    A new ``TfidfVectorizer`` (English stop words removed, vocabulary limited
    to ``max_features`` terms) is fitted on ``train_x`` on every call.

    Parameters
    ----------
    train_x : iterable of str
        Documents used to fit the vectorizer and to build the first array.
    test_x : iterable of str, optional
        Documents to transform with the vectorizer fitted on ``train_x``.
        When omitted, only the training array is returned.
    max_features : int, default 5000
        Maximum vocabulary size passed to ``TfidfVectorizer``.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        ``train_array`` when ``test_x`` is None, otherwise
        ``(train_array, test_array)``.
    """
    tf = TfidfVectorizer(min_df=1, stop_words='english', max_features=max_features)
    train_x_tfidf_array = tf.fit_transform(train_x).toarray()
    if test_x is None:
        return train_x_tfidf_array
    # Reuse the vocabulary and IDF weights learned from train_x so that the test
    # columns line up with the training columns and no test data leaks into the fit.
    test_x_tfidf_array = tf.transform(test_x).toarray()
    return train_x_tfidf_array, test_x_tfidf_array
    
    

#### 2.1.1 Text

In [None]:
# TF-IDF features for the lower-cased article bodies.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split in section 3 — confirm these features are not used for evaluation.
x_text= tfidf(lower_text)

#### 2.1.2 Title

In [None]:
# TF-IDF features for the lower-cased article titles.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split in section 3 — confirm these features are not used for evaluation.
x_title=tfidf(lower_title)

## 3: Split Train and Test Data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
def split_data(x_features, y_class, train_size=.8, random_state=5):
    """Split features and labels into training and test sets.

    Thin wrapper around ``train_test_split``; the defaults reproduce the
    80/20 split with seed 5 used throughout this notebook.

    Parameters
    ----------
    x_features : array-like
        Feature values (e.g. ``df['text']`` or a TF-IDF array).
    y_class : array-like
        Class labels aligned with ``x_features``.
    train_size : float, default 0.8
        Passed to ``train_test_split`` as the size of the training set.
    random_state : int, default 5
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    list
        ``[train_x, test_x, train_y, test_y]``.

    Example
    -------
    train_x, test_x, train_y, test_y = split_data(df['text'], y)
    """
    return train_test_split(
        x_features, y_class, train_size=train_size, random_state=random_state
    )

In [22]:
# Use the split_data helper defined above rather than repeating its
# train_test_split arguments inline (same 80/20 split, same seed).
train_x, test_x, train_y, test_y = split_data(df['text'], y)

In [127]:
# The class distribution of the training labels is computed with np.unique in the next cells.

In [128]:
# Collect the distinct training labels and how many times each one occurs.
unique, counts = np.unique(train_y, return_counts=True)

In [129]:
# Class distribution of the training labels (label -> count)
{label: count for label, count in zip(unique, counts)}

{0: 2527, 1: 2541}

In [130]:
# Class distribution of the test labels (label -> count)
unique, counts = np.unique(test_y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 637, 1: 630}

## 4: Models

In [48]:
from sklearn.naive_bayes import MultinomialNB

In [49]:
# train_x_tfidf_array has to exist at notebook level (inside tfidf() it is only
# a local variable), so build the training features from train_x here.
# NOTE: each tfidf() call fits a fresh vectorizer, so test features must come
# from the vectorizer fitted on train_x, not from a separate tfidf(test_x) call.
train_x_tfidf_array = tfidf(train_x)

# alpha is the additive (Laplace/Lidstone) smoothing parameter; 1.0 is Laplace smoothing.
mnb = MultinomialNB(alpha=1.0)
mnb.fit(train_x_tfidf_array, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
## Test Set Data

## 5: Evaluation