# INLS 613: Fake News Detection

## Outline:

1. Import Data
2. Extract Features
3. Split Train and Test
4. Train Models
5. Evaluation

## 1: Data

### 1.1: Read in Data

In [2]:
import pandas as pd
import numpy as np
import scipy

In [3]:
# Load the labelled news articles; the cells below use the 'title', 'text' and 'label' columns.
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [4]:
# Preview the first two rows.
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [5]:
# List the distinct values of the target column.
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [6]:
# List the column names of the loaded frame.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [7]:
# Count the rows per class to check how balanced the labels are.
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [8]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [9]:
# Fit the encoder on the two known class names, then map every row's label to
# its integer code. y should now be an array of labels where 0 is FAKE and 1 is REAL.
data_class_y = ['FAKE', 'REAL']
le = preprocessing.LabelEncoder().fit(data_class_y)
y = le.transform(df['label'])

#### 1.2.2: Downcase text and title

In [10]:
def lower(arr):
    """Return a new list with the lower-cased form of every string in ``arr``.

    Args:
        arr: Any iterable of strings (list, tuple, pandas Series, ...).

    Returns:
        list: The lower-cased strings, in iteration order.
    """
    # Iterate over the values directly instead of indexing with range(len(arr)):
    # arr[i] is a label lookup on a pandas Series, so it raises KeyError as soon
    # as the index is not 0..n-1 (for example after rows have been filtered).
    return [s.lower() for s in arr]

In [11]:
# Lower-case the article bodies and the titles; lower() returns plain Python lists.
lower_text=lower(df['text'])
lower_title=lower(df['title'])


Remove English stop words from the lower-cased titles.

In [26]:
import nltk
from nltk.corpus import stopwords
def remove_stops(s, stop_words=None):
    """Remove stop words from a single space-separated string.

    Args:
        s: The text to filter. It is split on single spaces and each piece is
            compared to the stop words exactly (case-sensitive).
        stop_words: Optional collection of words to drop. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        str: The remaining pieces joined back together with single spaces.
    """
    if stop_words is None:
        # Build the stop-word set once per call. The previous version evaluated
        # stopwords.words('english') inside the comprehension, i.e. rebuilt the
        # whole list for every word and then scanned it linearly.
        stop_words = set(stopwords.words('english'))
    return " ".join(word for word in s.split(" ") if word not in stop_words)


def remove_all_stops(a, stop_words=None):
    """Apply ``remove_stops`` to every string in ``a``.

    Args:
        a: Iterable of strings.
        stop_words: Optional collection of words to drop; defaults to NLTK's
            English stop-word list, loaded a single time for the whole batch.

    Returns:
        list: One filtered string per input string, in the same order.
    """
    if stop_words is None:
        # Load once here so the per-string calls below do not reload it.
        stop_words = set(stopwords.words('english'))
    return [remove_stops(s, stop_words) for s in a]


In [27]:
# Strip stop words from every lower-cased title.
title_no_stops= remove_all_stops(lower_title)

In [29]:
# title_no_stops[0:5]

['smell hillary’s fear',
 'watch exact moment paul ryan committed political suicide trump rally (video)',
 'kerry go paris gesture sympathy',
 "bernie supporters twitter erupt anger dnc: 'we tried warn you!'",
 'battle new york: primary matters']

#### 1.2.3 Convert and Combine Title and Text

In [14]:
def title_convert(s):
    """Prefix every space-separated piece of ``s`` with ``title_``.

    The string is split on single spaces, so an empty piece (empty input or
    consecutive spaces) yields a bare ``title_``.
    Presumably the prefix keeps title words distinguishable from body words
    once both are concatenated -- confirm against the feature extraction step.
    """
    return " ".join("title_" + token for token in s.split(" "))

def mult_title_convert(titles):
    """Run ``title_convert`` on each entry of ``titles`` and return the results as a list.

    Entries are read by position (``titles[0]`` .. ``titles[len(titles) - 1]``).
    """
    return [title_convert(titles[idx]) for idx in range(len(titles))]

def combine_title_text(title, text):
    """Join each title with its article body, separated by a single space.

    Both arguments are read by position, and the number of results equals
    ``len(text)``.
    """
    return [title[pos] + " " + text[pos] for pos in range(len(text))]

In [30]:
# Tag the stop-word-free titles with the "title_" prefix, then put each one in
# front of its lower-cased article body to form a single document per article.
prefixed_titles=mult_title_convert(title_no_stops)
combined_text_title=combine_title_text(prefixed_titles, lower_text)
# combined_text_title[0]



## 2: Extract Features

### 2.0: Feature Sets
1. TFIDF of Text and Title
2. TFIDF of Text
3. TFIDF of Title
4. TFIDF of Text and Title + Bigrams
5. TFIDF of Text + Bigrams
6. TFIDF of Title + Bigrams

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer 

### 2.1: TFIDF of Text and Title

In [42]:
# TF-IDF over the combined title+text documents, capped at 2000 vocabulary terms
# (max_features) and using scikit-learn's built-in English stop-word list.
tf_combined = TfidfVectorizer(min_df=1,stop_words='english',max_features=2000, lowercase=True)
combined_tfidf = tf_combined.fit_transform(combined_text_title)
# Dense array copy of the TF-IDF matrix.
combined_tfidf_array = combined_tfidf.toarray()

In [43]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. .tolist() keeps the displayed value a plain list.
if hasattr(tf_combined, "get_feature_names_out"):
    combined_feature_names = tf_combined.get_feature_names_out().tolist()
else:
    combined_feature_names = tf_combined.get_feature_names()
# Show the vocabulary from position 1500 onwards.
combined_feature_names[1500:]

['republicans',
 'request',
 'require',
 'required',
 'research',
 'researchers',
 'reserve',
 'residents',
 'resolution',
 'resources',
 'respect',
 'respond',
 'responded',
 'response',
 'responsibility',
 'responsible',
 'rest',
 'result',
 'results',
 'retired',
 'return',
 'reuters',
 'revealed',
 'review',
 'revolution',
 'rhetoric',
 'rich',
 'richard',
 'rick',
 'rigged',
 'right',
 'rights',
 'rise',
 'rising',
 'risk',
 'rival',
 'rivals',
 'road',
 'robert',
 'rock',
 'role',
 'romney',
 'room',
 'roughly',
 'round',
 'rubio',
 'rule',
 'rules',
 'ruling',
 'run',
 'runner',
 'running',
 'runs',
 'russia',
 'russian',
 'russians',
 'ryan',
 'safe',
 'safety',
 'said',
 'san',
 'sanctions',
 'sanders',
 'saturday',
 'saudi',
 'save',
 'saw',
 'say',
 'saying',
 'says',
 'scale',
 'scalia',
 'scandal',
 'scene',
 'school',
 'schools',
 'science',
 'scientists',
 'scott',
 'sea',
 'search',
 'season',
 'seat',
 'second',
 'secret',
 'secretary',
 'sector',
 'secure',
 'security

### 2.2: TFIDF of Text

In [32]:
# TF-IDF over the lower-cased article bodies only, capped at 1500 vocabulary terms.
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(lower_text)
# Dense array copy of the TF-IDF matrix.
text_x_tfidf_array = text_x_tfidf.toarray()


### 2.3 TFIDF of Title

In [37]:
# TF-IDF over the stop-word-free titles only, capped at 500 vocabulary terms.
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=500, lowercase=True)
title_x_tfidf = tf_title.fit_transform(title_no_stops)
# Dense array copy of the TF-IDF matrix.
title_x_tfidf_array = title_x_tfidf.toarray()

### 2.4 TFIDF Text and Title + Bigram

In [38]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. .tolist() keeps the displayed value a plain list.
if hasattr(tf_title, "get_feature_names_out"):
    title_feature_names = tf_title.get_feature_names_out().tolist()
else:
    title_feature_names = tf_title.get_feature_names()
# Show the vocabulary learned from the titles.
title_feature_names

['000',
 '10',
 '11',
 '12',
 '16',
 '20',
 '2015',
 '2016',
 'abortion',
 'access',
 'act',
 'actually',
 'administration',
 'admits',
 'agenda',
 'ahead',
 'air',
 'al',
 'aleppo',
 'america',
 'american',
 'americans',
 'amid',
 'announces',
 'anti',
 'arabia',
 'assange',
 'assault',
 'attack',
 'attacks',
 'attorney',
 'away',
 'bad',
 'baltimore',
 'battle',
 'benghazi',
 'bernie',
 'best',
 'bid',
 'biden',
 'big',
 'biggest',
 'black',
 'blame',
 'boehner',
 'bombshell',
 'breaking',
 'brexit',
 'budget',
 'bush',
 'california',
 'calls',
 'camp',
 'campaign',
 'cancer',
 'candidate',
 'candidates',
 'care',
 'carson',
 'case',
 'caught',
 'challenge',
 'change',
 'charged',
 'charleston',
 'check',
 'chief',
 'children',
 'china',
 'christian',
 'christie',
 'city',
 'civil',
 'claim',
 'claims',
 'climate',
 'clinton',
 'clintons',
 'close',
 'cnn',
 'collapse',
 'com',
 'come',
 'comey',
 'coming',
 'comment',
 'congress',
 'conservative',
 'control',
 'convention',
 'countr

### 2.5 TFIDF Text + Bigram

### 2.6 TFIDF Title + Bigram

## 3: Split Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def split_data(x_features, y_class, train_size=.8, random_state=5):
    """Split features and labels into training and test sets.

    Args:
        x_features: Samples or feature matrix to split.
        y_class: Labels, passed to ``train_test_split`` alongside ``x_features``.
        train_size: Forwarded to ``train_test_split``; defaults to .8, the value
            that used to be hard-coded.
        random_state: Forwarded to ``train_test_split``; defaults to 5, the value
            that used to be hard-coded, so repeated runs give the same split.

    Returns:
        train_x, test_x, train_y, test_y

    Example:
        train_x, test_x, train_y, test_y = split_data(df['text'], y)
    """
    return train_test_split(x_features, y_class, train_size=train_size, random_state=random_state)

In [None]:
# Split the raw article texts and encoded labels with the split_data helper
# (train_size=.8, random_state=5) rather than repeating its body here.
train_x, test_x, train_y, test_y = split_data(df['text'], y)

In [None]:
#print("fake: "+train_y.tolist().count("0")+"real: "+ train_y.tolist().count(1))

In [None]:
# Distinct encoded labels in the training split and how often each occurs.
unique, counts = np.unique(train_y, return_counts=True)

In [None]:
# Class distribution in the training split as {encoded label: count},
# built from the unique/counts pair computed in the previous cell.
dict(zip(unique, counts))

In [None]:
# Class distribution in the test split as {encoded label: count}.
# Note: this rebinds unique/counts, which previously held the training-split values.
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

## 4: Models

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# train_x_tfidf_array is not defined by any earlier cell, so fitting on it
# raised NameError on a fresh kernel. Build it here from the training texts
# with the text vectorizer fitted in section 2.2.
# NOTE(review): tf_text was fitted on the full corpus, so its IDF weights have
# already seen the test articles; fit a vectorizer on train_x only if a
# leakage-free evaluation is required.
train_x_tfidf_array = tf_text.transform(train_x).toarray()

# alpha is the additive (Laplace/Lidstone) smoothing parameter of the model.
mnb = MultinomialNB(alpha=1.0)
mnb.fit(train_x_tfidf_array, train_y)

In [None]:
## Test Set Data

## 5: Evaluation