# INLS 613: Fake News Detection

## Outline:

1. Import Data
2. Extract Features
3. Split Train and Test
4. Train Models
5. Evaluation

## 1: Data

### 1.1: Read in Data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [4]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [5]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [7]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [8]:
data_class_y= [ 'FAKE', 'REAL']
le= preprocessing.LabelEncoder()
le.fit(data_class_y)
#y should now be an array of labels where 0 is FAKE and 1 is REAL
y=le.transform(df['label']);

#### 1.2.2: Downcase text and title

In [9]:
# # lower takes in an array of strings and converts every string to all lower case
# def lower(arr):
#     out=[]
#     for i in range(len(arr)):
#         out.append(arr[i].lower())
#     return out;

In [10]:
# lower_text=lower(df['text'])
# lower_title=lower(df['title'])
# lower_title[0:5]
# #df['title'][0:5]

# 2: Extract Features

### 2.1: Bag of Words

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [12]:
"""
tfidf:
in: train x
out: tfidf vectorizor, train_x_tfidf_array"""

def to_train_x_tfidf_array (train_x, tf):
#     tf = TfidfVectorizer(min_df=1,stop_words='english',max_features=max_feat, lowercase=True)
    train_x_tfidf = tf.fit_transform(train_x)
    train_x_tfidf_array = train_x_tfidf.toarray()
    return train_x_tfidf_array

"""
create_feature_array:
in: tf (tfidf vectorizor)
out: array of features"""

def create_feature_array(tf):
    return np.array(tf.get_feature_names())

"""
sort_tfidf_array:
in: array (train_x_tfidf)
out: sorted array"""
def sort_tfidf_array(train_x_tfidf):
    return  np.argsort(train_x_tfidf).flatten()[::-1]

"""
top_n:
in: tf (tfidf vectorizor), train_x_tfidf array, n (number of features)
out: top_n features"""

def top_n_features(tf, train_x_tfidf, n):
    feature_array=create_feature_array(tf)
    tfidf_sorting=sort_tfidf_array(train_x_tfidf)
    top_n = feature_array[tfidf_sorting][:n]
    return top_n
    
def title_convert(s):
    String[] words = s.split;
    for i in range(len(words)):
        words[i]="title_"+words[i]
    return " ".join(words)

#### 2.1.1 Text

In [13]:
# tf_text, x_text= tfidf(lower_text, 5000)
# x_text[0]
# x_text.shape

In [15]:
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(df['text'])
text_x_tfidf_array = text_x_tfidf.toarray()

In [16]:
#tf_text.get_feature_names()

In [17]:
# top_n_text_features=top_n_features(tf_text, x_text, 500)

In [18]:
# x_text[top_n_text_features]

#### 2.1.2 Title

In [23]:
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=500, lowercase=True, preprocessor= title_convert)
title_x_tfidf = tf_title.fit_transform(df['title'])
title_x_tfidf_array = title_x_tfidf.toarray()

In [24]:
tf_title.get_feature_names()

['000',
 '10',
 '11',
 '16',
 '20',
 '2015',
 '2016',
 'About',
 'Access',
 'Act',
 'After',
 'Against',
 'Aleppo',
 'All',
 'America',
 'American',
 'Americans',
 'An',
 'And',
 'Another',
 'Anti',
 'Arabia',
 'Are',
 'As',
 'At',
 'Attack',
 'Attacks',
 'Back',
 'Be',
 'Before',
 'Behind',
 'Bernie',
 'Biden',
 'Big',
 'Bill',
 'Black',
 'Boehner',
 'Bush',
 'But',
 'By',
 'CLINTON',
 'California',
 'Calls',
 'Campaign',
 'Can',
 'Carson',
 'Case',
 'Caught',
 'Change',
 'Children',
 'China',
 'Christian',
 'Clinton',
 'Clintons',
 'Comey',
 'Congress',
 'Control',
 'Could',
 'Court',
 'Cruz',
 'DNC',
 'DOJ',
 'Daily',
 'Dakota',
 'Day',
 'Dead',
 'Deal',
 'Debate',
 'Democratic',
 'Democrats',
 'Dems',
 'Department',
 'Did',
 'Director',
 'Do',
 'Don',
 'Donald',
 'Down',
 'Election',
 'Elections',
 'Email',
 'Emails',
 'End',
 'Europe',
 'Exposed',
 'FBI',
 'Facebook',
 'Family',
 'Finds',
 'Finest',
 'Fiorina',
 'First',
 'Florida',
 'For',
 'Foreign',
 'Found',
 'Foundation',
 'F

In [53]:
# tf_title, x_title=tfidf(lower_title, 1000)


In [56]:
top_n_title_features=top_n_features(tf_title, x_title, 100)

In [57]:
top_n_title_features

array(['matters', 'jeb', 'bush', 'trump', 'final', 'ferguson', 'field',
       'fight', 'fighting', 'financial', 'finally', 'feds', 'finds',
       'finest', 'fiorina', 'fired', 'female', 'zika', 'flight', 'federal',
       'fears', 'fear', 'fbi', 'far', 'fans', 'family', 'false', 'fall',
       'fake', 'failure', 'failed', 'facts', 'fix', 'florida', 'faces',
       'focus', 'gives', 'getting', 'gets', 'germany', 'george', 'general',
       'gay', 'gas', 'gary', 'game', 'future', 'funding', 'french',
       'freedom', 'free', 'fraud', 'francis', 'france', 'fox',
       'foundation', 'forward', 'foreign', 'forces', 'forced', 'force',
       'food', 'following', 'fact', 'face', 'facebook', 'dr', 'elect',
       'economy', 'economic', 'easy', 'eastern', 'east', 'earth', 'early',
       'duke', 'drugs', 'drops', 'drone', 'dream', 'donors', 'global',
       'donald', 'don', 'dollar', 'doj', 'doing', 'dog', 'doesn', 'does',
       'dnc', 'discovered', 'director', 'different', 'die', 'elected

In [52]:
tf_title.get_feature_names()

['2016',
 'america',
 'american',
 'americans',
 'anti',
 'attack',
 'bernie',
 'big',
 'black',
 'breaking',
 'bush',
 'calls',
 'campaign',
 'change',
 'clinton',
 'comey',
 'comment',
 'congress',
 'court',
 'cruz',
 'day',
 'dead',
 'deal',
 'debate',
 'democratic',
 'democrats',
 'don',
 'donald',
 'election',
 'email',
 'emails',
 'end',
 'fbi',
 'fight',
 'foreign',
 'gop',
 'government',
 'hillary',
 'house',
 'immigration',
 'investigation',
 'iran',
 'isis',
 'jeb',
 'just',
 'like',
 'make',
 'media',
 'new',
 'news',
 'nuclear',
 'obama',
 'obamacare',
 'party',
 'paul',
 'people',
 'plan',
 'police',
 'political',
 'politics',
 'poll',
 'president',
 'presidential',
 'putin',
 'race',
 'real',
 'really',
 'report',
 'republican',
 'republicans',
 'right',
 'rubio',
 'russia',
 'russian',
 'sanders',
 'say',
 'says',
 'senate',
 'state',
 'stop',
 'supreme',
 'syria',
 'ted',
 'things',
 'time',
 'trump',
 'video',
 'vote',
 'voters',
 'voting',
 'war',
 'watch',
 'white',


## 3: Split Train and Test Data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#split_data splits up the data into training and test sets
#it will return: train_x, test_x, train_y, test_y
#example usage: train_x, test_x, train_y, test_y = split_data(df['text'],y)
def split_data(x_features, y_class):
    return train_test_split(x_features, y_class, train_size=.8, random_state=5)

In [22]:
train_x, test_x, train_y, test_y = train_test_split(df['text'], y, train_size=.8, random_state=5)

In [127]:
#print("fake: "+train_y.tolist().count("0")+"real: "+ train_y.tolist().count(1))

In [128]:
unique, counts = np.unique(train_y, return_counts=True)

In [129]:
##counting distribution of classes in train
dict(zip(unique, counts))

{0: 2527, 1: 2541}

In [130]:
##counting distribution of classes in test
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

{0: 637, 1: 630}

## 4: Models

In [48]:
from sklearn.naive_bayes import MultinomialNB

In [49]:
mnb = MultinomialNB(alpha=1.0) # Check what this alpha value is. You have already learnt most of the math to understand this. 
mnb.fit(train_x_tfidf_array,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
## Test Set Data

## 5: Evaluation