# INLS 613: Fake News Detection

## 0: Import Libraries

In [91]:
import pandas as pd
import numpy as np
import scipy

## 1: Data

### 1.1: Read in Data

In [92]:
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [93]:
df.head(2)
# encoding or wierd characters??

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [94]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [95]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [96]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### Preprocessing

downcasing, tokenization, markup removal

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
count_vect = CountVectorizer(lowercase=True)

In [136]:
X_train_counts = count_vect.fit_transform(df)
X_train_counts.shape
X_train_counts

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

### 1.2: Convert Labels

In [97]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [98]:
data_class_y= [ 'FAKE', 'REAL']
le= preprocessing.LabelEncoder()
le.fit(data_class_y)
#appears to make fake 0 and Real 1 no matter what-why?

LabelEncoder()

In [99]:
y=le.transform(df['label']);

In [100]:
y

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

### 1.3: Split Train and Test Data

In [101]:
from sklearn.model_selection import train_test_split

In [124]:
train_x, test_x, train_y, test_y = train_test_split(df['text'], y, train_size=.8, random_state=5)

In [125]:
train_x.shape

(5068,)

In [126]:
train_y.shape

(5068,)

In [127]:
#print("fake: "+train_y.tolist().count("0")+"real: "+ train_y.tolist().count(1))

In [128]:
unique, counts = np.unique(train_y, return_counts=True)

In [129]:
dict(zip(unique, counts))

{0: 2527, 1: 2541}

In [130]:
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

{0: 637, 1: 630}

### 1.4: Extract Features

In [65]:
train_x = df['text']

In [66]:
train_x.shape

(6335,)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [38]:
tf = TfidfVectorizer(min_df=1,stop_words='english',max_features=5000)

In [39]:
train_x_tfidf = tf.fit_transform(train_x)

In [40]:
tf.get_feature_names()

['00',
 '000',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1948',
 '1960s',
 '1968',
 '1970s',
 '1980',
 '1980s',
 '1988',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2020',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '237',
 '24',
 '25',
 '250',
 '26',
 '27',
 '28',
 '29',
 '30',
 '300',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '400',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '500',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '60',
 '600',
 '61',
 '62',
 '64',
 '65',
 '66',
 '67',
 '68',
 '70',
 '700',
 '75',
 '80',
 '800',
 '85',
 '8th',
 '90',
 '95',
 '99',
 'abandon',
 'abandoned',
 'abc',
 'abdullah',
 'abedin',
 'ability',
 'able'

In [45]:
## Importing a Learning Model

In [46]:
train_x_tfidf_array = train_x_tfidf.toarray()
train_x_tfidf_array[0]


array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [47]:
tf.inverse_transform(train_x_tfidf_array[0]) ## just to check what all features are there. 


[array(['60', 'abedin', 'abuses', 'accused', 'accusing', 'act', 'admits',
        'ads', 'afraid', 'age', 'agency', 'agents', 'ago', 'ahead', 'alive',
        'allegations', 'allies', 'allowed', 'amendment', 'americans',
        'anthony', 'appearance', 'appeared', 'approach', 'article',
        'assault', 'assume', 'attack', 'attacked', 'attacking', 'away',
        'awkward', 'bad', 'badly', 'bathroom', 'behavior', 'belief',
        'believes', 'believing', 'better', 'bigger', 'bizarre', 'born',
        'boston', 'bring', 'bureau', 'buried', 'cable', 'calling',
        'campaign', 'candidate', 'center', 'changed', 'charge', 'claim',
        'claimed', 'claiming', 'classified', 'clinton', 'clintons',
        'closing', 'cnn', 'column', 'come', 'comey', 'comeyâ', 'compared',
        'computer', 'confident', 'continue', 'control', 'corruption',
        'countless', 'country', 'cover', 'credibility', 'crime', 'criminal',
        'currently', 'cycle', 'damaging', 'daniel', 'debate', 'decid

In [48]:
from sklearn.naive_bayes import MultinomialNB

In [49]:
mnb = MultinomialNB(alpha=1.0) # Check what this alpha value is. You have already learnt most of the math to understand this. 
mnb.fit(train_x_tfidf_array,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
## Test Set Data