# INLS 613 Final Project: Fake News

## 1: Data

### 1.1: Read in Data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [3]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [4]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [5]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [7]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [8]:
# Encode the string labels as integers. LabelEncoder orders classes
# alphabetically, so FAKE maps to 0 and REAL maps to 1.
data_class_y = ['FAKE', 'REAL']
le = preprocessing.LabelEncoder().fit(data_class_y)
y = le.transform(df['label'])

In [9]:
df_x=df.drop(['label'], axis=1)

In [10]:
# Build df_x directly from df, dropping the label and the leftover CSV index
# column in one step. The previous `df_x = df_x.drop(...)` form raised KeyError
# when the cell was re-run, because the column had already been removed.
df_x = df.drop([df.columns[0], 'label'], axis=1)

In [11]:
df_x.head()

Unnamed: 0,title,text
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello..."
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...


## 2: Split Train and Test Data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_x, test_x, train_y, test_y = train_test_split(df_x, y, train_size=.8, random_state=5)

## 3: Extract Features From Train X

### 3.1: Downcase

In [14]:
def lower(arr):
    """Return a new list with every string in ``arr`` converted to lower case.

    Parameters
    ----------
    arr : iterable of str
        Strings to downcase. Elements are visited in iteration order rather
        than by ``arr[i]`` lookup, so lists, tuples, NumPy arrays, generators
        and pandas Series with a shuffled index all work.

    Returns
    -------
    list of str
        The lower-cased strings, in the same order as the input.
    """
    return [text.lower() for text in arr]

### 3.2: Remove stop words

In [15]:
import nltk
from nltk.corpus import stopwords
def remove_stops(s):
    """Remove English stop words from a space-delimited string.

    Parameters
    ----------
    s : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    str
        The tokens of ``s`` that are not in NLTK's English stop-word list,
        re-joined with single spaces. Matching is exact (case-sensitive).
    """
    # Fetch the stop-word list once per call and keep it in a set; the previous
    # version re-evaluated stopwords.words('english') for every single token.
    english_stops = set(stopwords.words('english'))
    return " ".join(word for word in s.split(" ") if word not in english_stops)

def remove_all_stops(a):
    """Apply ``remove_stops`` to each element of ``a`` and return the results as a list."""
    return [remove_stops(a[pos]) for pos in range(len(a))]


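Call the downcasing function on the title and text columns and store the results in `processed_text` and `processed_title`. Stop words are not removed at this step; the `TfidfVectorizer`s in section 4 are built with `stop_words='english'` and drop them during vectorization.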

Train:

In [19]:
processed_text= lower(train_x['text'].values)
processed_title = lower(train_x['title'].values)

Test:

In [26]:
test_processed_text= lower(test_x['text'].values)
test_processed_title = lower(test_x['title'].values)

### 3.3 Add Prefix to Title Words and Concat Title and Text

In [27]:
def title_convert(s):
    """Prefix every space-separated token of ``s`` with ``title_``.

    The string is split on single spaces, so consecutive spaces yield empty
    tokens that still receive the prefix.
    """
    return " ".join("title_" + token for token in s.split(" "))

def mult_title_convert(titles):
    """Run ``title_convert`` on each title and return the prefixed titles as a list."""
    return [title_convert(titles[pos]) for pos in range(len(titles))]

def combine_title_text(title, text):
    """Join each title with its text, separated by a single space.

    The number of output rows is driven by ``len(text)``; ``title`` must have
    at least that many elements.
    """
    return [title[pos] + " " + text[pos] for pos in range(len(text))]

Train:

In [29]:
#convert titles to have prepended prefix
prefixed_titles=mult_title_convert(processed_title)

In [30]:
#combine preprocessed prefixed title and preprocessed text
combined_text_title=combine_title_text(prefixed_titles, processed_text)

Test:

In [31]:
#convert titles to have prepended prefix
test_prefixed_titles=mult_title_convert(test_processed_title)

In [32]:
#combine preprocessed prefixed title and preprocessed text
test_combined_text_title=combine_title_text(test_prefixed_titles, test_processed_text)

## 4: Extract Features

### Feature Functions

### 4.0: Feature Sets
1. F1: TFIDF Title
2. F2: TFIDF of Text
3. F3: TFIDF of Concatenated Title and Text 
4. F4: TFIDF of Combined Vectors of Individual TFIDF of Text and Title


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer 

### 4.1: TFIDF Title

The title TF-IDF vectorizer is fit on, and transforms, the downcased training titles (`processed_title`); English stop words are dropped by the vectorizer itself via `stop_words='english'`. The fitted vectorizer is stored in `tf_title`, and the training feature matrix is stored in `title_x_tfidf_array`.

In [36]:
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=50, lowercase=True)
title_x_tfidf = tf_title.fit_transform(processed_title)
title_x_tfidf_array = title_x_tfidf.toarray()
#tf_title.get_feature_names()

### 4.2: TFIDF of Text

The text TF-IDF vectorizer is fit on, and transforms, the processed training text (`processed_text`). The fitted vectorizer is stored in `tf_text`, and the training feature matrix is stored in `text_x_tfidf_array`.

In [39]:
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(processed_text)
text_x_tfidf_array = text_x_tfidf.toarray()
#tf_text.get_feature_names()


### 4.3 TFIDF of Concatenated Title and Text

In [40]:
tf_combined = TfidfVectorizer(min_df=1,stop_words='english',max_features=600, lowercase=True)
combined_tfidf = tf_combined.fit_transform(combined_text_title)
combined_tfidf_array = combined_tfidf.toarray()

### 4.4 TFIDF Combined Vectors

In [None]:
combined= np.hstack((title_x_tfidf_array, text_x_tfidf_array))

## 5: Training and Testing Models

### 5.1: Naive Bayes

#### 5.1.1: Naive Bayes and F1

Train:

Test:

#### 5.1.2: Naive Bayes and F2

Train:

Test:

#### 5.1.3: Naive Bayes and F3

Train:

Test:

#### 5.1.4: Naive Bayes and F4

Train:

Test:

### 5.2: Random Forest

#### 5.2.1: Random Forest and F1

Train:

Test:

#### 5.2.2: Random Forest and F2

Train:

Test:

#### 5.2.3: Random Forest and F3

Train:

Test:

#### 5.2.4: Random Forest and F4

Train:

Test:

### 5.3: Logistic Regression

#### 5.3.1: Logistic Regression and F1

Train:

Test:

#### 5.3.2: Logistic Regression and F2

Train:

Test:

#### 5.3.3: Logistic Regression and F3

Train:

Test:

#### 5.3.4: Logistic Regression and F4

Train:

Test:

### 5.4: SVM

#### 5.4.1: SVM and F1

Train:

Test:

#### 5.4.2: SVM and F2

Train:

Test:

#### 5.4.3: SVM and F3

Train:

Test:

#### 5.4.4: SVM and F4

Train:

Test:

In [None]:
######################### end

In [None]:
from sklearn.model_selection import cross_val_score 

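Reminder of variables:
- `y`: encoded labels for the full dataset (0 = FAKE, 1 = REAL)
- `train_y`: encoded labels for the training split, row-aligned with the TF-IDF arrays below
- `combined_tfidf_array`: TF-IDF feature array of the concatenated titles and text (training split)
- `title_x_tfidf_array`: TF-IDF feature array of the titles (training split)
- `text_x_tfidf_array`: TF-IDF feature array of the text (training split)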

### 4.0: Cross Validate Method

In [None]:
def cross_val(classifier, x, y, cv):
    """Return the mean cross-validation score of ``classifier`` on ``(x, y)``.

    Parameters
    ----------
    classifier : object that implements ``fit``
    x : features
    y : labels
    cv : either the number of desired folds or a cross-validation object

    Returns
    -------
    The average of the score array returned by ``cross_val_score``.
    """
    fold_scores = cross_val_score(classifier, x, y, cv=cv)
    return np.mean(fold_scores)

    

### 4.1: Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

#### 4.1.1: TFIDF of Titles

In [None]:
# title_x_tfidf_array was fit on the training split only, so it must be scored
# against train_y; the full-dataset y has a different number of rows.
mnb1 = MultinomialNB(alpha=1)
cross_val(mnb1, title_x_tfidf_array, train_y, 10)

#### 4.1.2: TFIDF of Text

In [None]:
# text_x_tfidf_array was fit on the training split only, so it must be scored
# against train_y; the full-dataset y has a different number of rows.
mnb2 = MultinomialNB(alpha=1)
cross_val(mnb2, text_x_tfidf_array, train_y, 10)

#### 4.1.3 TFIDF of Text and Titles Combined

In [None]:
# combined_tfidf_array was fit on the training split only, so it must be scored
# against train_y; the full-dataset y has a different number of rows.
mnb3 = MultinomialNB(alpha=1)
cross_val(mnb3, combined_tfidf_array, train_y, 10)

#### 4.1.4: testing out trying to combine two vectors

In [None]:
# `combined` stacks the title and text TF-IDF arrays of the training split, so
# it must be scored against train_y rather than the full-dataset y.
mnb4 = MultinomialNB(alpha=1)
cross_val(mnb4, combined, train_y, 10)

#### 4.1.5 video feature set

In [None]:
# NOTE(review): video_feature_set_arr is only assigned in the "2.5: Video" cell
# near the end of this notebook, so that cell has to be executed before this one
# on a fresh kernel. It is built from the full df, which is why the full y is used.
mnb5= MultinomialNB(alpha=1)
cross_val(mnb5, np.asarray(video_feature_set_arr), y, 10)

#### 4.1.6 External link feature set

In [None]:
# Evaluate the external-link feature. This cell previously re-ran the video
# feature (video_feature_set_arr) by copy-paste.
# link_feature_set_arr is assigned in the "2.6: External Sources" cell near the
# end of this notebook and is built from the full df, hence the full y here.
mnb6 = MultinomialNB(alpha=1)
cross_val(mnb6, np.asarray(link_feature_set_arr), y, 10)

### 4.2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

#### 4.2.1: TFIDF of Titles

In [None]:
# Score against train_y: title_x_tfidf_array only contains the training rows.
# random_state is fixed so the forest's score is reproducible across runs.
forest1 = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_leaf=2, random_state=0)
cross_val(forest1, title_x_tfidf_array, train_y, 10)

#### 4.2.2: TFIDF of Text

In [None]:
# Score against train_y: text_x_tfidf_array only contains the training rows.
# random_state is fixed so the forest's score is reproducible across runs.
forest2 = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_leaf=2, random_state=0)
cross_val(forest2, text_x_tfidf_array, train_y, 10)

#### 4.2.3 TFIDF of Text and Titles Combined

In [None]:
# Score against train_y: combined_tfidf_array only contains the training rows.
# random_state is fixed so the forest's score is reproducible across runs.
forest3 = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_leaf=2, random_state=0)
cross_val(forest3, combined_tfidf_array, train_y, 10)

#### 4.2.4: testing out trying to combine two vectors

In [None]:
# Score against train_y: `combined` only contains the training rows.
# random_state is fixed so the forest's score is reproducible across runs.
forest4 = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_leaf=2, random_state=0)
cross_val(forest4, combined, train_y, 10)

In [None]:
"""
IDEAS:
put all feature sets in an array
make a features method that takes in diff tuning parameters
make a method that takes in a model and tests it on different featuresets and diff parameters and prints results in chart
""" 

### 4.5: Support Vector Machine

In [None]:
from sklearn import svm
from sklearn.svm import SVC

#### 4.5.1 TFIDF of Titles

In [None]:
# title_x_tfidf_array holds training rows only, so pair it with train_y
# (the full-dataset y has a different length and cross_val_score would raise).
svm1 = SVC(gamma='auto')
cross_val(svm1, title_x_tfidf_array, train_y, 10)

#### 4.5.2 TFIDF of Text

In [None]:
# text_x_tfidf_array holds training rows only, so pair it with train_y
# (the full-dataset y has a different length and cross_val_score would raise).
svm2 = SVC(gamma='auto')
cross_val(svm2, text_x_tfidf_array, train_y, 10)

#### 4.5.3 TFIDF of Text and Titles Combined

In [None]:
# combined_tfidf_array holds training rows only, so pair it with train_y
# (the full-dataset y has a different length and cross_val_score would raise).
svm3 = SVC(gamma='auto')
cross_val(svm3, combined_tfidf_array, train_y, 10)

#### 4.5.4 TFIDF Combined Vector

In [None]:
# `combined` holds training rows only, so pair it with train_y
# (the full-dataset y has a different length and cross_val_score would raise).
svm4 = SVC(gamma='auto')
cross_val(svm4, combined, train_y, 10)

### 4.3: Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression


In [None]:

#### 4.3.1: TFIDF of Titles


In [None]:

# title_x_tfidf_array was built from the training split, so the matching labels
# are train_y, not the full-dataset y.
log1 = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
cross_val(log1, title_x_tfidf_array, train_y, 10)


In [None]:

#### 4.3.2: TFIDF of Text

In [None]:


# text_x_tfidf_array was built from the training split, so the matching labels
# are train_y, not the full-dataset y.
log2 = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
cross_val(log2, text_x_tfidf_array, train_y, 10)


In [None]:
#### 4.3.3 TFIDF of Text and Titles Combined

In [None]:


# combined_tfidf_array was built from the training split, so the matching labels
# are train_y, not the full-dataset y.
log3 = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
cross_val(log3, combined_tfidf_array, train_y, 10)


In [None]:
#### 4.3.4: testing out trying to combine two vectors


In [None]:

# `combined` was built from the training split, so the matching labels are
# train_y, not the full-dataset y.
log4 = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
cross_val(log4, combined, train_y, 10)

In [None]:
#### 1.2.2: Downcase

In [None]:
############################################ Video
def has_video(s):
    """Report whether a string mentions "video".

    in: string of text
    out: 1 if the lower-cased string contains "video", 0 if not
    """
    return 1 if "video" in s.lower() else 0
def video_feature(df_row):
    """Flag a row whose title or text mentions "video".

    in: row from df (passed in as df.iloc[index]) exposing .title and .text
    out: 1 if either field contains a video reference, 0 otherwise
    """
    mentions_video = has_video(df_row.title) or has_video(df_row.text)
    return 1 if mentions_video else 0
    
def has_video_feature_set(df):
    """Return a list with the ``video_feature`` value (0 or 1) of every row of ``df``, in row order."""
    return [video_feature(df.iloc[row]) for row in range(df.shape[0])]

########################################## External Link

def has_external_link(s):
    """Report whether a string contains a link.

    in: string of text
    out: 1 if the lower-cased string contains "http", 0 if not
    """
    return 1 if "http" in s.lower() else 0


def link_feature(df_row):
    """Flag a row whose title or text contains an external link.

    in: row from df (passed in as df.iloc[index]) exposing .title and .text
    out: 1 if either field contains "http", 0 otherwise
    """
    # Both fields go through has_external_link. The text field was previously
    # checked with has_video by mistake, which turned this into a partial
    # duplicate of the video feature.
    if has_external_link(df_row.title) or has_external_link(df_row.text):
        return 1
    return 0

def link_feature_set(df):
    """Return a list with the ``link_feature`` value (0 or 1) of every row of ``df``, in row order."""
    return [link_feature(df.iloc[row]) for row in range(df.shape[0])]

    


### 2.5: Video

In [None]:
video_feature_set= has_video_feature_set(df)
video_feature_set_arr= np.asarray(video_feature_set).reshape(-1,1)

### 2.6: External Sources

In [None]:
# Keep the result under a different name than the function. Assigning it back to
# `link_feature_set` rebinds the function name to a list, so running this cell a
# second time raises "TypeError: 'list' object is not callable".
link_feature_values = link_feature_set(df)
link_feature_set_arr = np.asarray(link_feature_values).reshape(-1, 1)