# INLS 613 Final Project: Fake News

## 1: Data

### 1.1: Read in Data

In [34]:
import pandas as pd
import numpy as np
import scipy

In [35]:
df = pd.read_csv("fake_or_real_news.csv/fake_or_real_news.csv",encoding='utf-8')

In [36]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [37]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [38]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### 1.2: Preprocessing

#### 1.2.1: Convert Labels

In [39]:
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class. 

In [40]:
# Encode the string labels in df['label'] as integers so they can be passed to
# the scikit-learn estimators as the target vector.
data_class_y= [ 'FAKE', 'REAL']
le= preprocessing.LabelEncoder()
# Fit on the two known class names rather than on the column itself.
le.fit(data_class_y)
# y is the array of encoded labels, where 0 is FAKE and 1 is REAL
y=le.transform(df['label']);

#### 1.2.2: Downcase text and title

In [41]:
# lower takes in an array of strings and converts every string to all lower case
def lower(arr):
    """Return a new list with every string in ``arr`` converted to lower case.

    Parameters
    ----------
    arr : iterable of str
        The strings to convert (a list, a pandas Series, ...).

    Returns
    -------
    list of str
        The lower-cased strings, in the same order as the input.
    """
    # Iterate over the values directly instead of using ``arr[i]``: on a pandas
    # Series ``arr[i]`` is a label lookup, which fails as soon as the index is
    # not the default 0..n-1 range (e.g. after filtering or shuffling rows).
    return [text.lower() for text in arr]

In [42]:
lower_text=lower(df['text'])
lower_title=lower(df['title'])


Remove stop words from the titles.

In [43]:
import nltk
from nltk.corpus import stopwords
def remove_stops(s):
    """Remove English stop words from a space-separated string.

    Parameters
    ----------
    s : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    str
        The tokens of ``s`` that are not in NLTK's English stop-word list,
        joined back together with single spaces.
    """
    # Fetch the stop-word list once per call and keep it in a set. Calling
    # stopwords.words('english') inside the comprehension reloads the list for
    # every single token and then scans it linearly.
    english_stops = set(stopwords.words('english'))
    kept_words = [word for word in s.split(" ") if word not in english_stops]
    return " ".join(kept_words)

def remove_all_stops(a):
    """Apply ``remove_stops`` to every string in ``a``.

    Parameters
    ----------
    a : iterable of str
        The strings to filter.

    Returns
    -------
    list of str
        One stop-word-free string per input string, in input order.
    """
    # Direct iteration avoids positional ``a[i]`` lookups, so this also works
    # for a pandas Series whose index is not 0..n-1.
    return [remove_stops(entry) for entry in a]


In [44]:
title_no_stops= remove_all_stops(lower_title)

In [45]:
# title_no_stops[0:5]

#### 1.2.3 Convert and Combine Title and Text

In [46]:
def title_convert(s):
    """Prefix every space-separated token of ``s`` with ``title_``.

    The tokens are produced by splitting on single spaces and are re-joined
    with single spaces, so the token count and order are unchanged.
    """
    return " ".join("title_" + token for token in s.split(" "))

def mult_title_convert(titles):
    """Apply ``title_convert`` to every title.

    Parameters
    ----------
    titles : iterable of str
        The titles to convert.

    Returns
    -------
    list of str
        The converted titles, in input order.
    """
    # Direct iteration avoids positional ``titles[i]`` lookups, so this also
    # works for a pandas Series whose index is not 0..n-1.
    return [title_convert(title) for title in titles]

def combine_title_text(title, text):
    """Join each title to the text at the same position, separated by a space.

    The number of results is determined by ``len(text)``; ``title`` is indexed
    at the same positions.
    """
    return [title[pos] + " " + text[pos] for pos in range(len(text))]

In [47]:
prefixed_titles=mult_title_convert(title_no_stops)
combined_text_title=combine_title_text(prefixed_titles, lower_text)
# combined_text_title[0]

## 2: Extract Features

### Feature Functions

In [48]:
############################################ Video
##Contains video reference
##in: string of text row
##out: 1 if contains (video) 0 if not
def has_video(s):
    """Return 1 if ``s`` contains the substring "video" (case-insensitive), else 0."""
    return int("video" in s.lower())
##video_feature
##in: row from df (passed in as df.iloc[index])
##out: 0 if no video ref 1 if video ref
def video_feature(df_row):
    """Return 1 if the row's title or text mentions "video", else 0.

    ``df_row`` is read through its ``title`` and ``text`` attributes.
    """
    mentions_video = has_video(df_row.title) or has_video(df_row.text)
    return 1 if mentions_video else 0
    
def has_video_feature_set(df):
    """Compute ``video_feature`` for every row of ``df``.

    Rows are visited by position via ``df.iloc``; the result is a list with
    one 0/1 entry per row, in row order.
    """
    return [video_feature(df.iloc[pos]) for pos in range(df.shape[0])]

########################################## External Link

def has_external_link(s):
    """Return 1 if ``s`` contains the substring "http" (case-insensitive), else 0."""
    return 1 if "http" in s.lower() else 0


def link_feature(df_row):
    """Return 1 if the row's title or text contains an external link, else 0.

    Parameters
    ----------
    df_row : object with ``title`` and ``text`` string attributes
        Typically a row obtained with ``df.iloc[index]``.

    Returns
    -------
    int
        1 when "http" occurs in the title or the text, 0 otherwise.
    """
    # Both fields must be checked with has_external_link. Checking the text
    # with has_video made this feature almost a copy of the video feature.
    if has_external_link(df_row.title) or has_external_link(df_row.text):
        return 1
    return 0

def link_feature_set(df):
    """Compute ``link_feature`` for every row of ``df``.

    Rows are visited by position via ``df.iloc``; the result is a list with
    one 0/1 entry per row, in row order.
    """
    return [link_feature(df.iloc[pos]) for pos in range(df.shape[0])]

    


### 2.0: Feature Sets
1. TFIDF of Text and Title
2. TFIDF of Text
3. TFIDF of Title
4. TFIDF of Text and Title + Bigrams
5. TFIDF of Text + Bigrams
6. TFIDF of Title + Bigrams

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [50]:
## Function to combine feature sets:

In [51]:
##takes in a combo of features sets
def combine_fsets(a):
    """Horizontally stack several feature arrays into one feature matrix.

    Parameters
    ----------
    a : sequence of array-like
        Feature arrays to join column-wise with ``np.hstack``.

    Returns
    -------
    numpy.ndarray
        The stacked array.
    """
    # The stacked array has to be returned; without the return statement the
    # function always yielded None.
    return np.hstack(a)

### 2.1: TFIDF Title

In [52]:
tf_title = TfidfVectorizer(min_df=1,stop_words='english',max_features=100, lowercase=True)
title_x_tfidf = tf_title.fit_transform(title_no_stops)
title_x_tfidf_array = title_x_tfidf.toarray()
# tf_title.get_feature_names()

### 2.2: TFIDF of Text

In [53]:
tf_text = TfidfVectorizer(min_df=1,stop_words='english',max_features=1500, lowercase=True)
text_x_tfidf = tf_text.fit_transform(lower_text)
text_x_tfidf_array = text_x_tfidf.toarray()


### 2.3 TFIDF of Title and Text 

In [54]:
tf_combined = TfidfVectorizer(min_df=1,stop_words='english',max_features=2000, lowercase=True)
combined_tfidf = tf_combined.fit_transform(combined_text_title)
combined_tfidf_array = combined_tfidf.toarray()

### 2.4 TFIDF  Combined Vectors

In [55]:
combined= np.hstack((title_x_tfidf_array, text_x_tfidf_array))

### 2.5: Video

In [56]:
video_feature_set= has_video_feature_set(df)
video_feature_set_arr= np.asarray(video_feature_set).reshape(-1,1)

### 2.6: External Sources

In [57]:
# Keep the result under its own name. Assigning it to `link_feature_set` would
# replace the function with a list, and re-running this cell would then fail
# with "'list' object is not callable".
link_features = link_feature_set(df)
# Reshape to a single-column 2-D array, the layout scikit-learn expects for features.
link_feature_set_arr = np.asarray(link_features).reshape(-1, 1)

## 4: Models and Evaluation

In [58]:
from sklearn.model_selection import cross_val_score 

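Reminder of variables:
- `y`: the encoded labels (0 = FAKE, 1 = REAL)
- `combined_tfidf_array`: TFIDF feature array of the combined titles and text
- `title_x_tfidf_array`: TFIDF feature array of the titles only
- `text_x_tfidf_array`: TFIDF feature array of the text only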

### 4.0: Cross Validate Method

In [59]:
def cross_val(classifier, x, y, cv):
    """Return the mean cross-validation score of ``classifier`` on ``x`` and ``y``.

    Parameters
    ----------
    classifier : estimator
        Object that implements ``fit``.
    x : array-like
        Feature matrix.
    y : array-like
        Labels.
    cv : int or cross-validation object
        Number of desired folds, or a cross-validation splitter.

    Returns
    -------
    float
        Average of the array of scores returned by ``cross_val_score``.
    """
    return np.mean(cross_val_score(classifier, x, y, cv=cv))

    

### 4.1: Naive Bayes

In [60]:
from sklearn.naive_bayes import MultinomialNB

#### 4.1.1: TFIDF of Titles

In [61]:
mnb1= MultinomialNB(alpha=1)
cross_val(mnb1, title_x_tfidf_array, y, 10)

0.6632931649088738

#### 4.1.2: TFIDF of Text

In [62]:
mnb2= MultinomialNB(alpha=1)
cross_val(mnb2, text_x_tfidf_array, y, 10)

0.85619072351704395

#### 4.1.3 TFIDF of Text and Titles Combined

In [63]:
mnb3= MultinomialNB(alpha=1)
cross_val(mnb3, combined_tfidf_array, y, 10)

0.86376967665173687

#### 4.1.4: testing out trying to combine two vectors

In [64]:
mnb4= MultinomialNB(alpha=1)
cross_val(mnb4, combined, y, 10)

0.85256146742103966

#### 4.1.5 video feature set

In [65]:
mnb5= MultinomialNB(alpha=1)
cross_val(mnb5, np.asarray(video_feature_set_arr), y, 10)

0.50055267380676938

#### 4.1.6 External link feature set

In [66]:
# Evaluate the external-link feature on its own. This section must use
# link_feature_set_arr; evaluating video_feature_set_arr here just repeats
# section 4.1.5. A fresh name (mnb6) avoids reusing the 4.1.5 classifier's name.
# NOTE: the score recorded under this cell came from the video features;
# re-run the cell to refresh it.
mnb6 = MultinomialNB(alpha=1)
cross_val(mnb6, link_feature_set_arr, y, 10)

0.50055267380676938

### 4.2: Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

#### 4.2.1: TFIDF of Titles

In [37]:
# Random forest on the title TFIDF features, scored with 10-fold cross-validation.
# NOTE(review): no random_state is passed here (nor in the other forest cells of
# this section), so the forest is built with different randomness on every run
# and the recorded score will not reproduce exactly. Consider passing
# random_state=0, as the logistic regression cells do.
forest1 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2)
cross_val(forest1, title_x_tfidf_array, y, 10)

0.69644833706489817

#### 4.2.2: TFIDF of Text

In [38]:
forest2 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2)
cross_val(forest2, text_x_tfidf_array, y, 10)

0.87135211747222674

#### 4.2.3 TFIDF of Text and Titles Combined

In [39]:
forest3 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2)
cross_val(forest3, combined_tfidf_array, y, 10)

0.87292791026444994

#### 4.2.4: testing out trying to combine two vectors

In [40]:
forest4 = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2)
cross_val(forest4, combined, y, 10)

0.87087818303812981

In [41]:
"""
IDEAS:
put all feature sets in an array
make a features method that takes in diff tuning parameters
make a method that takes in a model and tests it on different featuresets and diff parameters and prints results in chart
""" 

'\nIDEAS:\nput all feature sets in an array\nmake a features method that takes in diff tuning parameters\nmake a method that takes in a model and tests it on different featuresets and diff parameters and prints results in chart\n'

### 4.5: Support Vector Machine

In [42]:
from sklearn import svm
from sklearn.svm import SVC

#### 4.5.1: TFIDF of Titles

In [43]:
svm1 = SVC(gamma='auto')
cross_val(svm1, title_x_tfidf_array, y, 10)

0.72738436632284631

#### 4.5.2 TFIDF of Text

In [None]:
# SVM on the text TFIDF features, scored with 10-fold cross-validation.
# NOTE(review): the recorded score (~0.5006) is about the share of the larger
# class (3171 of 6335 rows), which suggests the model predicts a single class.
# Presumably gamma='auto' (1 / n_features) is far too small for 1500 TFIDF
# features -- try gamma='scale' or a linear kernel and confirm.
svm2=SVC(gamma='auto')
cross_val(svm2, text_x_tfidf_array, y, 10)

0.50055267380676938

#### 4.5.3 TFIDF of Text and Titles Combined

In [None]:
svm3=SVC(gamma='auto')
cross_val(svm3, combined_tfidf_array, y, 10)

#### 4.5.4 TFIDF Combined Vector

In [None]:
svm4=SVC(gamma='auto')
cross_val(svm4, combined, y, 10)

### 4.3: Logistic Regression

In [67]:

from sklearn.linear_model import LogisticRegression


In [71]:

#### 4.3.1: TFIDF of Titles


In [72]:

log1 = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial')
cross_val(log1, title_x_tfidf_array, y, 10)


0.73701428864217933

In [73]:

#### 4.3.2: TFIDF of Text

In [74]:


log2 = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial')
cross_val(log2, text_x_tfidf_array, y, 10)


0.91664943660414944

In [75]:
#### 4.3.3 TFIDF of Text and Titles Combined

In [76]:


log3 = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial')
cross_val(log3, combined_tfidf_array, y, 10)


0.92043941309405497

In [77]:
#### 4.3.4: testing out trying to combine two vectors


In [78]:

log4 = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial')
cross_val(log4, combined, y, 10)

0.91917882775380111