# Fake News Case Study

In [1]:
import pandas as pd

fake = pd.read_csv('data/fake.csv')
true = pd.read_csv('data/true.csv')

#Standard exploratory data analysis (EDA) and data cleaning techniques are used to clean the data and prepare it for analysis.
print(fake.shape) # (23481, 4)
print(true.shape) # (21417, 4)

print(fake.columns) 
print(true.columns) 

print(fake.head()) # first 5 rows
print(true.head()) # first 5 rows


(23481, 4)
(21417, 4)
Index(['title', 'text', 'subject', 'date'], dtype='object')
Index(['title', 'text', 'subject', 'date'], dtype='object')
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017 

In [2]:
#Check for blanks and duplicates in fake
print('Blank in fake: ', fake.isnull().sum())
print('Duplicates in fake: ', fake.duplicated().sum())


#check for blanks and duplicates in true
print('Blank in true: ', true.isnull().sum())
print('Duplicates in true: ', true.duplicated().sum())

print(true.info()) # 21417 non-null objects
print(fake.info()) # 23481 non-null objects

#Print unique values in each column
print(true.nunique())
print(fake.nunique())



Blank in fake:  title      0
text       0
subject    0
date       0
dtype: int64
Duplicates in fake:  3
Blank in true:  title      0
text       0
subject    0
date       0
dtype: int64
Duplicates in true:  206
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None
title      20826
text       21192
su

In [3]:
#Drop duplicates
fake.drop_duplicates(inplace=True)
true.drop_duplicates(inplace=True)

#Check for blanks and duplicates
print('Duplicates in fake: ', fake.duplicated().sum())
print('Duplicates in true: ', true.duplicated().sum())



Duplicates in fake:  0
Duplicates in true:  0


In [4]:
true['fake'] = 0
fake['fake'] = 1

print(true.head())
print(fake.head())



                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  fake  
0  December 31, 2017      0  
1  December 29, 2017      0  
2  December 31, 2017      0  
3  December 30, 2017      0  
4  December 29, 2017      0  
                                               title  \
0   Do

In [5]:
#Check column names are the same
if (true.columns == fake.columns).all():
    print('Columns are the same')

Columns are the same


In [6]:
news = [true, fake]
news = pd.concat(news, axis=0)
news.head()

Unnamed: 0,title,text,subject,date,fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [7]:
for col in news.columns:
   if news[col].dtype == 'object':
       news[col] = news[col].str.lower()

news.head()
news.tail()

Unnamed: 0,title,text,subject,date,fake
23476,mcpain: john mccain furious that iran treated ...,21st century wire says as 21wire reported earl...,middle-east,"january 16, 2016",1
23477,justice? yahoo settles e-mail privacy class-ac...,21st century wire says it s a familiar theme. ...,middle-east,"january 16, 2016",1
23478,sunnistan: us and allied ‘safe zone’ plan to t...,patrick henningsen 21st century wireremember ...,middle-east,"january 15, 2016",1
23479,how to blow $700 million: al jazeera america f...,21st century wire says al jazeera america will...,middle-east,"january 14, 2016",1
23480,10 u.s. navy sailors held by iranian military ...,21st century wire says as 21wire predicted in ...,middle-east,"january 12, 2016",1


In [8]:
news.drop(['subject', 'date'], axis=1, inplace=True)
news['text'] = news['title'] + ' ' + news['text']
news.drop(['title'], axis=1, inplace=True)

news.head()

Unnamed: 0,text,fake
0,"as u.s. budget fight looms, republicans flip t...",0
1,u.s. military to accept transgender recruits o...,0
2,senior u.s. republican senator: 'let mr. muell...,0
3,fbi russia probe helped by australian diplomat...,0
4,trump wants postal service to charge 'much mor...,0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

In [10]:
from sklearn.model_selection import train_test_split

X = news['text'] #features
y = news['fake'] #target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #split data into training and testing sets

X_train = tfidf.fit_transform(X_train) #fit and transform X_train
X_test = tfidf.transform(X_test) # we don't need to fit the test data


In [11]:

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, classification_report


models = []

models.append(('Support Vector Machines', SVC(kernel='linear')))
models.append(('Naive Bayes', MultinomialNB()))

best_model = None
best_f1 = 0

for name, model in models:
    model.fit(X_train, y_train)
    print("------------------", str(name),"------------------" )
    y_pred = model.predict(X_test)
    cf = confusion_matrix(y_test, y_pred)
    cf = pd.DataFrame(cf, columns=['Predicted False (Real)', 'Predicted True (Fake)'], index=['Actual False (Real)', 'Actual True (Fake)'])
    print(cf)
    print(classification_report(y_test, y_pred , target_names=['Real', 'Fake']))
    print("\n")
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_model = model
        best_f1 = f1

print("Best Model: ", best_model)
print("Best F1 Score: ", best_f1)