## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

## Data Preprocessing

In [2]:
# load data
df = pd.read_csv("news.csv")
print(f"Shape: {df.shape}")

Shape: (6335, 4)


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Let's check the 'Unnamed: 0' column because its just seems an'id' column which in of no use for our project
df['Unnamed: 0'].nunique()

6335

So its true, lets drop this columns

In [5]:
df = df.drop('Unnamed: 0', axis = 1)

In [6]:
# checking if the null values are present
df.isnull().sum()

title    0
text     0
label    0
dtype: int64

-------------------------------------------
No null values, move on to next one then...

In [8]:
# ckeck if data is balanced or not
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

---------------
It's perfectly balanced so we can use 'accuracy' as a metrics

In [9]:
len(df['text'][0])

7518

In [14]:
len(df['title'][0])

28

The size of text in 'title' columns is small so we use 'text' column as input feature and 'label' as output 

In [15]:
X = df['text']
y = df['label']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

In [18]:
# initialize vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [19]:
# fit vectorizer
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [24]:
X_train_vec.shape

(5068, 61633)

In [32]:
# Initialize a classifier
model=PassiveAggressiveClassifier(max_iter=70)
model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred=model.predict(X_test_vec)

# calculate accuracy of the model
score=metrics.accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

metrics.confusion_matrix(y_pred, y_test, labels=['REAL', 'FAKE'])

Accuracy: 94.87%


array([[585,  33],
       [ 32, 617]], dtype=int64)

## Bonus ->
#### Let's check with some more algorithms

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a classifier
model= RandomForestClassifier()
model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred=model.predict(X_test_vec)

# calculate accuracy of the model
score=metrics.accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

metrics.confusion_matrix(y_pred, y_test, labels=['REAL', 'FAKE'])


Accuracy: 90.84%


array([[573,  72],
       [ 44, 578]], dtype=int64)