In [1]:
import pandas as pd
from sklearn.utils import shuffle
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Loading the data

In [2]:
fake = pd.read_csv("news/Fake.csv")
true = pd.read_csv("news/True.csv")

# creating the traget variable

In [3]:
fake['target'] = 'fake'
true['target'] = 'true'

In [4]:
news = pd.concat([fake, true]).reset_index(drop = True)

# Shuffling the data to prevent bias

In [5]:
news = shuffle(news)
news = news.reset_index(drop=True)

In [6]:
news.drop(["date"],axis=1,inplace=True)

In [7]:
news.head()

Unnamed: 0,title,text,subject,target
0,The Netherlands Just TROLLED Trump In His Own...,While Donald Trump likes to go around saying h...,News,fake
1,Trump acknowledges he is under investigation i...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,true
2,TRUMP’S BRINGING Churchill’s Bust Back To Oval...,Trump will bring Churchill back to the White H...,politics,fake
3,"'I am sorry,' British PM May says of botched e...","MANCHESTER, England (Reuters) - British Prime ...",worldnews,true
4,Trump to tap nuclear industry lobbyist for U.S...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,true


In [8]:
labels=news.target
labels.head()

0    fake
1    true
2    fake
3    true
4    true
Name: target, dtype: object

# Splitiing data to train and test sets

In [9]:
x_train,x_test,y_train,y_test=train_test_split(news['text'], labels, test_size=0.2, random_state=7)

# Initializing TF-idf Vectorizer to generate the matrix of TF-IDF features on the news text.
Fit and transform on train set and transform on test test

In [10]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

# Passive Agressive Classifier works well for classifying large text data streams. 

In [11]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)


PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

# Predicting the class for test set and calculating accuracy

In [12]:
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.44%


In [13]:
confusion_matrix(y_test,y_pred, labels=['fake','true'])

array([[4661,   32],
       [  18, 4269]], dtype=int64)