In [None]:
# importing necessary libraries and tools

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# reading the train.csv file

In [None]:
# Load the labelled training articles.
df = pd.read_csv('/content/sample_data/train.csv')

# Swap the numeric label codes for readable class names.
conversion_dict = {0: 'Real', 1: 'Fake'}
df['label'] = df['label'].replace(to_replace=conversion_dict)

# Show how many articles fall into each class.
df['label'].value_counts()

Fake    10413
Real    10387
Name: label, dtype: int64

In [None]:
# splitting the dataset into train and test sets, then fitting the TF-IDF vectorizer on the training set and transforming both sets

In [None]:
# TF-IDF settings: drop English stop words and ignore any term whose
# document frequency exceeds 80%.
tfidf_vectorizer = TfidfVectorizer(max_df=0.80, stop_words='english')

# Hold out 20% of the articles for testing; the fixed seed keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.20,
    shuffle=True,
    random_state=7,
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only, then
# project the test text onto that same vocabulary. Both columns are cast to
# unicode strings first so every entry is a str before it is tokenized.
vec_train = tfidf_vectorizer.fit_transform(raw_documents=x_train.values.astype('U'))
vec_test = tfidf_vectorizer.transform(raw_documents=x_test.values.astype('U'))

In [None]:
# calling the PassiveAggressiveClassifier to build our ML model

In [None]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features.
# random_state pins the shuffling of the training data between epochs, so the
# fitted model (and every metric computed from it) is the same on each run
# instead of drifting slightly from one execution to the next.
pa = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pa.fit(vec_train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [None]:
# calculating the accuracy of PassiveAggressiveClassifier

In [None]:
# Predict labels for the held-out articles and report accuracy as a percentage.
y_pred = pa.predict(vec_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = round(score * 100, 2)
print(f'PassiveAggressive Classifier Accuracy: {accuracy_pct}%')

PassiveAggressive Classifier Accuracy: 96.68%


In [None]:
# generating the confusion matrix

In [None]:
# Rows are the true labels and columns the predicted labels, both ordered
# as 'Real' then 'Fake'.
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['Real', 'Fake'])

array([[1986,   74],
       [  64, 2036]])

In [None]:
# Evaluating the model on a completely unseen dataset

In [None]:
# Load an independent collection of real articles and tag every row as 'Real'.
df_true = pd.read_csv('/content/sample_data/True.csv')
df_true['label'] = 'Real'

# Strip the Reuters dateline markers from the article text (presumably so the
# model cannot key on the agency tag -- confirm intent). The three literal
# strings are removed in this order; regex=False keeps the parentheses literal.
# The vectorized .str accessor replaces a per-row Python loop and passes
# missing values through instead of raising on them.
df_true_rep = df_true['text']
for dateline in ('WASHINGTON (Reuters) - ', 'LONDON (Reuters) - ', '(Reuters) - '):
    df_true_rep = df_true_rep.str.replace(dateline, '', regex=False)
df_true['text'] = df_true_rep

# Load the matching collection of fake articles and tag every row as 'Fake'.
df_fake = pd.read_csv('/content/sample_data/Fake.csv')
df_fake['label'] = 'Fake'

# Stack both sets into one frame; ignore_index gives the combined frame a
# fresh 0..n-1 index instead of repeating each source frame's row labels.
df_final = pd.concat([df_true, df_fake], ignore_index=True)
df_final = df_final.drop(columns=['subject', 'date'])
df_fake

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",Fake
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",Fake
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",Fake
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",Fake


In [None]:
def flabel(newtext):
    """Classify a single piece of text with the trained model.

    Parameters
    ----------
    newtext : str
        Raw article text. A non-string value (for example a missing value
        read from a CSV as NaN) is converted with ``str()`` rather than
        raising inside the vectorizer, mirroring the ``astype('U')`` cast
        applied to the text the model was trained on.

    Returns
    -------
    The label predicted by ``pa`` for this text ('Real' or 'Fake' for the
    labels the model was trained on).
    """
    # The vectorizer expects an iterable of documents, so wrap the single text.
    vec_newtest = tfidf_vectorizer.transform([str(newtext)])
    # predict() returns one label per document; unwrap the only one.
    return pa.predict(vec_newtest)[0]

In [None]:
# Spot-check: classify the article stored under row label 0 of the real set.
flabel(df_true.loc[0, 'text'])

'Real'

In [None]:
# Share of the unseen real articles that the model labels 'Real'.
# The whole column is vectorized and predicted in one batch; classifying one
# article per call repeats the transform/predict overhead for every row and
# yields the same fraction.
real_pred = pa.predict(tfidf_vectorizer.transform(df_true['text']))
(real_pred == 'Real').mean()

In [None]:
# Share of the unseen fake articles that the model labels 'Fake'.
# The whole column is vectorized and predicted in one batch; classifying one
# article per call repeats the transform/predict overhead for every row and
# yields the same fraction.
fake_pred = pa.predict(tfidf_vectorizer.transform(df_fake['text']))
(fake_pred == 'Fake').mean()