In [29]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import re
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split


## DATA COLLECTION AND MERGING

In [30]:
df0=pd.read_csv('Fake.csv')
df1=pd.read_csv('True.csv')

In [31]:
df0['label']=0
df1['label']=1

In [32]:
df0.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [33]:
df_merge=pd.concat([df0,df1],axis=0)

In [34]:
df_merge

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [35]:
df_merge.drop(["title","subject","date"],axis=1,inplace=True)
df_merge.sample(frac=1)


Unnamed: 0,text,label
359,WASHINGTON (Reuters) - U.S. Senate Republican ...,1
3715,WASHINGTON (Reuters) - House of Representative...,1
6957,WASHINGTON (Reuters) - U.S. President-elect Do...,1
5465,MOSCOW (Reuters) - Senior Russian lawmakers sa...,1
19259,WELLINGTON (Reuters) - New Zealand National Pa...,1
...,...,...
289,WASHINGTON (Reuters) - People would be able to...,1
10544,WASHINGTON (Reuters) - The U.S. Supreme Court ...,1
15334,Al Shartpon will be traveling never mind SMOKE...,0
18564,AUSTIN A shoving match and war of words near...,0


In [36]:
df3=pd.read_csv('fake_and_real_news.csv')

In [37]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df3['label']=le.fit_transform(df3['label'])

In [38]:
df3.rename(columns={'Text':'text'},inplace=True)

In [39]:
df=pd.concat([df_merge,df3],axis=0)


In [40]:
df

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,0
9896,Trump consults Republican senators on Fed chie...,1
9897,Trump lawyers say judge lacks jurisdiction for...,1
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,0


In [41]:
df = df.sample(frac=1).reset_index(drop=True)

In [42]:
df

Unnamed: 0,text,label
0,,0
1,Consortium News Exclusive: The neocon royalty ...,0
2,LONDON (Reuters) - Survivors of a blaze that k...,1
3,What This Poll Says About Bernie Sanders And ...,0
4,WASHINGTON (Reuters) - A new U.S. rule aimed a...,1
...,...,...
54793,WASHINGTON (Reuters) - The U.S. Supreme Court ...,1
54794,It s about time! The White House has been leak...,0
54795,NEW YORK (Reuters) - Former U.S. Congressman A...,1
54796,Probably the most highly coveted endorsement b...,0


In [43]:
df['text'].drop_duplicates(inplace=True)

In [44]:
df.shape

(54798, 2)

In [45]:
df['label'].value_counts()

label
0    28481
1    26317
Name: count, dtype: int64

In [46]:
## DATA CLEANING

In [47]:
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with a single space
    return text

In [48]:
df['text']=df['text'].apply(clean_text)

In [49]:
print(df['text'])

0                                                         
1        consortium news exclusive the neocon royalty k...
2        london reuters survivors of blaze that killed ...
3         what this poll says about bernie sanders and ...
4        washington reuters new s rule aimed at protect...
                               ...                        
54793    washington reuters the s supreme court is set ...
54794    it about time the white house has been leaking...
54795    new york reuters former s congressman anthony ...
54796    probably the most highly coveted endorsement b...
54797    moscow reuters the scandal that erupted in the...
Name: text, Length: 54798, dtype: object


In [50]:
x=df['text']
y=df['label']

## VECTORIZATION

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector=TfidfVectorizer()
x=vector.fit_transform(x)

In [52]:

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=23)

In [53]:
x_train.shape

(41098, 122098)

In [54]:
x_test.shape

(13700, 122098)

## LOGESTIC REGRESSION

In [55]:
from sklearn.linear_model import LogisticRegression
model_lr=LogisticRegression()

model_lr.fit(x_train,y_train)

y_pred_lr=model_lr.predict(x_test)

print(y_pred_lr)

[1 0 0 ... 1 0 0]


In [56]:
score_lr=accuracy_score(y_pred_lr,y_test)
print(score_lr)
print(confusion_matrix(y_pred_lr,y_test))

0.9862043795620438
[[7060   80]
 [ 109 6451]]


## RANDOM FOREST

In [57]:
from sklearn.ensemble import RandomForestClassifier
model_rfc=RandomForestClassifier()

model_rfc.fit(x_train,y_train)

y_pred_rfc=model_rfc.predict(x_test)

In [58]:
score_rfc=accuracy_score(y_pred_rfc,y_test)
print(score_rfc)
print(confusion_matrix(y_pred_rfc,y_test))

0.9906569343065693
[[7105   64]
 [  64 6467]]


## GRADIENT BOOST CLASSIFIER

In [59]:
from sklearn.ensemble import GradientBoostingClassifier
model_gbc=GradientBoostingClassifier(n_estimators=50)

model_gbc.fit(x_train,y_train)

y_pred_gbc=model_gbc.predict(x_test)

In [None]:
score_gbc=accuracy_score(y_pred_gbc,y_test)
print(score_gbc)
confusion_matrix(y_pred_gbc,y_test)

NameError: name 'y_pred_gbc' is not defined