In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer (imported above) converts text into numeric TF-IDF features
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop words; stemming() drops these from every title.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [4]:
# Read the WELFake dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/fake-news-classification/WELFake_Dataset.csv'
)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(72134, 4)

In [6]:
# Count missing values in each column.
df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [7]:
# Replace every missing value with an empty string so the text-cleaning step
# only ever receives strings, then re-count to confirm no NaNs remain.
df=df.fillna("")
df.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [8]:
# Porter stemmer: reduces each word to a common stem so that inflected forms
# share one token (in this notebook's output, 'enforcement' -> 'enforc' and
# 'following' -> 'follow').
porterstemmer=PorterStemmer()

In [9]:
# Anything that is not an ASCII letter is treated as a word separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Filled on the first stemming() call; the stop-word list never changes, so it
# is loaded once instead of once per word.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Normalise a piece of text into a space-joined string of stemmed words.

    Steps, in order: every non-letter character becomes a space, the text is
    lower-cased and split on whitespace, English stop words are dropped, and
    each remaining word is passed through the module-level ``porterstemmer``.

    Parameters
    ----------
    content : str
        Raw text, e.g. a news headline.

    Returns
    -------
    str
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    stop_words = _english_stopwords()
    words = _NON_LETTERS.sub(' ', content).lower().split()
    # Set membership keeps the stop-word check O(1) per word.
    return ' '.join(
        porterstemmer.stem(word) for word in words if word not in stop_words
    )

In [10]:
# Show the raw titles, then overwrite the column with its stemmed form.
# NOTE(review): df['title'] is replaced in place, so running this cell a second
# time would stem text that has already been stemmed.
print(df['title'])
df['title']=df['title'].apply(stemming)


0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1                                                         
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: title, Length: 72134, dtype: object


In [11]:
# Titles after stemming.
print(df['title'])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [12]:
# Features are the stemmed titles; targets are the values of the label column.
# NOTE(review): the `text` column is not used as a feature in the cells shown here.
X=df['title'].values
y=df['label'].values

In [13]:
# Stemmed titles as an array of strings.
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


In [14]:
# Label values taken from the label column.
print(y)

[1 1 1 ... 0 0 1]


In [15]:
# Convert the stemmed titles into TF-IDF feature vectors.
# The titles are read from df rather than from X, so re-running this cell does
# not feed the already-vectorised sparse matrix back into the vectorizer.
# NOTE(review): the vocabulary and IDF weights are learned from every row before
# the train/test split is made, so held-out titles influence the features;
# fitting on the training titles only would avoid that leakage.
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the matrix in one pass; the
# result is the same as fit() followed by transform().
X = vectorizer.fit_transform(df['title'].values)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 629110 stored elements and shape (72134, 19639)>
  Coords	Values
  (0, 19106)	0.19134939529376566
  (0, 18648)	0.1297506867782943
  (0, 17363)	0.2542650376115143
  (0, 17260)	0.24871262252022117
  (0, 9699)	0.22829788917209384
  (0, 7887)	0.26746434949988324
  (0, 6730)	0.48553136502134386
  (0, 6425)	0.28932771754845743
  (0, 5509)	0.31820565801047196
  (0, 3679)	0.24871262252022117
  (0, 1802)	0.33473541566384035
  (0, 407)	0.3190180925014663
  (2, 18648)	0.13443733492985524
  (2, 18034)	0.35962437110547785
  (2, 16446)	0.1999703023632961
  (2, 15094)	0.1609967301122813
  (2, 14591)	0.3580030298678158
  (2, 13591)	0.22687620695463123
  (2, 12744)	0.27904818164471595
  (2, 12011)	0.16878852994653004
  (2, 11864)	0.2231406266784195
  (2, 8020)	0.2692285294185893
  (2, 6880)	0.2652283770602196
  (2, 2919)	0.3639616996972358
  (2, 2673)	0.30809679188606154
  :	:
  (72130, 1768)	0.49293214478810593
  (72130, 764)	0.398703804077

In [16]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)


In [17]:
# Training portion of the TF-IDF matrix.
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 503269 stored elements and shape (57707, 19639)>
  Coords	Values
  (0, 18192)	0.447071135845193
  (0, 14186)	0.2652578256230375
  (0, 14023)	0.3145375300355713
  (0, 13898)	0.3979061365475474
  (0, 12893)	0.3480857937521153
  (0, 12011)	0.16299504420057034
  (0, 10005)	0.23533394616585862
  (0, 3346)	0.38847412442053003
  (0, 2926)	0.3442051979974664
  (1, 18648)	0.1403838185760879
  (1, 16183)	0.33894761320836764
  (1, 16096)	0.3634731576463672
  (1, 16004)	0.3608974162987037
  (1, 14917)	0.34052388367563446
  (1, 6710)	0.3676312332487335
  (1, 5761)	0.3321236803309108
  (1, 5356)	0.20150078450075734
  (1, 4436)	0.39449738271762325
  (1, 4419)	0.21307531153632
  (2, 18648)	0.11024869560892073
  (2, 14411)	0.24927921406648687
  (2, 13198)	0.18260462563665714
  (2, 13004)	0.2977357427141644
  (2, 12002)	0.3317426331959291
  (2, 10591)	0.2310569200556685
  :	:
  (57704, 14349)	0.22002502411501668
  (57704, 12199)	0.25790155719

In [18]:
# Logistic-regression classifier created with no arguments, i.e. library defaults.
model=LogisticRegression()


In [19]:
# Train the classifier on the training split.
model.fit(X_train,y_train)

In [20]:
# Accuracy on the same rows the model was trained on; this is an optimistic
# figure, not an estimate of performance on unseen data.
# accuracy_score is documented as (y_true, y_pred). The value is the same in
# either order, but the documented order matters for metrics that are not symmetric.
X_train_pred = model.predict(X_train)
X_train_acc = accuracy_score(y_train, X_train_pred)

In [21]:
# Report the training-set accuracy computed in the previous cell.
print('acc of training data: ',X_train_acc)

acc of training data:  0.9199403885143916


In [22]:
# 5-fold cross-validation scores for the model over the full TF-IDF matrix.
# NOTE(review): X and y here contain the held-out test rows as well as the
# training rows, so these scores are not independent of the test split —
# confirm that is intended.
scores = cross_val_score(model, X,y,  cv=5)
scores

array([0.89921675, 0.89734526, 0.90205864, 0.89769183, 0.89664495])

In [23]:
# Accuracy on the held-out test split, with the arguments in accuracy_score's
# documented (y_true, y_pred) order.
X_test_pred = model.predict(X_test)
test_data_acc = accuracy_score(y_test, X_test_pred)
test_data_acc

0.900603035974215

In [24]:
# Spot-check a single held-out row: a predicted label of 1 is reported as fake,
# anything else as real.
X_new = X_test[2121]
prediction = model.predict(X_new)
print('The news is Fake!' if prediction[0] == 1 else 'The News is Real.')

The News is Real.


In [25]:
# True label of the same test row, for comparison with the prediction.
print(y_test[2121])

0
