# Importing libraries and dataset

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
train = pd.read_csv('/kaggle/input/fake-news/train.csv')
train.sample(2)

Unnamed: 0,id,title,author,text,label
7895,7895,‘We Are Orphans Here’ - The New York Times,Rachel Kushner,Standing at an intersection in Shuafat Refugee...,0
12448,12448,North Dakota had 292 oil spills in 2 years off...,Quest,We Are Change \nNorth Dakota had nearly 300 oi...,1


In [8]:
test = pd.read_csv('/kaggle/input/fake-news/test (3).csv')
test.sample(2)

Unnamed: 0,id,title,author,text
3541,24341,Anonymity: The Greatest Weapon Against Oppression,ActivistPost,By Learn Liberty “Anonymity is a shield from t...
543,21343,Wikileaks’ Julian Assange at Embassy Balcony: ...,Oliver JJ Lane,Wikileaks founder Julian Assange appeared on t...


# Preprecessing

In [9]:
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [10]:
train.dropna(inplace=True)
train.reset_index(inplace=True)

In [11]:
train.isnull().sum()

index     0
id        0
title     0
author    0
text      0
label     0
dtype: int64

In [12]:
train.shape

(18285, 6)

In [13]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [14]:
test.shape

(5200, 4)

In [15]:
test.dropna(inplace=True)
test.reset_index(inplace=True)

In [16]:
test.isnull().sum()

index     0
id        0
title     0
author    0
text      0
dtype: int64

In [17]:
test.shape

(4575, 5)

In [18]:
x = train.title+' '+train.author
x[:2]

0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    FLYNN: Hillary Clinton, Big Woman on Campus - ...
dtype: object

In [19]:
y = train.label
y[:5]

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [20]:
# applying stemming 

In [21]:
ps = PorterStemmer()

In [22]:
corpus=[]
for i in range(len(x)):
    r = x[i]
    r = re.sub('[^a-zA-Z]',' ',r)
    r = r.lower()
    r = r.split()
    r = [ps.stem(w) for w in r if not w in stopwords.words('english')]
    r = ' '.join(r)
    corpus.append(r)

In [23]:
x = corpus

In [24]:
vect = TfidfVectorizer()

In [25]:
x = vect.fit_transform(x)

In [26]:
x.shape

(18285, 15960)

In [27]:
xtr, xte, ytr, yte = tts(x,y,test_size=0.2, random_state=0)

# Modeling

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
cls = MultinomialNB()
cls.fit(xtr, ytr)

MultinomialNB()

In [29]:
yp = cls.predict(xte)
print(accuracy_score(yte, yp))

0.9540607054963085


In [30]:
from lightgbm import LGBMClassifier
params = {'learning_rate':.08, 'n_estimators':440}
model = LGBMClassifier(**params)
model.fit(xtr, ytr, eval_set=(xte, yte))
model.score(xte, yte)

[1]	valid_0's binary_logloss: 0.616845
[2]	valid_0's binary_logloss: 0.557317
[3]	valid_0's binary_logloss: 0.506205
[4]	valid_0's binary_logloss: 0.462063
[5]	valid_0's binary_logloss: 0.423452
[6]	valid_0's binary_logloss: 0.389592
[7]	valid_0's binary_logloss: 0.359329
[8]	valid_0's binary_logloss: 0.33289
[9]	valid_0's binary_logloss: 0.309068
[10]	valid_0's binary_logloss: 0.287834
[11]	valid_0's binary_logloss: 0.268792
[12]	valid_0's binary_logloss: 0.251587
[13]	valid_0's binary_logloss: 0.236106
[14]	valid_0's binary_logloss: 0.222126
[15]	valid_0's binary_logloss: 0.209636
[16]	valid_0's binary_logloss: 0.197855
[17]	valid_0's binary_logloss: 0.187009
[18]	valid_0's binary_logloss: 0.177592
[19]	valid_0's binary_logloss: 0.168397
[20]	valid_0's binary_logloss: 0.160101
[21]	valid_0's binary_logloss: 0.152623
[22]	valid_0's binary_logloss: 0.145679
[23]	valid_0's binary_logloss: 0.139349
[24]	valid_0's binary_logloss: 0.133576
[25]	valid_0's binary_logloss: 0.12839
[26]	valid_

0.9890620727372162

In [31]:
yp_ = model.predict(xte)
print(accuracy_score(yte, yp_))

0.9890620727372162
