In [19]:
import nltk

In [20]:
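# One-time setup: uncomment and run once to download the NLTK data that the tokenizer below relies on.
#nltk.download()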

In [21]:
import pandas as pd

In [22]:
# Read the two input CSVs from the current working directory (relative paths).
# 'Fake.csv' is bound to `fake` and 'True.csv' to `genuine`.
fake = pd.read_csv('Fake.csv')
genuine = pd.read_csv('True.csv')

In [23]:
# Show how many rows of the fake set fall under each `subject` value.
display(fake['subject'].value_counts())

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [24]:
# Add the label column used for classification:
# 0 for every row loaded from Fake.csv, 1 for every row loaded from True.csv.
fake['target'] = 0
genuine['target'] = 1

In [25]:
# Stack the fake rows on top of the genuine rows (row-wise concatenation).
# Each frame keeps its own index labels here; they are renumbered in the
# following reset_index cell.
data = pd.concat([fake, genuine], axis = 0)

In [26]:
# Replace the index carried over from the two source frames with a fresh
# 0..n-1 range; drop=True discards the old index instead of keeping it as a column.
data = data.reset_index(drop = True)

In [27]:
# Remove the 'subject', 'date' and 'title' columns; only the remaining
# columns are used by the cells below.
data = data.drop(['subject', 'date', 'title'], axis = 1)

# Tokenization

In [28]:
from nltk.tokenize import word_tokenize

In [29]:
# Replace each article's text with the token list produced by word_tokenize.
# NOTE(review): this overwrites data['text'] in place, so the column changes
# type; the cell is presumably meant to run exactly once per kernel session.
data['text'] = data['text'].apply(word_tokenize)

# Stemming

In [30]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer. NOTE: despite the variable name, this is a
# SnowballStemmer instance, not NLTK's PorterStemmer class.
porter = SnowballStemmer("english")

In [31]:
def stem_it(text):
    """Return a new list containing the stem of every token in ``text``.

    Tokens are processed in order, each one passed through the ``stem``
    method of the notebook-level ``porter`` stemmer object.
    """
    stems = []
    for token in text:
        stems.append(porter.stem(token))
    return stems

In [32]:
# Stem every token of every article; data['text'] is overwritten with the
# stemmed token lists returned by stem_it.
data['text'] = data['text'].apply(stem_it)

# Stopword removal (approximated by dropping tokens shorter than 3 characters)

In [33]:
def stop_it(t, min_len=3):
    """Drop short tokens from a token list.

    Despite the name, no stopword list is consulted: a token is kept purely
    on its length, so three-letter-or-longer function words are kept.

    Args:
        t: Iterable of tokens (anything supporting ``len``).
        min_len: Minimum token length to keep. The default of 3 keeps
            exactly the tokens with more than 2 characters.

    Returns:
        A new list with the kept tokens, in their original order.
    """
    return [word for word in t if len(word) >= min_len]

In [34]:
# Filter every article's token list through stop_it; data['text'] is
# overwritten with the filtered lists.
data['text'] = data['text'].apply(stop_it)

In [35]:
# Plain-text preview of the first 10 rows after the preprocessing cells above.
print(data.head(10))

                                                text  target
0  [donald, trump, just, couldn, wish, all, ameri...       0
1  [hous, intellig, committe, chairman, devin, nu...       0
2  [friday, was, reveal, that, former, milwauke, ...       0
3  [christma, day, donald, trump, announc, that, ...       0
4  [pope, franci, use, his, annual, christma, day...       0
5  [the, number, case, cop, brutal, and, kill, pe...       0
6  [donald, trump, spent, good, portion, his, day...       0
7  [the, wake, yet, anoth, court, decis, that, de...       0
8  [mani, peopl, have, rais, the, alarm, regard, ...       0
9  [just, when, you, might, have, thought, get, b...       0


In [36]:
# Join each token list back into one space-separated string, which is the
# form passed to the TF-IDF vectorizer below.
data['text'] = data['text'].apply(' '.join)

# Splitting

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# Split texts and labels into train and test parts. Neither test_size nor
# train_size is passed, so scikit-learn's default proportions apply.
# random_state is fixed so that Restart & Run All yields the same partition
# (and therefore comparable scores) on every run; outputs captured from
# earlier unseeded runs will differ slightly after re-execution.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['target'], random_state=42
)

# Vectorization

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
# TF-IDF vectorizer; max_df=0.7 makes it ignore terms whose document
# frequency is above 70% of the documents it is fitted on.
my_tfidf = TfidfVectorizer(max_df = 0.7)

In [41]:
# Fit the vectorizer on the training texts only and transform them, then
# transform the test texts with the already-fitted vectorizer, so the test
# split does not influence what the vectorizer learns.
tfidf_train = my_tfidf.fit_transform(x_train)
tfidf_test = my_tfidf.transform(x_test)

In [42]:
# Print the training TF-IDF matrix; the output lists (row, column) positions
# with their weights.
print(tfidf_train)

  (0, 17791)	0.010783769067692022
  (0, 26078)	0.034521580346453855
  (0, 39808)	0.012654413973152293
  (0, 68259)	0.025367961781028403
  (0, 41129)	0.017360152155696064
  (0, 44477)	0.023808603114881952
  (0, 82316)	0.011449886374490067
  (0, 86893)	0.021863099272296738
  (0, 65672)	0.019468408832810615
  (0, 10694)	0.10111534276837832
  (0, 53750)	0.04453819227928012
  (0, 87944)	0.012667690954218061
  (0, 25423)	0.03704487264246641
  (0, 69056)	0.0176779376739509
  (0, 37558)	0.05480518637747805
  (0, 74042)	0.04640370491278206
  (0, 67756)	0.033272910444685985
  (0, 78055)	0.015134274809446728
  (0, 3519)	0.0291474540464965
  (0, 71267)	0.015646552352832917
  (0, 74000)	0.016134951170604143
  (0, 7561)	0.013571539638938163
  (0, 64622)	0.021757326769625492
  (0, 64859)	0.02184371541008069
  (0, 66152)	0.026331459958566467
  :	:
  (33672, 48348)	0.018808170254053624
  (33672, 67147)	0.012887035772915908
  (33672, 22365)	0.00947132530272064
  (33672, 89832)	0.018035086137596277
  (33

# Logistic regression

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [44]:
# Logistic-regression baseline: fit on the TF-IDF training matrix (fit returns
# the estimator itself, so construction and fitting are chained), predict the
# test matrix, and print the test accuracy as a percentage.
model_1 = LogisticRegression(max_iter=900).fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
cr1 = accuracy_score(y_test, pred_1)  # fraction of test labels predicted correctly
print(cr1*100)

98.8596881959911


# PassiveAggressiveClassifier

In [45]:
from sklearn.linear_model import PassiveAggressiveClassifier
# The classifier shuffles the training data between passes by default, so
# without a seed the fitted model (and the score in the next cell) changes
# from run to run. Fixing random_state keeps Restart & Run All reproducible.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [46]:
# Predict the test matrix with the passive-aggressive model and print its
# accuracy as a percentage (accuracy_score returns a fraction, hence *100).
y_pred = model.predict(tfidf_test)
accscore = accuracy_score(y_test, y_pred)
print('The accuracy of prediction is ', accscore*100)

The accuracy of prediction is  99.6792873051225
