In [3]:
# Import the neccesary packages
import pandas as pd
import numpy as np

In [4]:
# Ignore the warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = pd.read_csv('news_dataset.csv')

In [6]:
data.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [7]:
data.columns

Index(['label', 'text'], dtype='object')

In [8]:
data.info

<bound method DataFrame.info of      label                                               text
0     REAL  Payal has accused filmmaker Anurag Kashyap of ...
1     FAKE  A four-minute-long video of a woman criticisin...
2     FAKE  Republic Poll, a fake Twitter account imitatin...
3     REAL  Delhi teen finds place on UN green list, turns...
4     REAL  Delhi: A high-level meeting underway at reside...
...    ...                                                ...
3724  REAL  19:17 (IST) Sep 20\n\nThe second round of coun...
3725  REAL  19:17 (IST) Sep 20\n\nThe second round of coun...
3726  FAKE  The Bengaluru City Police’s official Twitter h...
3727  REAL  Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\...
3728  REAL  Read Also\n\nRead Also\n\nAdvocate Ishkaran Bh...

[3729 rows x 2 columns]>

In [12]:
data['label'].describe()

count     3729
unique       2
top       FAKE
freq      1877
Name: label, dtype: object

In [13]:
data['text'].describe()

count                                                  3721
unique                                                 2229
top       Highest IPL score in Dubai\n\nMilestones to wa...
freq                                                     79
Name: text, dtype: object

In [17]:
df = data.dropna()
df.shape

(3721, 2)

In [18]:
df['label'].unique()

array(['REAL', 'FAKE'], dtype=object)

In [19]:
df.label.value_counts()

FAKE    1871
REAL    1850
Name: label, dtype: int64

In [20]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [21]:
# remove whitespaces
df['text']=df['text'].str.strip()
# lowercase the text
df['text'] = df['text'].str.lower()
#remove punctuation
punc = string.punctuation
table = str.maketrans('','',punc)
df['text']=df['text'].apply(lambda x: x.translate(table))
# tokenizing each message
df['text']=df.apply(lambda x: x['text'].split(' '),axis=1)
# removing stopwords
df['text'] = df.apply(lambda x: [word for word in x['text'] if word not in stopwords.words('english')],axis=1)
# stemming
ps = PorterStemmer()
df['text']= df.apply(lambda x: [ps.stem(word) for word in x['text']],axis=1)
# remove single letter words
df['text'] = df.apply(lambda x: ' '.join([word for word in x['text'] if len(word)>1]),axis=1)


In [22]:
X = np.array(df["text"])
y = np.array(df["label"])

In [23]:
y[0:10]

array(['REAL', 'FAKE', 'FAKE', 'REAL', 'REAL', 'REAL', 'FAKE', 'FAKE',
       'FAKE', 'FAKE'], dtype=object)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_X = cv.fit_transform(X) # Fit the Data

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cv_X, y, test_size=0.2, random_state=42)

In [30]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)

In [32]:
PredictDT = DT.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score
print('Accuracy of Decision Tree Classifier',accuracy_score(y_test,PredictDT)*100)

Accuracy of Decision Tree Classifier 99.5973154362416


In [34]:
from sklearn.metrics import confusion_matrix
print('Confusuion matrix of Decision Tree Classifier\n',confusion_matrix(y_test,PredictDT))

Confusuion matrix of Decision Tree Classifier
 [[361   0]
 [  3 381]]


In [35]:
from sklearn.metrics import classification_report
print('Classification report of Decision Tree Classifier\n\n',classification_report(y_test,PredictDT))

Classification report of Decision Tree Classifier

               precision    recall  f1-score   support

        FAKE       0.99      1.00      1.00       361
        REAL       1.00      0.99      1.00       384

    accuracy                           1.00       745
   macro avg       1.00      1.00      1.00       745
weighted avg       1.00      1.00      1.00       745



In [37]:
import joblib
joblib.dump(DT, 'Dt.pkl')
joblib.dump(cv, 'cv_2.pkl')

['cv_2.pkl']