# Import important libraries

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

We start by reading the three CSV files (train, test, and submit) to get a brief understanding of the data we are working with.

In [2]:
# Load the labelled training set, the unlabelled test articles, and the
# id/label file for the test articles from the Datasets/ folder.
train_df, raw_test_X, raw_test_y = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train.csv', 'Datasets/test.csv', 'Datasets/submit.csv')
)

In [3]:
# Preview the first rows of the frame read from submit.csv (id and label per test article).
raw_test_y.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [4]:
# (rows, columns) of the submit.csv frame.
raw_test_y.shape

(5200, 2)

In [5]:
# Data type of each column in the submit.csv frame.
raw_test_y.dtypes

id       int64
label    int64
dtype: object

In [6]:
# Preview the first rows of the frame read from test.csv (article fields, no label column).
raw_test_X.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [7]:
# (rows, columns) of the test.csv frame.
raw_test_X.shape

(5200, 4)

In [8]:
# Data type of each column in the test.csv frame.
raw_test_X.dtypes

id         int64
title     object
author    object
text      object
dtype: object

In [9]:
# Preview the first rows of the frame read from train.csv (article fields plus label).
train_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
# (rows, columns) of the train.csv frame.
train_df.shape

(20800, 5)

In [11]:
# Data type of each column in the train.csv frame.
train_df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

## Data Preprocessing

In [12]:
# Attach the labels from submit.csv to the test articles by joining on 'id',
# then stack the result under the training rows to get one labelled frame.
# NOTE(review): submit.csv may be a sample-submission file rather than ground
# truth -- confirm its labels are real before training/evaluating on them.
merged_test = raw_test_X.merge(raw_test_y, on='id')
print(merged_test.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the row labels of both inputs.
merged_df = pd.concat([train_df, merged_test])
print(merged_df.shape)

(5200, 5)
(26000, 5)


In [13]:
# Summarise merged_df: index range, per-column non-null counts, dtypes and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      26000 non-null  int64 
 1   title   25320 non-null  object
 2   author  23540 non-null  object
 3   text    25954 non-null  object
 4   label   26000 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.2+ MB


In [14]:
# Preview the first values of the 'text' column, which is used as the feature column
# for the model later in the notebook.
merged_df['text'].head()

0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    Ever get the feeling your life circles the rou...
2    Why the Truth Might Get You Fired October 29, ...
3    Videos 15 Civilians Killed In Single US Airstr...
4    Print \nAn Iranian woman has been sentenced to...
Name: text, dtype: object

In [15]:
# Count the missing values in every column (not only 'text') of merged_df.
merged_df.isnull().sum()

id           0
title      680
author    2460
text        46
label        0
dtype: int64

In [16]:
# Discard every row whose 'text' is missing, then recount the missing values
# per column to confirm that 'text' no longer has any.
merged_df = merged_df.dropna(subset=['text'])
merged_df.isna().sum()

id           0
title      680
author    2414
text         0
label        0
dtype: int64

In [17]:
# Renumber the rows 0..n-1: the row labels are neither unique nor contiguous
# once the two frames have been stacked and rows have been dropped.
# drop=True discards the old labels; a bare reset_index() would instead copy
# them into a stray 'index' column of the frame.
merged_df = merged_df.reset_index(drop=True)

In [18]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that re-running the notebook reproduces the
# same split (and therefore the same accuracy) instead of a new one each time.
X_train, X_test, y_train, y_test = train_test_split(
    merged_df['text'], merged_df['label'], test_size=0.2, random_state=42
)

In [19]:
# Create a TF-IDF vectorizer that removes scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training texts only and transform them in one step,
# then transform the held-out texts with that already-fitted vectorizer, so
# nothing is learned from the test split.
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

In [20]:
# Fit a multinomial Naive Bayes classifier on the training TF-IDF matrix
clf = MultinomialNB().fit(tfidf_train, y_train)

# Predict labels for the held-out TF-IDF matrix
pred = clf.predict(tfidf_test)
# Calculate the fraction of held-out labels predicted correctly and report it as a percentage
score = accuracy_score(y_test, pred)
print('Accuracy score: {:.2f}%'.format(score * 100))

Accuracy score: 78.89%
