# Fake news classifier using Natural language processing And Machine Learning

In [1]:
import pandas as pd


## Source of data : https://www.kaggle.com/c/fake-news/data

In [2]:

data=pd.read_csv("train.csv")

In [3]:
data.head()


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
data.shape

(20800, 5)

## Removing rows with nan values

In [7]:

data=data.dropna()
data.shape

(18285, 5)

## Again resetting indexes of rows

In [8]:

data.reset_index(inplace=True)


In [9]:
data

Unnamed: 0,index,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
7,9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
8,10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
9,11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0


## Taking title column of news for classification

In [10]:

x=data.title

In [11]:
x.head()


0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                    Why the Truth Might Get You Fired
3    15 Civilians Killed In Single US Airstrike Hav...
4    Iranian woman jailed for fictional unpublished...
Name: title, dtype: object

## Label is independent variable where 1 is fake and 0 is real. 

In [12]:

y=data.label

In [13]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

## Doing text preprocessing by removing stop words and stemming the words

In [14]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
import re

In [15]:
ps = PorterStemmer() 
corpus=[]
for i in range(0,len(x)):
    review=re.sub('[^a-zA-Z]',' ',x[i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(j) for j in review if j not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)


In [16]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

## Converting text data to respectable numerical form using tfidfvectorizer

In [17]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
x

<18285x13913 sparse matrix of type '<class 'numpy.float64'>'
	with 155881 stored elements in Compressed Sparse Row format>

In [18]:
#splitting of data into training and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

## Using multinomialnb classification model

In [19]:

from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [20]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df


Unnamed: 0,f1-score,precision,recall,support
0,0.878049,0.802909,0.968704,3419.0
1,0.79673,0.943979,0.68922,2616.0
accuracy,0.847556,0.847556,0.847556,0.847556
macro avg,0.837389,0.873444,0.828962,6035.0
weighted avg,0.842799,0.864059,0.847556,6035.0


## Hyperparameter tuning on values of alpha in multinomialnb to find best model

In [21]:

from sklearn.metrics import f1_score
import numpy as np
for i in np.arange(0,1,0.1):
    model=MultinomialNB(alpha=i)
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    f1=f1_score(y_test, y_pred, average='micro')
    from sklearn.metrics import accuracy_score
    accuracy=accuracy_score(y_test, y_pred)
    print(f"alpha = {i} , f1 score = {f1} and accuracy = {accuracy}")
    
    
    
    
    

  'setting alpha = %.1e' % _ALPHA_MIN)


alpha = 0.0 , f1 score = 0.8210439105219552 and accuracy = 0.8210439105219552
alpha = 0.1 , f1 score = 0.8574979287489645 and accuracy = 0.8574979287489644
alpha = 0.2 , f1 score = 0.8608119304059652 and accuracy = 0.8608119304059652
alpha = 0.30000000000000004 , f1 score = 0.8629660314830158 and accuracy = 0.8629660314830158
alpha = 0.4 , f1 score = 0.8606462303231152 and accuracy = 0.8606462303231152
alpha = 0.5 , f1 score = 0.8591549295774648 and accuracy = 0.8591549295774648
alpha = 0.6000000000000001 , f1 score = 0.8570008285004141 and accuracy = 0.8570008285004143
alpha = 0.7000000000000001 , f1 score = 0.8553438276719137 and accuracy = 0.8553438276719139
alpha = 0.8 , f1 score = 0.8528583264291631 and accuracy = 0.8528583264291633
alpha = 0.9 , f1 score = 0.8492129246064622 and accuracy = 0.8492129246064622


## Best multinomialnb model is at alpha = 0.3 with f1 score of 0.8629

# Now using xgboost classifier

In [22]:
params={
    "learning rate":[0.05,0.10,0.15,0.20,0.25,0.30],
    "max_depth":[3,4,5,6,8,10,12,15],
    "min_child_weight":[1,3,5,7],
    "gamma":[0.0,0.1,0.2,0.3,0.4],
    "colsample_bytree":[0.3,0.4,0.5,0.7]
}

## Doing hyperparameter tuning of xgboost classifier using randomized search cv for various values of parameters above

In [23]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost
classifier=xgboost.XGBClassifier()
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring="roc_auc",n_jobs=-1,cv=5,verbose=3)
random_search.fit(x,y)



Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished


Parameters: { learning rate } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_co...
                                           validate_parameters=None,
                                           verbosity=None),
                   iid='warn', n_iter=5, n_jobs=-1,
                   param_distributions={'co

In [24]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.4, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning rate=0.1, learning_rate=0.300000012, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
random_search.best_params_

{'min_child_weight': 1,
 'max_depth': 5,
 'learning rate': 0.1,
 'gamma': 0.4,
 'colsample_bytree': 0.7}

## Creating a new model by passing best parameters found and checking scores on it

In [32]:
final_classifier=xgboost.XGBClassifier(min_child_weight= 1,
 max_depth= 5,learning_rate = 0.1,
 gamma= 0.4,
 colsample_bytree= 0.7)
            

In [35]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'f1_score' : make_scorer(f1_score)}


In [38]:
from sklearn.model_selection import cross_validate
score=cross_validate(final_classifier,x,y,cv=10,scoring=scoring)

In [39]:
score

{'fit_time': array([5.09700441, 5.44300866, 6.39800024, 7.30701613, 5.67098904,
        5.44899607, 5.14900279, 5.06600118, 5.0169909 , 5.01899791]),
 'score_time': array([0.10599422, 0.0859952 , 0.13500929, 0.16898155, 0.08200216,
        0.08700013, 0.08599734, 0.09000731, 0.08300209, 0.08599734]),
 'test_accuracy': array([0.92349727, 0.89885183, 0.92236195, 0.92400219, 0.91411379,
        0.91684902, 0.91411379, 0.89715536, 0.91301969, 0.9059081 ]),
 'test_f1_score': array([0.91745283, 0.89494605, 0.91695906, 0.91932676, 0.90940565,
        0.91152503, 0.90866783, 0.89244851, 0.90803933, 0.90103567])}

In [43]:
score['test_accuracy'].mean()

0.9129872970185708

In [42]:
score['test_f1_score'].mean()

0.9079806732590056

## Got the f1 score of 0.9079 by using xgboost classifier