### Identifying Unreliable news articles

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import confusion_matrix

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df.shape

(20800, 5)

In [4]:
df.isnull().any()

id        False
title      True
author     True
text       True
label     False
dtype: bool

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [6]:
# drop nulls
df.dropna(inplace=True)
df.reset_index(inplace = True)
df.shape

(18285, 6)

In [7]:
titles = df['title']
y = df['label']

In [8]:
# preprocess text
ps = PorterStemmer()
corpus = []

for i in range(0, len(titles)):
    title = re.sub('[^a-zA-Z]', ' ', titles[i])
    title = title.lower()
    title = title.split()
    title = [ps.stem(word) for word in title if not word in stopwords.words('english')]
    title = ' '.join(title)
    corpus.append(title)
    

In [9]:
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

In [10]:
# creating the BOW 
countvector = CountVectorizer(max_features=5000, ngram_range=(1, 3))
X = countvector.fit_transform(corpus).toarray()

In [11]:
X.shape

(18285, 5000)

In [20]:
countvector.get_feature_names()[:10] # features from BOW

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut']

In [18]:
countvector.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((14628, 5000), (3657, 5000))

In [28]:
### Passive aggresive classifier

model2 = PassiveAggressiveClassifier(max_iter=50)
model2.fit(X_train, y_train)
model2.score(X_test, y_test)

0.9108558928083128

In [29]:
y_pred = model2.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[1898  184]
 [ 142 1433]]


In [24]:
### Multinomial Naive bayes

model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [25]:
model.score(X_test, y_test)

0.8944490019141372

In [27]:
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[1851  231]
 [ 155 1420]]


In [36]:
# hyperparameter tuning for alpha
model = MultinomialNB(alpha=0.1)
pre_score = 0

for alpha in np.arange(0,1,0.1):
    sub_model = MultinomialNB(alpha=alpha)
    sub_model.fit(X_train, y_train)
    
    score = sub_model.score(X_test, y_test)
    if score > pre_score:
        model = sub_model
        pre_score = score
    print("alpha : {}, score: {}".format(alpha, score))



alpha : 0.0, score: 0.8895269346458846
alpha : 0.1, score: 0.8955427946404156
alpha : 0.2, score: 0.8969100355482637
alpha : 0.30000000000000004, score: 0.8969100355482637
alpha : 0.4, score: 0.8955427946404156
alpha : 0.5, score: 0.896636587366694
alpha : 0.6000000000000001, score: 0.8969100355482637
alpha : 0.7000000000000001, score: 0.8960896910035548
alpha : 0.8, score: 0.8955427946404156
alpha : 0.9, score: 0.895269346458846


In [37]:
model.score(X_test, y_test) # best score 

0.8969100355482637

In [38]:
model # best alpha

MultinomialNB(alpha=0.2)

In [42]:
###### see most fake and most real words

feature_names = countvector.get_feature_names()

In [46]:
model.coef_

array([[ -8.96067895,  -8.68109409,  -9.35014372, ..., -12.39466616,
         -8.68109409,  -9.62207744]])

In [55]:
# most fake
sorted(zip(model.coef_[0], feature_names))[:10]   # more negetaive means 'more fake'

[(-12.394666159084977, 'abroad'),
 (-12.394666159084977, 'abus new'),
 (-12.394666159084977, 'abus new york'),
 (-12.394666159084977, 'act new'),
 (-12.394666159084977, 'act new york'),
 (-12.394666159084977, 'adopt'),
 (-12.394666159084977, 'advic'),
 (-12.394666159084977, 'advis new'),
 (-12.394666159084977, 'advis new york'),
 (-12.394666159084977, 'age new')]

In [56]:
# most real
sorted(zip(model.coef_[0], feature_names), reverse=True)[:10] 

[(-3.881280206011694, 'trump'),
 (-4.1960267037876084, 'hillari'),
 (-4.2643126116537395, 'clinton'),
 (-4.768583401012597, 'elect'),
 (-5.111217930328347, 'new'),
 (-5.1497246127479706, 'comment'),
 (-5.171370479522664, 'video'),
 (-5.275030521067342, 'war'),
 (-5.2872406849742735, 'hillari clinton'),
 (-5.316324579527307, 'us')]