In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
real = pd.read_csv('./data/True.csv')
real['target'] = np.zeros(real.shape[0])
fake = pd.read_csv('./data/Fake.csv')
fake['target'] = np.ones(fake.shape[0])

df = real.append(fake, ignore_index=True)

In [51]:
df = df.drop(['subject', 'date'], axis=1)
df.head()

Unnamed: 0,title,text,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0.0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0.0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0.0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0.0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0.0


In [52]:
train = df.copy()
train.head()

Unnamed: 0,title,text,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0.0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0.0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0.0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0.0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0.0


In [53]:
stop_words = stopwords.words("english")

In [54]:
train['title'] = train['title'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
train['text'] = train['text'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
train.head()

Unnamed: 0,title,text,target
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,0.0
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,0.0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,0.0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,0.0
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,0.0


In [None]:
train['title'] = train['title'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
train['text'] = train['text'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
train.head()

In [None]:
!pip install textblob

In [55]:
from textblob import Word
from nltk.corpus import wordnet
train['title'] = train['title'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))
train['text'] = train['text'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))
train.head()

Unnamed: 0,title,text,target
0,"a u.s. budget fight looms, republican flip the...",washington (reuters) - the head of a conservat...,0.0
1,u.s. military to accept transgender recruit on...,washington (reuters) - transgender people will...,0.0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,0.0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,0.0
4,trump want postal service to charge 'much more...,seattle/washington (reuters) - president donal...,0.0


In [56]:
train1 = train[['text', 'target']]

In [57]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train1['text'], train1['target'], test_size=0.2, random_state=42)

In [58]:
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))
tfidf.fit(train['text'])

x_t = tfidf.transform(x_train)
x_v = tfidf.transform(x_valid)
x_t

<35918x100 sparse matrix of type '<class 'numpy.float64'>'
	with 824618 stored elements in Compressed Sparse Row format>

In [63]:
def model_training(model, x_t, y_t, x_v, y_v):
    model.fit(x_t, y_t)
    print('Training Score', model.score(x_t, y_t))
    

    print('Validation Score', model.score(x_v, y_v))
    print('F1 Score', f1_score(model.predict(x_v), y_v))
    print('Confusion Matrix\n', confusion_matrix(y_v, model.predict(x_v)))
    print('Validation ROC_AUC_SCORE', roc_auc_score(y_v,model.predict_proba(x_v)[::,-1]), '\n')
    
    return model

In [38]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.0.2-py3-none-win_amd64.whl (24.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.0.2


In [62]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

xgboost = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
lgr = LogisticRegression(n_jobs=1)

print('XGBoost')
xgboost = model_training(xgboost, x_t, y_train, x_v, y_valid)
print('Logistic Regression')
lgr = model_training(lgr, x_t, y_train, x_v, y_valid)

XGBoost
Training Score 0.9999721588061696
Validation Score 0.9962138084632517
F1 Score 0.9963393626184324
Confusion Matrix [[4319   11]
 [  23 4627]]
Validation ROC_AUC_SCORE 0.999429337703941 

Logistic Regression
Training Score 0.9821537947547191
Validation Score 0.9819599109131403
F1 Score 0.9825242718446602
Confusion Matrix [[4264   66]
 [  96 4554]]
Validation ROC_AUC_SCORE 0.9971262757952767 



# Summary
So we can see that XGBoost performs better though it takes higher time for training the model, Logistic Regression does almost equally good and falls behind XGBoost in F1 score by 1% but training time is much lesser...

In [67]:
def fake_or_not(model, text):
    data = ' '.join([Word(x.lower()).lemmatize() for x in text.split()])
    data = tfidf.transform([data])
    prediction = model.predict(data)
    return prediction

# Improvements
Maybe better metrics can be obtained by combining the title and the text instead of completely ignoring the title, another improvement that could be made is using Gensism