In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Using Fake and Real news datasets
import pandas as pd
import re
data_true =pd.read_csv("../input/fake-and-real-news-dataset/True.csv")
data_fake =pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")

# **Personal project:** Create a Fake News detector

I'm going to use the two files of the fake-and-real-news dataset to train models that detect fake news articles.

I would mainly use newspaper article headlines.

This notebook consists of 3 parts

**1. Exploring the Dataset**

**2. feature engineering and preprocessing**

**3. Classification**
> * Compare models on the training set
> * Compare the best models on the testing set
> 

*I hope you will enjoy reading!!*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Upload the data sets.

In [None]:
print(data_true.shape)
print(data_true.columns)
data_true["Target"]="True"

In [None]:
data_true.head(2)

In [None]:
print(data_fake.shape)
print(data_fake.columns)
data_fake["Target"]="Fake"

In [None]:
data_fake.head(2)

In [None]:
 df=pd.concat([data_true, data_fake], ignore_index=True)
print(df.shape)
df.head()

# 1.  Exploring the data

* **"Subject" analysis**

In [None]:
fig=plt.figure(figsize=[12,9])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

plt.title("Subjects",size=18)
sns.countplot(data=df, x="subject",hue="Target")

In [None]:
print(df["subject"].unique())

Re-encode the label of the Subjects

In [None]:
def encode_subject(label):
    """Collapse a raw ``subject`` label into one of three coarse groups.

    Returns "politics" for the political feeds, "world news" for
    'worldnews' and 'News', and "US_News" for any other label.
    """
    groups = (
        ("politics", ("politicsNews", "politics", "Government News", "left-news")),
        ("world news", ("worldnews", "News")),
    )
    for group_name, raw_labels in groups:
        if label in raw_labels:
            return group_name
    # Anything not listed above falls into the catch-all bucket.
    return "US_News"

df["subject"]=df["subject"].apply(encode_subject) 

In [None]:
fig=plt.figure(figsize=[10,7])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

plt.title("Subjects",size=18)
sns.countplot(data=df, x="subject",hue="Target", palette="Set3",edgecolor="black")

we remove the rows whose subject is US_News


In [None]:
df=df.loc[df["subject"]!="US_News"]

* **Title and text length analysis**

> **Question:** Is there a link between the title length and the target variable?

In [None]:
import nltk

def count_words(title):
    r"""Return the number of word tokens in ``title``.

    A token is a maximal run of word characters (regex ``\w+``), so
    punctuation and whitespace act as separators ("It's" counts as two).

    The standard-library ``re`` module is used directly instead of building
    a new ``nltk.RegexpTokenizer`` on every call: this function is applied
    row by row, and ``re`` caches the compiled pattern.
    """
    return len(re.findall(r"\w+", title))
df["n_words in title"]=df["title"].apply(count_words)


In [None]:
fig=plt.figure(figsize=[8,5])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

plt.title("Number of words in the title.",size=18)
sns.boxplot(data=df, x="Target",y="n_words in title",showfliers=False,width=0.4,color="#a5acce")

The fake publications seem to have longer titles than the true ones.

We apply the same approach to the full **text**.

In [None]:
df["n_words in text"]=df["text"].apply(count_words)

In [None]:
fig=plt.figure(figsize=[8,5])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

plt.title("Number of words in the text.",size=18)
sns.boxplot(data=df, x="Target",y="n_words in text",showfliers=False,width=0.4,color="#a5acce")
plt.grid()

# 2. Feature engineering

In this section I will work with a 6000-row sample.
 
We extract features from the titles using **TfidfVectorizer**. We fit this algorithm on the training set only, so the test set is not used to build the model or the preprocessing steps.

This extracts a large number of variables, so we apply a dimensionality reduction with **PCA**.

Finally, we encode the subject with **.get_dummies()**.

In [None]:
def get_sample(N, df, random_state=None):
    """Draw a random sample of exactly ``N`` rows from ``df``.

    Parameters
    ----------
    N : int
        Number of rows to draw.
    df : pandas.DataFrame
        Frame to sample from.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Rows are drawn WITHOUT replacement whenever ``df`` has at least ``N``
    rows. Sampling with replacement duplicates rows, and a later train/test
    split can put copies of the same row on both sides. Replacement is only
    used as a fallback when more rows are requested than exist.
    """
    needs_replacement = bool(N > df.shape[0])
    # n=N (rather than frac=N/len) guarantees the exact requested row count.
    sample = df.sample(n=N, replace=needs_replacement, random_state=random_state)
    print("A {} rows sample as been extracted.".format(N))
    return sample
sample=get_sample(6000,df)

In [None]:
sample.head(2)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


y=sample["Target"]
X=sample.drop(columns=["Target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

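The following function returns the cleaned text corpus: it tokenizes each text, removes punctuation and stop words, stems the remaining words, and drops single-character tokens and tokens containing digits.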

In [None]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def a_number(word):
    """Return True if ``word`` contains at least one ASCII digit (0-9), else False.

    Always returns a real ``bool``; an empty ``word`` gives False.
    """
    digits = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
    return any(str(letter) in digits for letter in word)
def not_a_word(word):
    """Return True when ``word`` should be discarded as a non-word.

    A token is rejected if it is a single character or if ``a_number``
    reports that it contains a digit; otherwise False is returned.
    """
    is_single_char = len(word) == 1
    return is_single_char or bool(a_number(word))

def extract_corpus(col,df):
    """Build a cleaned, stemmed corpus from the text column ``col`` of ``df``.

    Each text is tokenized on ``\\w+`` (which drops punctuation), lowercased,
    stripped of English stop words, stemmed with the Snowball stemmer, and
    finally filtered with ``not_a_word`` (single characters and tokens
    containing digits are removed).

    Parameters
    ----------
    col : str
        Name of the column holding the raw text.
    df : pandas.DataFrame
        Frame containing ``col``.

    Returns
    -------
    list of str
        One space-joined string of stems per row, in row order.
    """
    snowstem = SnowballStemmer(language="english")
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    extra_stop_words = ["for","is","a","of","no","not","he","she",
                        "this","on","it","to","in","at","is", "or",
                        "in","not","by","if","in"]
    stop_words = set(stopwords.words("english")).union(extra_stop_words)

    corpus = []
    for text in df[col].values:
        # Lowercase BEFORE the stop-word lookup: the stop words are lowercase,
        # so capitalised tokens such as "The" or "IS" would otherwise slip
        # through the filter.
        tokens = [token.lower() for token in tokenizer.tokenize(text)]
        stems = [snowstem.stem(token) for token in tokens if token not in stop_words]
        kept = [stem for stem in stems if not not_a_word(stem)]
        corpus.append(' '.join(kept))
    return corpus




This function determines the number of principal components needed to keep **60% of the explained variance**.

It returns an unfitted PCA instance configured with that number of components, together with the number itself.

In [None]:
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def nbr_of_pca(features, variance_threshold=0.6):
    """Find how many principal components reach a share of explained variance.

    ``features`` is standardised, a full-rank PCA is fitted once to obtain
    the scree, and the smallest number of leading components whose
    cumulative explained-variance ratio is >= ``variance_threshold`` is
    selected.

    Parameters
    ----------
    features : pandas.DataFrame or 2-D array
        Feature matrix (rows = samples, columns = variables).
    variance_threshold : float, optional
        Target cumulative explained-variance ratio, in (0, 1]. Default 0.6.

    Returns
    -------
    pca : sklearn.decomposition.PCA
        A new, UNFITTED PCA configured with ``n_components=nbr_pca``.
    nbr_pca : int
        The selected number of components (always >= 1).
    """
    scaler=StandardScaler()
    Xs=scaler.fit_transform(features)

    # Fit once with every available component to read the full scree.
    full_pca = decomposition.PCA(n_components=min(features.shape[1],
                                                  features.shape[0])).fit(Xs)
    cumulative = np.cumsum(full_pca.explained_variance_ratio_)

    # searchsorted returns the first 0-based position whose cumulative ratio
    # reaches the threshold; the component COUNT is that position + 1.
    # Clamp to the scree length in case rounding keeps the total just below
    # the threshold.
    first_position = int(np.searchsorted(cumulative, variance_threshold, side="left"))
    nbr_pca = min(first_position + 1, len(cumulative))

    print("{} principal components explaines  {:.0f}% of the total variance".format(
        nbr_pca, variance_threshold*100))
    print("Sum of variance explained :{}%".format(round(cumulative[nbr_pca-1]*100,2)))

    pca = decomposition.PCA(n_components=nbr_pca)

    return pca,nbr_pca

This function **preprocesses the training set** and returns the **PCA and the vectorizer** fitted on it, the number of principal components, and the processed training set.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

def preprocess_train_set(train):
    """Fit the title preprocessing on ``train`` and return the processed frame.

    Steps: build the cleaned title corpus, fit a ``TfidfVectorizer`` on it,
    standardise the TF-IDF matrix, reduce it with a PCA sized by
    ``nbr_of_pca``, append the components to the numeric columns and
    one-hot encode ``subject``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain "title", "subject", "n_words in title" and
        "n_words in text".

    Returns
    -------
    pca : fitted PCA
    N : int
        Number of principal components kept.
    vectorizer : fitted TfidfVectorizer
    ref : pandas.DataFrame
        Processed training features, indexed like ``train``.
    """
    # .copy(): columns are added below, and writing into a slice of `train`
    # would raise SettingWithCopyWarning (chained assignment).
    ref=train[["subject","n_words in title","n_words in text"]].copy()

    # Create the corpus used to fit the vectorizer.
    corpus=extract_corpus("title",train)
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(corpus)
    # Sparse matrix -> dense DataFrame (toarray gives a plain ndarray).
    features=pd.DataFrame(features.toarray())

    # Apply PCA with the number of components chosen by nbr_of_pca.
    # NOTE(review): this scaler is fitted here but not returned, so callers
    # cannot reuse it on other data.
    scaler=StandardScaler()
    Xs=scaler.fit_transform(features)
    pca,N=nbr_of_pca(features)
    d=pca.fit_transform(Xs)

    d=pd.DataFrame(d, columns=["Title PCA n°{}".format(i+1) for i in range(0,N) ])
    # Positional assignment (.values) because `d` has a fresh RangeIndex
    # while `ref` keeps the index labels of `train`.
    for name in d.columns:
        ref[name]=d[name].values
    ref=pd.get_dummies(data=ref, columns=["subject"])
    # Return the fitted PCA and vectorizer, the number of principal
    # components, and the preprocessed data.
    return pca,N,vectorizer, ref
    

This function takes the **PCA and vectorizer** already fitted on the training set and returns the processed testing set.

In [None]:
def preprocess_test(vectorizer,pca,N,test,scaler=None):
    """Apply already-fitted title preprocessing to ``test``.

    Parameters
    ----------
    vectorizer : fitted TfidfVectorizer
    pca : fitted PCA
    N : int
        Number of principal components produced by ``pca``.
    test : pandas.DataFrame
        Must contain "title", "subject", "n_words in title" and
        "n_words in text".
    scaler : fitted scaler or None, optional
        If given, its ``transform`` standardises the TF-IDF features, so
        statistics learned elsewhere can be reused. If None (default), a new
        ``StandardScaler`` is fitted on ``test`` itself, as before.

    Returns
    -------
    pandas.DataFrame
        Processed features, indexed like ``test``.
    """
    # .copy(): columns are added below, and writing into a slice of `test`
    # would raise SettingWithCopyWarning (chained assignment).
    ref=test[["subject","n_words in title","n_words in text"]].copy()
    corpus=extract_corpus("title",test)
    # Vectorizer already fitted: transform only.
    features = vectorizer.transform(corpus)
    features=pd.DataFrame(features.toarray())

    if scaler is None:
        # Backward-compatible default: standardise with this set's own statistics.
        Xs=StandardScaler().fit_transform(features)
    else:
        Xs=scaler.transform(features)
    # PCA already fitted: transform only.
    d=pca.transform(Xs)

    d=pd.DataFrame(d, columns=["Title PCA n°{}".format(i+1) for i in range(0,N) ])
    # Positional assignment (.values) because `d` has a fresh RangeIndex
    # while `ref` keeps the index labels of `test`.
    for name in d.columns:
        ref[name]=d[name].values
    ref=pd.get_dummies(data=ref, columns=["subject"])
    return ref

In [None]:
pca,N,vectorizer, Xtrain=preprocess_train_set(X_train)

In [None]:

Xtest=preprocess_test(vectorizer,pca,N,X_test)

In [None]:
print("Trainning set:")
print(Xtrain.shape[0],'Rows',Xtrain.shape[1],"columns")

In [None]:
print("Testing set:")
print(Xtest.shape[0],'Rows',Xtest.shape[1],"columns")

#  3.  Classification .

 # 3.1 Compare model on the trainning set

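The following function evaluates the models on the training set and returns the results as a dictionary.
We apply repeated stratified cross-validation (3 repeats): 10 splits for the models run on the raw features, and 5 splits for the models run on standardized features.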

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

def lets_try(train, y):
    """Cross-validate several classifiers on the training set.

    Parameters
    ----------
    train : pandas.DataFrame or 2-D array
        Processed training features.
    y : array-like
        Training labels.

    Returns
    -------
    dict
        Maps a model name to the array of accuracy scores obtained with
        repeated stratified k-fold cross-validation (3 repeats).

    Models that need standardised inputs are wrapped in a
    ``StandardScaler -> classifier`` pipeline, so the scaler is refitted on
    the training part of every fold. Scaling the whole training set once
    before cross-validation would let validation-fold statistics leak into
    the fit.
    """
    from sklearn.pipeline import make_pipeline

    def test_model(clf):
        # Raw features, 10 folds x 3 repeats.
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        return cross_val_score(clf, train, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # For the models which need standardized data.
    def test_model_scaler(clf):
        # Standardised features, 5 folds x 3 repeats; scaling happens per fold.
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
        scaled_clf = make_pipeline(StandardScaler(), clf)
        return cross_val_score(scaled_clf, train, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # (result key, estimator, evaluation routine, progress message)
    candidates = [
        ("SVC", SVC(kernel="linear"), test_model_scaler, "SVC done"),
        ("Logistic Regression", LogisticRegression(), test_model_scaler, "Logistic Regression done"),
        ("Kneighbors", KNeighborsClassifier(), test_model, "Kneighbors done"),
        ("SVC poly", SVC(kernel="poly"), test_model_scaler, "SVC poly done."),
        ("Random Forest Classifier", RandomForestClassifier(), test_model, "Random Forest Classifier done"),
        ("SVC RBF", SVC(kernel='rbf'), test_model_scaler, "SVC rbf done"),
    ]

    results = {}
    for name, clf, evaluate, message in candidates:
        results[name] = evaluate(clf)
        print(message)

    return results

In [None]:
dic_results=lets_try(Xtrain, y_train)

In [None]:
fig=plt.figure(figsize=[10,10])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Distribution of Cross-validation score  on the trainning set. \n 10 folds",size=16)
plt.boxplot(dic_results.values(),labels=dic_results.keys(),showmeans=True)
plt.ylabel("  Scores CV \n (Accuracy)",size=14)
plt.ylim(0.4,1)
plt.xticks(rotation=90)
plt.grid()


4 models seem very efficient on the training set:

> *  **Random forest**
> * **logistic regression**
> * **SVC kernel rbf**
> * **SVC kernel linear**

let's evaluate these models on the test set

# 3.2 Compare the best models on the testing set

In [None]:
ss=StandardScaler()
# Fit the scaler on the training set only, then apply the SAME transformation
# to the test set. Refitting on the test set would standardise it with its own
# mean/variance, so the two sets would not share a feature scale.
scaled_train=ss.fit_transform(Xtrain)
scaled_test=ss.transform(Xtest)

In [None]:
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2.
    # ConfusionMatrixDisplay.from_estimator accepts the same
    # (estimator, X, y, normalize=..., cmap=..., ax=...) call and returns a
    # display object exposing `.ax_`, so it is bound to the old name.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
from sklearn.metrics import confusion_matrix

# Random forest trained on the unscaled training features.
model0=RandomForestClassifier()
model0.fit(Xtrain,y_train)

# Row-normalised confusion matrix on the test set.
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model0,Xtest, y_test
                           , normalize='true', cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the testing set",size=14)
plt.show()

In [None]:
model1=LogisticRegression()
model1.fit(scaled_train,y_train)

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model1,scaled_test, y_test
                           , normalize='true', cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of logistic regression on the testing set",size=14)
plt.show()

In [None]:
model=SVC(kernel='rbf')

model.fit(scaled_train,y_train) 

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model, scaled_test, y_test
                           , normalize='true', cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the SVC(rbf kernel) on the testing set",size=14)
plt.show()

In [None]:
model=SVC(kernel='linear')

model.fit(scaled_train,y_train) 

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model, scaled_test, y_test
                           , normalize='true', cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the SVC( linear) on the testing set",size=14)
plt.show()

In [None]:
def convert_y_test(test):
    """Encode labels as integers and return them as a ``pd.Series``.

    The string "True" becomes 0 and every other value becomes 1. The result
    has a fresh default index, in the iteration order of ``test``.
    """
    return pd.Series([0 if label == "True" else 1 for label in test])
Y=convert_y_test(y_test)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score,auc
# NOTE(review): Y (built by convert_y_test) encodes "True" as 0 and everything
# else as 1, so the positive class seen by roc_curve is "Fake". Column 1 of
# predict_proba presumably holds P("True"), since scikit-learn sorts classes_
# and "Fake" < "True" — confirm with model.classes_. If so, the score passed to
# roc_curve is inverted relative to Y.
# Logistic regression
pred_prob1 = model1.predict_proba(scaled_test)
fpr1, tpr1, thresh1 = roc_curve(Y, pred_prob1[:,1])

# Random forest
pred_prob0 = model0.predict_proba(Xtest)
fpr0, tpr0, thresh0 = roc_curve(Y, pred_prob0[:,1])
# NOTE(review): auc(x, y) receives (tpr, fpr) here, i.e. x and y swapped with
# respect to the usual auc(fpr, tpr). Together with the inverted score above
# this appears to cancel out and yield the AUC with "True" as the positive
# class — verify before changing either line on its own.
auc0= auc(tpr0,fpr0 )
auc1= auc(tpr1,fpr1 )

In [None]:
# Plot the ROC curves of the logistic regression and the random forest.
fig=plt.figure(figsize=[6,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("ROC curve",size=16)
# NOTE(review): the variables named tpr* are drawn on the x axis (labelled
# "False Positive Rate") and fpr* on the y axis. This matches how auc0/auc1
# were computed (auc(tpr, fpr)); presumably it compensates for the label/score
# orientation used in roc_curve — confirm before "fixing" the argument order.
plt.plot(tpr1,fpr1,  linestyle='--', label='Logistic regression')
plt.plot(tpr0,fpr0,  marker='.', label='RandomForest')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# annotate the AUC values (rounded to 3 decimals), then show the legend
plt.text(0.75,0.18,"AUC log:{} \n AUC RF:{}".format(round(auc1,3),round(auc0,3)))
plt.legend()
# show the plot
plt.show()

* **How large does the training set have to be to obtain good prediction performance?**

In [None]:
from sklearn.metrics import accuracy_score
    
def get_processed_set(n_rows):
    """Sample ``n_rows`` rows from the notebook-level ``df`` and preprocess them.

    The sample is split 67/33 into train and test (``random_state=42``), the
    preprocessing is fitted on the training part with
    ``preprocess_train_set`` and applied to the test part with
    ``preprocess_test``.

    Returns
    -------
    tuple
        ``(processed_train, processed_test, y_train, y_test)``.
    """
    drawn = get_sample(n_rows, df)
    target = drawn["Target"]
    predictors = drawn.drop(columns=["Target"])

    raw_train, raw_test, target_train, target_test = train_test_split(
        predictors, target, test_size=0.33, random_state=42
    )

    fitted_pca, n_components, fitted_vectorizer, processed_train = preprocess_train_set(raw_train)
    processed_test = preprocess_test(fitted_vectorizer, fitted_pca, n_components, raw_test)
    return processed_train, processed_test, target_train, target_test

def get_cv_score(train, test,y_train, y_test):
    """Score a logistic regression by cross-validation and on the test set.

    Parameters
    ----------
    train, test : pandas.DataFrame or 2-D array
        Processed training and testing features.
    y_train, y_test : array-like
        Matching labels.

    Returns
    -------
    cv_score : float
        Mean accuracy over a 5-fold, 3-repeat stratified cross-validation
        on the standardised training set.
    score_test : float
        Accuracy on the standardised test set of the model fitted on the
        whole training set.
    """
    model=LogisticRegression()
    ss=StandardScaler()
    # Fit the scaler on the training set only and reuse it for the test set,
    # so both sets share the same feature scale.
    scaled_train=ss.fit_transform(train)
    scaled_test=ss.transform(test)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    cv_score = cross_val_score(model,scaled_train, y_train, scoring='accuracy',
                               cv=cv, n_jobs=-1, error_score='raise').mean()

    model.fit(scaled_train,y_train)
    test_pred=model.predict(scaled_test)
    score_test=accuracy_score(y_test,test_pred)
    return cv_score, score_test

    

In [None]:
train_score=[]
test_score=[]
list_n=[200,500,1000,1500,3000,4500,6000,7000,8000]
train_shape=[]
for n in list_n:
    # Loop-local names: reusing Xtrain / Xtest / y_train / y_test / N here
    # would overwrite the notebook-level objects of the same names and leave
    # cells that read them depending on this loop's last iteration.
    Xtrain_n, Xtest_n, y_train_n, y_test_n=get_processed_set(n)
    # Cross-validation score on the training part, accuracy on the test part.
    cv_score, score_test=get_cv_score(Xtrain_n, Xtest_n, y_train_n, y_test_n)
    train_shape.append(Xtrain_n.shape[0])
    train_score.append(cv_score)
    test_score.append(score_test)


In [None]:
#Display scores
fig=plt.figure(figsize=[10,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Accuracy score according of the number \n of rows in the trainning set ",size=16)
plt.plot(train_shape,train_score,  linestyle='--', label='Train set ')
plt.plot(train_shape,test_score,  marker='.', label='Test set')
plt.xlabel("Number of rows")
plt.ylabel("Accuracy ")
plt.ylim(0.7,1)
plt.text(2500,0.72,"The testing set represent 1/3 of the total dataset")
plt.grid()
plt.legend()

# Conclusions :
The random forest and the logistic regression are the most efficient models. Since the **logistic regression** is much faster, we recommend using it.

For the training set, a sample of 4000 rows is enough to fit the TfidfVectorizer so that the models can then generalize efficiently.


Cross-validation scores are much lower on smaller samples.

**Bonus: T-SNE visualisation .**

In [None]:
def convert_y_color(test):
    """Map each label to a plotting colour and return a ``pd.Series``.

    "True" becomes "#a11d31"; every other value becomes "#4277b2". The
    result has a fresh default index, in the iteration order of ``test``.
    """
    return pd.Series(["#a11d31" if label == "True" else "#4277b2" for label in test])
Y=convert_y_color(y_train)

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2,learning_rate=100)
x_new=tsne.fit_transform(Xtrain)

plt.scatter(x_new[:,0],x_new[:,1],c=Y)
plt.show()

In [None]:
tsne = TSNE(n_components=3,learning_rate=100)
x_new=tsne.fit_transform(Xtrain)


In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(1, figsize=(8, 6))
# Create the 3D axes through the figure: on recent matplotlib versions
# Axes3D(fig, ...) is no longer attached to the figure automatically, which
# leaves the plot empty.
ax = fig.add_subplot(projection="3d")
ax.view_init(elev=-150, azim=110)
# Title set on the 3D axes itself; calling plt.title() before the 3D axes
# exists would create an extra 2D axes underneath.
ax.set_title("3D Visualisation with t-SNE")
ax.scatter(x_new[:,0],x_new[:,1],x_new[:,2], c=Y)