In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# News topic classification using TF-IDF features and 15 machine learning classifiers

In [None]:
# Load True.csv from the fake-and-real-news dataset into a DataFrame.
real_news = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
r_news = real_news.copy(deep=True)

### Explore the dataset

In [None]:
# Show the first rows of the working frame.
r_news.head()

In [None]:
# Show the last rows of the working frame.
r_news.tail()

In [None]:
# Display the working frame itself.
# NOTE(review): head() and tail() are already shown in the cells above, so this
# cell looks redundant — consider replacing it with r_news.shape.
r_news

In [None]:
# Summary statistics for the columns of the working frame.
r_news.describe()

In [None]:
# Column names, dtypes and non-null counts.
r_news.info()

In [None]:
# Number of missing values in each column.
r_news.isna().sum()

### Clean HTML tags

In [None]:
def CleanHTMLText(Text):
    """Strip a small set of HTML remnants from a pandas Series of strings.

    Removed, in this order: ``<br/>`` tags, ``<a ...>...</a>`` anchors
    (including their link text), the entities ``&amp;``, ``&gt;`` and
    ``&lt;`` (with or without the trailing semicolon), and non-breaking
    spaces (``\\xa0``).

    Parameters
    ----------
    Text : pandas.Series
        Series of strings to clean (accessed through ``.str``).

    Returns
    -------
    pandas.Series
        A new Series with the patterns above replaced by the empty string.
    """
    patterns = (
        r'<br/>',
        # Non-greedy and limited to a single tag, so the text between two
        # separate links on the same line is preserved.
        r'<a\b[^>]*>.*?</a>',
        # The semicolon is optional so that "&amp;" does not leave a stray ";".
        r'&amp;?',
        r'&gt;?',
        r'&lt;?',
        '\xa0',
    )
    for pattern in patterns:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently disable the cleaning.
        Text = Text.str.replace(pattern, '', regex=True)
    return Text

In [None]:
# Run the HTML cleaner over both free-text columns; DataFrame.apply hands each
# column to CleanHTMLText as a Series.
r_news[['title', 'text']] = r_news[['title', 'text']].apply(CleanHTMLText)

### Drop empty Rows

In [None]:
# Positional indices of the rows whose text is empty or whitespace-only.
emp_index = [
    position
    for position, body in enumerate(r_news['text'].to_numpy())
    if not str(body).strip()
]
print(f"{len(emp_index)}  Rows in real news with empty text")

In [None]:
# Inspect the rows flagged as empty (emp_index holds positions, hence iloc).
r_news.iloc[emp_index]

In [None]:
# emp_index holds row *positions* (it was built with enumerate), while
# DataFrame.drop expects index *labels*. Translate positions to labels first so
# the right rows are removed even when the index is not a fresh 0..n-1 range.
r_news = r_news.drop(r_news.index[emp_index], axis=0)
del emp_index

### Drop duplicates rows

In [None]:
# Keep only the first occurrence of each (title, text) pair.
r_news = r_news.drop_duplicates(subset=['title', 'text'], keep='first')

### Data visualization

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Pie chart of the share of articles per subject.
# Wedge sizes and wedge labels are both taken from the same value_counts()
# result: value_counts() is ordered by frequency while unique() is ordered by
# first appearance, so mixing the two can attach a label to the wrong wedge.
subject_counts = r_news.subject.value_counts()
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.pie(subject_counts, labels=subject_counts.index, autopct='%1.2f%%')
plt.show()

In [None]:
# Print the per-subject counts explicitly: a bare expression is only displayed
# when it is the last statement of the cell, so the counts were never shown.
print(r_news['subject'].value_counts())

plt.figure(figsize=(5, 10))
sns.set_style("darkgrid")
# Name the column through x=/data= keywords; a Series passed positionally is
# interpreted as `data` by seaborn >= 0.12 and no longer plots one bar per subject.
sns.countplot(x='subject', data=r_news)
plt.show()

#### We note that the data are balanced: the two subjects contain a similar number of articles, which is important because it reduces the risk of bias toward one class.

### wordcloud 

In [None]:
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Build a word cloud from all article bodies joined into one string, ignoring
# the English stop words shipped with nltk.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords.words('english'),
    min_font_size=10,
).generate(" ".join(r_news.text))

# Render the cloud as an image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

#### In the word cloud we notice a standalone uppercase letter U. It comes from the word "U.S.": the letter S is dropped because "s" is among the stop words in the nltk library. This will not affect the classification results.

### Create a new column to be used in the time series and in the classification process

In [None]:
# Encode the subject as a numeric label: politicsNews -> 0, worldnews -> 1.
r_news['label'] = r_news['subject'].map(dict(politicsNews=0, worldnews=1))

In [None]:
# Show the numeric label column created from the subject mapping.
r_news['label']

In [None]:
# Parse the publication date; values that cannot be parsed become NaT.
r_news['date'] = pd.to_datetime(r_news['date'], errors="coerce")

# Number of articles per (date, label) pair; after count() the "subject"
# column holds that number.
news_grouped = (
    r_news[["date", "subject", "label"]]
    .groupby(["date", "label"])
    .count()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(16, 10))
sns.lineplot(x="date", y="subject", hue="label", data=news_grouped, palette="Set2", ax=ax)
# The labels here are the two news subjects (not fake vs. real), so the title
# states what is actually plotted.
plt.title("Daily Number of Articles by Label (0 = politicsNews, 1 = worldnews)")
plt.xlabel("Time")
plt.ylabel("Count")
plt.show()

#### We note that the worldnews articles were collected over the last 5 months, while the politicsNews articles were collected over a period of 24 months. However, we cannot delete these articles: there are many of them, and removing them would affect the balance of the data and the classification process.

#### 

### In the classification process we follow the 80/20 rule: 80% of the data is used to train the classifiers and 20% is used to test them.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the titles (and their labels) for testing; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    r_news['title'],
    r_news['label'],
    test_size=0.2,
    random_state=1,
)

#### We will compare 6 performance measures: accuracy, confusion matrix, precision, recall, F1 score, and the time required to run each classifier.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics                                                 
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
import datetime

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pip_ln = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])
model = pip_ln.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('MultiNomial Naive Bayes Classifier')
plt.show()
b - a

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',   RandomForestClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('RandomForest Classifier')
plt.show()
b - a

In [None]:
from sklearn.svm import LinearSVC

# Linear support vector classifier on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pip_ln = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf', LinearSVC(random_state=1))
])
model = pip_ln.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('LinearSVC Classifier')
plt.show()
b - a

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',   LogisticRegression())
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('LogisticRegression Classifier')
plt.show()
b - a

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pip_ln = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',  DecisionTreeClassifier(random_state=1))
])
model = pip_ln.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('DecisionTree Classifier')
plt.show()
b - a

In [None]:
# NOTE(review): NeighborhoodComponentsAnalysis and load_iris are not used in
# this cell, and train_test_split / Pipeline are imported in earlier cells;
# the imports are kept as-is in case other cells rely on them.
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# k-nearest-neighbours classifier on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pip_ln = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',  KNeighborsClassifier())
])
model = pip_ln.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('KNeighbors Classifier')
plt.show()
b - a

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Passive-aggressive classifier on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',  PassiveAggressiveClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('PassiveAggressive Classifier')
plt.show()
b - a

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent on TF-IDF
# features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',   SGDClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('SGD Classifier')
plt.show()
b - a

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',   BernoulliNB())
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('BernoulliNB Classifier')
plt.show()
b - a

In [None]:
from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',   ComplementNB())
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('ComplementNB Classifier')
plt.show()
b - a

In [None]:
from sklearn.neighbors import NearestCentroid

# Nearest-centroid classifier on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',   NearestCentroid())
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('NearestCentroid Classifier')
plt.show()
b - a

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# AdaBoost ensemble on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',   AdaBoostClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('AdaBoost Classifier')
plt.show()
b - a

In [None]:
 from sklearn.ensemble import GradientBoostingClassifier
a=datetime.datetime.now()
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',   GradientBoostingClassifier())
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))
cm = metrics.confusion_matrix(y_test, prediction, labels=[0,1])
fig, ax = plot_confusion_matrix(conf_mat=confusion_matrix(y_test, prediction),
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('GradientBoosting Classifier')
plt.show()
b=datetime.datetime.now()
b - a

In [None]:
from sklearn.tree import ExtraTreeClassifier

# Single extremely-randomized tree on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',   ExtraTreeClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('Extra Tree Classifier')
plt.show()
b - a

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble on TF-IDF features of the titles.
a = datetime.datetime.now()  # start of the timed section (fit + predict)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so the reported scores are reproducible across runs.
    ('clf',   BaggingClassifier(random_state=1))
])
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
b = datetime.datetime.now()  # stop before scoring/plotting so rendering time is not counted

score = metrics.accuracy_score(y_test, prediction)
score_per = metrics.precision_score(y_test, prediction)
score_re = metrics.recall_score(y_test, prediction)
score_F1 = metrics.f1_score(y_test, prediction)
print("accuracy:    %0.3f" % (score*100))
print("Precision:   %0.3f" % (score_per*100))
print("Recall:      %0.3f" % (score_re*100))
print("F1 Score:    %0.3f" % (score_F1*100))

# Compute the confusion matrix once and reuse it for the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=[0, 1])
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
plt.suptitle('Bagging Classifier')
plt.show()
b - a

##### THE END...