# Inspiration:
Can you use this dataset to build an algorithm that determines whether an article is fake news or not?

[Dataset](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset)

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the True.csv and Fake.csv files from the relative ../input directory,
# in that order, into the `true` and `fake` DataFrames.
true, fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/True.csv',
        '../input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None" line.
true.info()
print('='*50)  # visual separator between the two summaries
fake.info()

In [None]:
# Tag each source frame with its class label: 0 for rows from `fake`, 1 for rows from `true`.
fake['target'] = 0
true['target'] = 1

# Stack the `true` rows first, then the `fake` rows, renumbering the index from 0.
df = pd.concat(objs=[true, fake], axis=0, ignore_index=True)
df.info()

In [None]:
# Merge subject, title and body into a single text field.
# A space is inserted between the parts so that the last word of one field is not
# fused with the first word of the next (which would create bogus tokens for the vectorizer).
df['combined'] = df['subject'] + ' ' + df['title'] + ' ' + df['text']

In [None]:
# Lowercase the combined subject/title/text column built in the previous cell.
# Deriving this column from df.text alone would silently discard the subject and title
# that were just concatenated into `combined`.
# NOTE(review): if `subject` values differ systematically between True.csv and Fake.csv,
# including it may leak the label into the features — confirm before trusting the scores.
df['combined'] = df['combined'].str.lower()
display(df.head())

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on the label and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['combined'],
    df['target'],
    test_size=0.25,
    random_state=123,
    stratify=df['target'],
)

In [None]:
# Print the shape of each split, each followed by two blank lines for spacing.
for split in (X_train, X_test, y_train, y_test):
    print(f'{split.shape}\n\n')

In [None]:
# Show the per-class row counts of the training and test labels, one item per line.
print(
    'y_train distribution:',
    y_train.value_counts(),
    'y_test distribution:',
    y_test.value_counts(),
    sep='\n',
)

# Using Logistic Regression

In [None]:
# Bag-of-words counts (English stop words removed) fed into a default LogisticRegression.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('model', LogisticRegression()),
])
# fit() returns the fitted pipeline, so training and test-set prediction can be chained.
pred = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set accuracy as a percentage.
print('accuracy: {:.2f}%'.format(accuracy_score(y_test,pred)*100))
cm = confusion_matrix(y_test,pred)
# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on columns;
# label the axes and title the figure so the heatmap can be read on its own.
ax = sns.heatmap(cm, cmap = 'Blues', annot= True, fmt = 'd', xticklabels = ['fake','real'], yticklabels = ['fake','real'])
ax.set(xlabel='Predicted label', ylabel='True label', title='Logistic Regression confusion matrix')
plt.show()

In [None]:
# Classification report for the test-set predictions, with the classes named 'fake' and 'real'.
print(
    classification_report(
        y_true=y_test,
        y_pred=pred,
        target_names=['fake', 'real'],
    )
)

In [None]:
# Raw (unrounded) test-set accuracy as a fraction.
print(accuracy_score(y_true=y_test, y_pred=pred))

# Using Naive Bayes

In [None]:
# Same bag-of-words features as before, this time feeding a default MultinomialNB.
# Note: this rebinds `pipeline` and `pred`, replacing the values from the previous model.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('model', MultinomialNB()),
])
# fit() returns the fitted pipeline, so training and test-set prediction can be chained.
pred = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set accuracy as a percentage.
print('accuracy: {:.2f}%'.format(accuracy_score(y_test,pred)*100))
cm = confusion_matrix(y_test,pred)
# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on columns;
# label the axes and title the figure so the heatmap can be read on its own.
ax = sns.heatmap(cm, cmap = 'Blues', annot= True, fmt = 'd', xticklabels = ['fake','real'], yticklabels = ['fake','real'])
ax.set(xlabel='Predicted label', ylabel='True label', title='Multinomial Naive Bayes confusion matrix')
plt.show()

In [None]:
# Classification report for the test-set predictions, with the classes named 'fake' and 'real'.
print(
    classification_report(
        y_true=y_test,
        y_pred=pred,
        target_names=['fake', 'real'],
    )
)

In [None]:
# Raw (unrounded) test-set accuracy as a fraction.
print(accuracy_score(y_true=y_test, y_pred=pred))

### We achieved more than 99.6% accuracy on the held-out test set using Logistic Regression and around 95% using Multinomial Naive Bayes.

## Short, crisp, and to the point. Hope you like it.