In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two source tables: the real articles (True.csv) and the fake ones (Fake.csv)
true = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

In [None]:
# Fake news
fake.head()

In [None]:
# Real/True news
true.head()

In [None]:
# Tag every row with the table it came from. Once both tables are combined,
# the 'Real_or_Fake' column is what tells a real article from a fake one.
true['Real_or_Fake'] = 'Real'
fake['Real_or_Fake'] = 'Fake'

**Combining fake and true table into one**

In [None]:
# Stack the real articles on top of the fake ones; ignore_index=True renumbers
# the rows so the combined table has one continuous index.
news = pd.concat(objs=[true, fake], ignore_index=True)

# Preview the first 5 rows of the combined table
news.head()

In [None]:
# How many articles carry each label?
print(news['Real_or_Fake'].value_counts())

# The same counts drawn as a bar chart
sns.countplot(data=news, x='Real_or_Fake')

In [None]:
# Let's check if there is any null value in text column.
news['text'].isnull().sum()

**There are no null values, but there might be empty strings; we will deal with them later.**

# Data Cleaning

**URL**

In [None]:
# Let's read one news
news['text'].iloc[33390]

**A news text may contain a URL. A URL carries no information as text (we would have to follow the link to learn more), so it is not useful here. Let's remove any URL present in the text of every news item.**

In [None]:
# Function to remove the url
def remove_url(text):
    """Drop every space-separated token of ``text`` that looks like a URL.

    A token is treated as a URL when it contains any of these markers:
    'http' (covers both 'http://' and 'https://'), 'www.', '.com' or 'bit.ly'.

    Parameters
    ----------
    text : str
        The news text to clean.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single
        spaces. No trailing space is appended, so an input that consists
        only of URLs yields an empty string.
    """
    url_markers = ('http', 'www.', '.com', 'bit.ly')

    # Split on a single space (not on all whitespace) so that the spacing
    # between the surviving tokens is reproduced exactly by the join below.
    kept = [
        word for word in text.split(' ')
        if not any(marker in word for marker in url_markers)
    ]
    return ' '.join(kept)

# Lets apply this on news text
news['text'] = news['text'].apply(remove_url)

**Empty String**

In [None]:
# Let's check if any news text is just an empty (or whitespace-only) string

# Vectorised check on the 'text' column: stripping whitespace leaves '' exactly
# when the string is empty or consists only of whitespace. Selecting the column
# by name means the check does not depend on the column order of `news`.
is_blank = news['text'].str.strip() == ''

# empty holds the index labels of the rows whose text is blank
empty = news.index[is_blank].tolist()

# number of rows with empty string as form of news text
print(f"There are total {len(empty)} rows with empty string as news text")


In [None]:
# Remove the rows whose index labels are listed in `empty`.
# errors='ignore' skips labels that are no longer present, so running this cell
# a second time does not raise a KeyError.
news.drop(empty, inplace=True, errors='ignore')

**HTML tags**

In [None]:
pip install beautifulsoup4

In [None]:
# Let's remove any HTML tags present in news text
# We can use BeautifulSoup to do it
from bs4 import BeautifulSoup

# function to remove the HTML tags
def remove_html(text):
    """Return ``text`` with HTML markup stripped, keeping only the text content.

    Parameters
    ----------
    text : str
        The news text, possibly containing HTML tags.

    Returns
    -------
    str
        The text extracted by BeautifulSoup's ``get_text()``.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about the guess), so results
    # could differ between environments. 'html.parser' ships with Python.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Let's apply the above function on news text
news['text'] = news['text'].apply(remove_html)

In [None]:
# Let's read some more news
news['text'][0][:50]

In [None]:
news['text'][6][:50]

In [None]:
# "WASHINGTON (Reuters)" or "SEATTLE/WASHINGTON (Reuters)" appears at the beginning of many news texts.
# Our model might learn that a text starting with these words must belong to one category, and might
# not try to learn from the text that follows.

# Let's go ahead and remove these words from the news text
# we will split the text on the basis of (Reuters) and ignore the first part

# function to perform the split
def split_news(text):
    """Strip the leading "<place> (Reuters)" dateline from a news text.

    The text is cut at every occurrence of '(Reuters)'. The piece before the
    first occurrence is discarded and the remaining pieces are joined with a
    single space. A text without the marker is returned unchanged.
    """
    marker = '(Reuters)'

    # Guard clause: nothing to strip
    if marker not in text:
        return text

    _dateline, *remainder = text.split(marker)
    return ' '.join(remainder)

# Applying the above function on the news text
news['text'] = news['text'].apply(split_news)

**Punctuation**

In [None]:
# Let's remove the punctuations from the news text
import string

punctuations = string.punctuation

# Lets add '\n','\n\n' and ' ' in punctuations
punctuations += '\n \n\n'

#function to remove the punctuations
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Lower-case ``text`` and remove all punctuation characters from it.

    Punctuation is removed character by character, so marks attached to a word
    ("hello," -> "hello") are stripped as well as stand-alone marks. The text
    is then split on any whitespace (spaces, tabs, newlines) and re-joined
    with single spaces, which also removes empty tokens.

    Parameters
    ----------
    text : str
        The news text to normalise.

    Returns
    -------
    str
        Lower-cased words separated by single spaces, with no characters from
        ``string.punctuation``.
    """
    without_punct = text.translate(_PUNCT_TABLE)
    return ' '.join(without_punct.lower().split())

# applying the above function in news text
news['text'] = news['text'].apply(remove_punct)

**Let's divide the data into X and y. X will be news text and y would be label- Fake or Real.**

In [None]:
# Features: the cleaned news text. Target: the 'Fake' / 'Real' label.
X = news['text']
y = news['Real_or_Fake']

# Encode the label as one indicator column. drop_first=True drops the column
# of the first category and keeps a single column for the other one.
# dtype=int pins the values to the integers 0/1; pandas' default dtype for
# dummy columns is not the same in every version (boolean in pandas 2.x).
encoded_y = pd.get_dummies(y, drop_first=True, dtype=int)

**1 in encoded_y means Real news and 0 means Fake news**

In [None]:
# Libraries to split the data into train and test sets, create the models and evaluate the metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Hold out 25% of the rows as test data; the fixed random_state makes the split repeatable.
# encoded_y is flattened to a 1-D array of labels before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y.values.reshape(-1),
    random_state=42,
    test_size=0.25,
)

**Naive_bayes model**

In [None]:
# Naive Bayes pipeline: TF-IDF features (English stop words removed) fed to MultinomialNB
pipeline_naive = Pipeline(steps=[
    ('vector', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

# Fit the vectorizer and the classifier on the training split
pipeline_naive.fit(X_train, y_train)

In [None]:
# prediction
predict_naive = pipeline_naive.predict(X_test)

In [None]:
# Evaluating the naive model
acc_naive = accuracy_score(y_test, predict_naive)

# Format with the '%' format spec instead of acc_naive.round(2)*100: it works
# for a NumPy scalar as well as a plain Python float (which has no .round
# method) and avoids floating-point artefacts in the multiplied value.
print(f'Naive model has {acc_naive:.0%} accuracy')
print('\n')
print(classification_report(y_test, predict_naive))

**Logistic Regression Model**

In [None]:
# Logistic regression pipeline: TF-IDF features (English stop words removed) fed to LogisticRegression
pipeline_logistic = Pipeline(steps=[
    ('vector', TfidfVectorizer(stop_words='english')),
    ('classifier', LogisticRegression()),
])

# Fit the vectorizer and the classifier on the training split
pipeline_logistic.fit(X_train, y_train)

In [None]:
# prediction
predict_logistic = pipeline_logistic.predict(X_test)

In [None]:
# Evaluating the logistic model
acc_logistic = accuracy_score(y_test, predict_logistic)

# Format with the '%' format spec instead of acc_logistic.round(2)*100: it works
# for a NumPy scalar as well as a plain Python float (which has no .round
# method) and avoids floating-point artefacts in the multiplied value.
print(f'LogisticRegression model has {acc_logistic:.0%} accuracy')
print('\n')
print(classification_report(y_test, predict_logistic))

**LinearSVC model**

In [None]:
# Linear SVM pipeline: TF-IDF features (English stop words removed) fed to LinearSVC
pipeline_svc = Pipeline(steps=[
    ('vector', TfidfVectorizer(stop_words='english')),
    ('classifier', LinearSVC()),
])

# Fit the vectorizer and the classifier on the training split
pipeline_svc.fit(X_train, y_train)

In [None]:
# prediction
predict_svc = pipeline_svc.predict(X_test)

In [None]:
# Evaluating the LinearSVC model
acc_svc = accuracy_score(y_test, predict_svc)

# Format with the '%' format spec instead of acc_svc.round(2)*100: it works
# for a NumPy scalar as well as a plain Python float (which has no .round
# method) and avoids floating-point artefacts in the multiplied value.
print(f'LinearSVC model has {acc_svc:.0%} accuracy')
print('\n')
print(classification_report(y_test, predict_svc))

In [None]:
# Compare the test accuracy of the three models in a single chart
models = dict(Naive=acc_naive, Logistic=acc_logistic, SVC=acc_svc)

sns.set_style('darkgrid')

# One point per model: model name on the x axis, accuracy on the y axis
plt.plot(
    list(models),
    list(models.values()),
    marker='*',
    color='blue',
    markeredgecolor='red',
    markeredgewidth=4,
)
plt.xlabel('Models')
plt.ylabel('Accuracy')

# LinearSVC model performed better as compared to others with 99% accuracy