In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import required libraries
import nltk
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re

# Later cells call nltk.word_tokenize (Punkt models) and nltk.WordNetLemmatizer
# (WordNet corpus), so fetch those alongside the stopword list instead of relying
# on the image having them pre-installed. "punkt_tab" is the name newer NLTK
# releases look up; on releases that do not know it the call just reports an error.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)
from nltk.corpus import stopwords

In [None]:
# Load the two source CSVs from the Kaggle "fake-and-real-news-dataset" input:
# True.csv -> real_news, Fake.csv -> fake_news.
real_news = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv") 
fake_news = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")

In [None]:
# Inspect the first rows of the real-news frame before any column is added.
real_news.head()

In [None]:
# Inspect the first rows of the fake-news frame before any column is added.
fake_news.head()

In [None]:
# Attach the class label to every row: 1 marks real news, 0 marks fake news.
real_news['target'] = 1
fake_news['target'] = 0

In [None]:
# Confirm the new `target` column is present on the real-news frame.
real_news.head()

In [None]:
# Confirm the new `target` column is present on the fake-news frame.
fake_news.head()

In [None]:
# Stack both labelled frames into one dataset (real rows first, then fake rows).
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# keeping the per-file row labels.
data = pd.concat([real_news, fake_news], ignore_index=True)
data.head()

In [None]:
# Fold subject and title into the text column so each article is a single string.
# NOTE(review): this overwrites data['text'] in place, so the cell is not
# idempotent — re-running it prepends subject/title again (or fails once those
# columns have been deleted). Run it once per freshly built `data`.
# NOTE(review): if `subject` values differ systematically between True.csv and
# Fake.csv, including it here leaks the label into the features — verify against
# the dataset.
data['text'] = data['subject'] + " "+ data['title'] + " "+ data['text']
data.head()

In [None]:
# Keep only `text` and `target`. drop(..., errors='ignore') makes the cell safe to
# re-run: `del data[...]` raises KeyError once the columns are already gone.
data = data.drop(columns=['title', 'subject', 'date'], errors='ignore')

**DATA PREPROCESSING**

In [None]:
# Check the frame after the title/subject/date columns were removed.
data.head()

In [None]:
# Class balance check. Name the column via `x=`: current seaborn accepts only
# `data` positionally, so a bare Series is not guaranteed to be read as the x
# variable.
sns.countplot(x='target', data=data);

In [None]:
# Try the cleaning steps on one article (row label 1) before applying them to the
# whole column.
txt = data['text'][1]

t = re.sub('[^a-zA-Z0-9]+', ' ', txt)
t = t.lower()
t = nltk.word_tokenize(t)

# Build the stopword set and the lemmatizer once rather than once per token.
stop_words = set(stopwords.words("english"))
lemma = nltk.WordNetLemmatizer()
ft = [lemma.lemmatize(word) for word in t if word not in stop_words]

# Join once, after filtering, so `a` is defined even when every token is dropped.
a = " ".join(ft)
print(a)

In [None]:
# Removal of square-bracketed spans such as "[VIDEO]". Despite the function name,
# ordinary punctuation is left untouched here.
def remove_punctuations(text):
    """Return ``text`` with every ``[...]`` span deleted.

    A span starts at ``[`` and ends at the next ``]``; the brackets and everything
    between them are removed. Text without brackets is returned unchanged.
    """
    # Raw string: '\[' in a normal literal is an invalid escape sequence that newer
    # Python versions warn about. The compiled pattern is the same.
    return re.sub(r'\[[^]]*\]', '', text)

# Removal of special characters
def remove_characters(text):
    """Replace every character that is not an ASCII letter with a single space.

    The substitution is one-for-one, so the result has the same length as ``text``.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

# Removal of stopwords, followed by lemmatization
def remove_stopwords_and_lemmatization(text):
    """Lower-case and tokenize ``text``, drop English stopwords, lemmatize the rest.

    Returns the surviving lemmas joined by single spaces (an empty string when no
    token survives).
    """
    # The stopword set and the lemmatizer do not change between tokens, so build
    # them once per call instead of once per word.
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()

    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Full cleaning pipeline
def cleaning(text):
    """Run the cleaning stages on ``text`` in order and return the result.

    Order: remove_punctuations, then remove_characters, then
    remove_stopwords_and_lemmatization.
    """
    stages = (
        remove_punctuations,
        remove_characters,
        remove_stopwords_and_lemmatization,
    )
    for stage in stages:
        text = stage(text)
    return text


In [None]:
def get_clean_data(text):
    """Alternative cleaner that keeps digits as well as letters.

    Collapses every run of characters outside ``A-Za-z0-9`` into one space,
    lower-cases, tokenizes, drops English stopwords and lemmatizes what is left.
    Returns the lemmas joined by single spaces.
    """
    # Load the stopwords once into a set: the original re-read the list and did a
    # linear membership scan for every token.
    stop_words = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    normalized = re.sub('[^A-Za-z0-9]+', ' ', text).lower()
    tokens = nltk.word_tokenize(normalized)
    return " ".join(lemma.lemmatize(word) for word in tokens if word not in stop_words)

In [None]:
# Clean every article with `cleaning`, replacing the raw text in place.
# NOTE(review): because the raw text is overwritten, this cell is meant to run
# once per freshly built `data`.
data['text'] = data['text'].apply(cleaning)

In [None]:
# Inspect a few rows of the cleaned text.
data.head()

In [None]:
# split() with no argument returns [] for an empty string, so an article whose
# cleaned text is "" contributes 0 words; split(" ") would count it as one.
no_of_words = data['text'].apply(lambda x: len(x.split())).sum()
print(f"Total number of words in the given dataset: {no_of_words}")

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Word cloud of the real-news articles (target == 1).
plt.figure(figsize=(16, 10))
real_text = " ".join(data.loc[data['target'] == 1, 'text'])
wc = WordCloud(max_words=500, width=1000, height=500, stopwords=STOPWORDS).generate(real_text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
# Title the figure so it can be told apart from the fake-news cloud.
plt.title("Most frequent words in real news (target = 1)");

In [None]:
# Word cloud of the fake-news articles (target == 0).
plt.figure(figsize=(16, 10))
fake_text = " ".join(data.loc[data['target'] == 0, 'text'])
wc = WordCloud(max_words=500, width=1000, height=500, stopwords=STOPWORDS).generate(fake_text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
# Title the figure so it can be told apart from the real-news cloud.
plt.title("Most frequent words in fake news (target = 0)");

**MODEL CREATION**

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; labels are the 0/1 target column.
x = data['text']
y = data['target']

# Hold out 20% of the rows for testing. random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

In [None]:
# Sanity check: features and labels must have the same length within each split.
print(f"Length of the train data: {len(X_train)}, Length of the train data label: {len(y_train)}")
print(f"Length of the test data: {len(X_test)}, Length of the test label size: {len(y_test)}")

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weights -> multinomial naive Bayes, fitted on the
# training split and scored on the held-out split.
mb_model = Pipeline([('vect', CountVectorizer()),
                    ('trans', TfidfTransformer()),
                    ('mb', MultinomialNB())])

mb_model.fit(X_train, y_train)

mb_pred = mb_model.predict(X_test)

print(f"Accuracy score of MultinomialNB: {accuracy_score(y_test, mb_pred)}")
# The report is a multi-line table; start it on its own line so the header row
# stays aligned with the columns beneath it.
print(f"Classification report of MultinomialNB:\n{classification_report(y_test, mb_pred)}")

In [None]:
# Token counts -> TF-IDF weights -> linear model trained with SGD.
# SGDClassifier shuffles the training data, so pin random_state (same seed as the
# train/test split) to get the same scores on every run.
sgd_model = Pipeline([('vect', CountVectorizer()),
                     ('trans', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=1))])

sgd_model.fit(X_train, y_train)

sgd_pred = sgd_model.predict(X_test)

print(f"Accuracy score of SGDclassifier model: {accuracy_score(y_test, sgd_pred)}")
# The report is a multi-line table; start it on its own line so the header row
# stays aligned with the columns beneath it.
print(f"Classification report of SGDclassifier model:\n{classification_report(y_test, sgd_pred)}")