In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, fname) for fname in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

In [None]:
# Load both halves of the dataset: real articles first, then fake ones.
true_d, fake_d = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/True.csv',
        '../input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# Preview the first 10 rows of the frame loaded from True.csv.
true_d.head(10)

In [None]:
# Preview the first 10 rows of the frame loaded from Fake.csv.
fake_d.head(10)

In [None]:
# Label each source before merging: 0 marks fake articles, 1 marks real ones.
for frame, label in ((fake_d, 0), (true_d, 1)):
    frame['target'] = label

In [None]:
# Stack the fake rows on top of the real rows under a fresh 0..n-1 index.
df = pd.concat(
    objs=[fake_d, true_d],
    ignore_index=True,
    sort=False,
)

In [None]:
# Report (rows, columns) of the combined frame.
print(df.shape)

In [None]:
# Horizontal bar chart: number of fake articles (target == 0) per subject.
fig, ax = plt.subplots(figsize=(15, 7))
fake_subjects = df.loc[df['target'] == 0, 'subject']
sns.countplot(y=fake_subjects, label='Fake', ax=ax)
ax.set_title('Distribution')
ax.legend()

In [None]:
# Work on an independent copy for the text-modelling steps. A plain `df1 = df`
# only creates a second name for the same object, so every later in-place edit
# of df1['title'] would silently rewrite df as well.
df1 = df.copy()

In [None]:
# Append the subject to each title, separated by a single space.
# NOTE(review): this cell is not idempotent - re-running it appends the subject again.
# NOTE(review): if subject values differ between Fake.csv and True.csv, putting them
# into the text feature leaks the label to the classifiers - confirm.
title_with_subject = df1['title'] + ' ' + df1['subject']
df1['title'] = title_with_subject

In [None]:
# Join every fake-article title into one string and render it as a word cloud.
fake_titles = df1.loc[df1['target'] == 0, 'title']
fake_words = ' '.join(fake_titles)
spam_wc = WordCloud(width=1000, height=500).generate(fake_words)

plt.figure(figsize=(10, 10))
plt.imshow(spam_wc)
plt.axis('off')
plt.show()

In [None]:
# Summarize column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Inspect the date column as loaded, before any parsing.
df['date']

In [None]:
import datetime as dt

In [None]:
# Remove every space from the date strings so they line up with the
# '%B%d,%Y' pattern used when the column is parsed.
date_no_spaces = df['date'].str.replace(' ', '')
df['date'] = date_no_spaces

In [None]:
# Parse the whole column in one vectorized call instead of row by row.
# The previous loop assigned through `df['date'].iloc[i] = ...` (chained
# indexing): one Python-level parse per row, a SettingWithCopyWarning, and no
# effect at all when pandas copy-on-write is enabled.
# errors='coerce' turns every value that does not match '%B%d,%Y' into NaT.
df['date'] = pd.to_datetime(df['date'], format='%B%d,%Y', errors='coerce')

In [None]:
# Cast the date column to the datetime64[ns] dtype.
date_as_datetime = df['date'].astype('datetime64[ns]')
df['date'] = date_as_datetime

In [None]:
# Derive calendar periods from the date column: `year` stays a yearly Period,
# `month` is a monthly Period rendered as text (missing dates become 'NaT').
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.to_period('Y')
df['month'] = parsed_dates.dt.to_period('M').astype(str)

In [None]:
# Count FAKE articles per month.
# `target` is 0 for fake and 1 for real, so the previous sum of `target`
# counted real articles even though the chart built from df2 is labelled
# "fake news". Flag the fake rows first, then sum the flags per month.
# Every month key (including the 'NaT' bucket) is kept, and the result is
# still a Series named 'target' indexed by 'month'.
is_fake = (df['target'] == 0).astype(int)
df2 = is_fake.groupby(df['month']).sum()

In [None]:
# Inspect the per-month aggregate before its index is cleaned.
df2

In [None]:
# Remove the bucket of rows whose date could not be parsed (month == 'NaT').
# errors='ignore' keeps the cell re-runnable and avoids a KeyError when the
# bucket is absent (already dropped, or every date parsed successfully).
df2 = df2.drop('NaT', errors='ignore')

In [None]:
# Line chart of the per-month values held in df2.
# NOTE(review): the labels say "fake news" - make sure df2 really counts
# target == 0 rows rather than summing `target`.
months, counts = df2.index, df2.values
plt.plot(months, counts, linewidth=2)
plt.xticks(rotation=90)
plt.suptitle('Dynamics of fake news')
plt.ylabel('Number of fake news')
plt.xlabel('Month-Year')

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Turn each title into a list of stemmed tokens with stop words removed.
# Stop words are filtered BEFORE stemming and case-insensitively: the Porter
# stemmer rewrites many of them (e.g. "this" -> "thi", "was" -> "wa"), so the
# previous stem-then-filter order let those through.
# The three separate passes over the column are merged into one.
stemmer = PorterStemmer()
stopword = stopwords.words('english')
stopword_set = set(stopword)  # set membership is O(1); the list stays available as `stopword`
df1['title'] = df1['title'].apply(
    lambda x: [
        stemmer.stem(y)
        for y in word_tokenize(str(x))
        if y.lower() not in stopword_set
    ]
)


In [None]:
# Collapse each token list back into one space-separated string, the form
# passed to the TF-IDF vectorizer.
df1['title'] = df1['title'].apply(' '.join)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build the TF-IDF matrix from the processed titles.
# NOTE(review): the vectorizer is fit on all rows before the train/test split,
# so vocabulary and IDF weights also reflect the held-out rows.
tfid = TfidfVectorizer()
processed_titles = df1['title']
X = tfid.fit_transform(processed_titles)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; random_state=0 makes the split repeatable.
labels = df1['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    train_size=0.7,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fit a multinomial naive Bayes classifier with default settings on the training split.
mult = MultinomialNB()
mult.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted naive Bayes model.
y_pred = mult.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
# Share of held-out rows the naive Bayes model labelled correctly.
nb_accuracy = accuracy_score(y_test, y_pred)
print(nb_accuracy)

In [None]:
# Confusion matrix of the naive Bayes predictions (rows: actual class,
# columns: predicted class), drawn as an annotated heatmap.
conf = confusion_matrix(y_test, y_pred)
# fmt='d' prints whole counts; the default '.2g' abbreviates large cells
# to scientific notation such as 6.9e+03.
ax = sns.heatmap(conf, annot=True, fmt='d')
ax.set(xlabel='predict', ylabel='True')

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier trained on the same training split.
# eval_metric was 'rmse', a regression metric; 'logloss' is the metric that
# matches a binary classification objective.
# use_label_encoder=False is kept from the original call.
# NOTE(review): newer xgboost releases deprecate/ignore this flag - confirm
# against the installed version.
xg = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xg.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted XGBoost model.
y_pred1 = xg.predict(X_test)

In [None]:
# Share of held-out rows the XGBoost model labelled correctly.
xgb_accuracy = accuracy_score(y_test, y_pred1)
print(xgb_accuracy)

In [None]:
# Confusion matrix of the XGBoost predictions (rows: actual class,
# columns: predicted class), drawn as an annotated heatmap.
conf = confusion_matrix(y_test, y_pred1)
# fmt='d' prints whole counts; the default '.2g' abbreviates large cells
# to scientific notation such as 6.9e+03.
ax = sns.heatmap(conf, annot=True, fmt='d')
ax.set(xlabel='predict', ylabel='True')