---

<h1 style="text-align: center;font-size: 30px; color: #013b86;">Fake or Real. Two sides of the same coin?</h1>

---

<center><img style="width: 700px;" src="https://images.ctfassets.net/yqezig6gzu6c/5ur280lovm0DKoHmlEGe1P/9779c9555ebed3dbd7e3445d0a666843/https___cdn2.hubspot.net_hubfs_656775_Fact_20Fake_201200_20x_20627px_2x-100_20copy.jpg?w=900&q=100"></center>

---
<i>Source: Image from Google</i>

<h4>Please don't be surprised by this title. It applies only to this notebook, which is divided into two parts.</h4>
<h4>In the first part we build a classifier on this dataset and get almost 100% accuracy. In the second part we examine the dataset itself to see whether it is biased.</h4>
<h4>With almost 100% accuracy you might say the title is true. After looking closely at the dataset, however, your view may or may not change. If you see nothing wrong with the dataset, you can say the title is fine; if you spot biased material in it, you may change your mind. So this is a question for you: what do you think?</h4>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
import string

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the two source files into separate frames: True.csv -> real_data,
# Fake.csv -> fake_data. The label column is added in a later cell.
real_data = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_data = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

<h2 style="font-size: 30px;color: #ae2e28;">Part - 1</h2>

## Basic EDA

In [None]:
# Preview the first rows loaded from True.csv.
real_data.head()

In [None]:
# Preview the first rows loaded from Fake.csv.
fake_data.head()

In [None]:
# Column names, dtypes and non-null counts for the True.csv frame.
real_data.info()

In [None]:
# Column names, dtypes and non-null counts for the Fake.csv frame.
fake_data.info()

In [None]:
# Attach the class label to each source frame: 1 for real news, 0 for fake.
real_data = real_data.assign(target=1)
fake_data = fake_data.assign(target=0)

In [None]:
# Check the last rows of fake_data now that the `target` column has been added.
fake_data.tail()

In [None]:
# Stack the real rows on top of the fake rows and renumber the index from 0.
frames_to_stack = [real_data, fake_data]
combine_data = pd.concat(frames_to_stack, sort=False, ignore_index=True)
combine_data.tail()

In [None]:
# Bar chart of how many rows of combine_data carry each `target` value.
plt.figure(figsize=(7, 7))
sns.set(style="darkgrid")

color = sns.color_palette("Set2")
ax = sns.countplot(x="target", data=combine_data, palette=color)

# Replace the numeric tick labels with names. This assumes the bars are drawn
# in the order 0, 1 (0 was assigned to fake_data, 1 to real_data) - TODO confirm.
ax.set(xticklabels=['fake', 'real'])

plt.title("Data distribution of fake and real data")

In [None]:
# Row counts per `subject`, with one bar colour per `target` value.
plt.figure(figsize=(15, 10))
sns.set(style="darkgrid")

color = sns.color_palette("Set2")
ax = sns.countplot(x="subject",  hue='target', data=combine_data, palette=color)

# ax.set(xticklabels=['fake', 'real'])

# NOTE(review): same title as the previous per-label chart, although this
# figure is broken down by subject.
plt.title("Data distribution of fake and real data")

In [None]:
# Number of missing values in each column of the combined frame.
combine_data.isnull().sum()

## Data Cleaning

In [None]:
import re

In [None]:
def clean_train_data(x):
    """Normalise one raw article text for the token-based steps that follow.

    Applied in order: lower-casing, removal of ``[...]`` spans, removal of
    punctuation, removal of every word that contains a digit, removal of
    anything starting with ``http``, and replacement of line breaks by a
    space.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed pieces are
        not collapsed.
    """
    text = x.lower()
    # All patterns are raw strings so that ``\[``, ``\w`` and ``\d`` reach the
    # regex engine without tripping Python's invalid-escape warning.
    text = re.sub(r'\[.*?\]', '', text)   # remove square-bracketed spans
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers
    text = re.sub(r'http\S+', '', text)   # remove URLs
    # A line break separates words, so it becomes a space; deleting it
    # outright would glue the last word of one line to the first of the next.
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:
# New frame whose `text` column holds the cleaned version of every article;
# combine_data itself is left untouched.
cleaned_text = combine_data['text'].map(clean_train_data)
clean_combine_data = combine_data.assign(text=cleaned_text)
clean_combine_data.head()

In [None]:
# Last rows of the cleaned frame.
clean_combine_data.tail()

In [None]:
# clean_combine_data[clean_combine_data['target'] == 0]['text'][21417]

In [None]:
# fake_data['text'][0]

## Stopword Removal

In [None]:
# English stop words from NLTK, held in a frozenset: the only use in this
# notebook is a membership test run once per token of every article, and a
# list would make each of those tests a linear scan.
eng_stopwords = frozenset(nltk.corpus.stopwords.words("english"))

In [None]:
def remove_eng_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenised with ``nltk.word_tokenize``. Tokens present in the
    module-level ``eng_stopwords`` collection are dropped, and the remaining
    tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in eng_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# New frame whose `text` column has the stop words stripped from the cleaned text.
text_without_stopwords = clean_combine_data['text'].map(remove_eng_stopwords)
stopword_combine_data = clean_combine_data.assign(text=text_without_stopwords)
stopword_combine_data.head()

## Find out common words

In [None]:
from itertools import chain
from collections import Counter

In [None]:
# Flatten every article's tokens into one list and count them.
list_words = stopword_combine_data['text'].str.split()
list_words_merge = list(chain.from_iterable(list_words))

d = Counter(list_words_merge)
# Take the 50 most frequent words straight from the Counter instead of
# building a one-row frame with one column per distinct word, transposing it
# and sorting the whole vocabulary. Column names match what the plotting cell
# reads ("index" for the word, "count" for its frequency).
top_common_words = pd.DataFrame(d.most_common(50), columns=['index', 'count'])
top_common_words.head()

In [None]:
# Bar chart of the rows in top_common_words: word on the x axis, count on the y axis.
plt.figure(figsize=(15,7))
sns.set(style="darkgrid")
sns.barplot(x="index", y='count', data=top_common_words)
# Rotate the word labels so they do not overlap.
plt.xticks(rotation=90)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Single shared lemmatizer instance, used by word_lemmatizer below.
lemm = WordNetLemmatizer()

In [None]:
def word_lemmatizer(text):
    """Return ``text`` with every token replaced by its lemma.

    Tokens come from ``nltk.word_tokenize``; each one is passed through the
    module-level ``lemm`` lemmatizer and the results are joined with single
    spaces.
    """
    lemmas = map(lemm.lemmatize, nltk.word_tokenize(text))
    return ' '.join(lemmas)


In [None]:
# New frame whose `text` column holds the lemmatized, stop-word-free text.
lemmatized_text = stopword_combine_data['text'].map(word_lemmatizer)
lemmatize_data = stopword_combine_data.assign(text=lemmatized_text)
lemmatize_data.head()

# N-Gram Analysis

---

<center><img style="width: 700px;" src="https://images.deepai.org/glossary-terms/867de904ba9b46869af29cead3194b6c/8ARA1.png"></center>

---
<i>Source: Image from Google</i>

In [None]:
# Concatenate every lemmatized article into one space-separated string.
# NOTE(review): this rebinds the name `string`, shadowing the `string` module
# imported in the first code cell.
string = ' '.join(lemmatize_data['text'])

In [None]:
# Token list for the n-gram counts. split() with no argument is used so that
# two adjacent spaces (left where an article's text is empty) do not produce
# empty-string tokens that would then be counted inside n-grams.
str_val = string.split()

## Unigram Analysis

In [None]:
# Frequency of every 1-gram in the corpus, keeping the 30 most common.
data_unigram = (
    pd.Series(nltk.ngrams(str_val, 1))
    .value_counts()
    .iloc[:30]
)

In [None]:
# Two-column frame for plotting: "key" is the n-gram tuple, "value" its count.
# The columns are built explicitly because reset_index() + rename({0: "value"})
# only works while the counts Series is unnamed; when value_counts() returns a
# named Series the rename matches nothing and the plot finds no "value" column.
data_unigram_df = pd.DataFrame({
    "key": list(data_unigram.index),
    "value": data_unigram.values,
})
data_unigram_df.head()

In [None]:
# Horizontal bars: one per unigram in data_unigram_df, bar length = its count.
plt.figure(figsize = (16,9))
sns.barplot(x='value',y='key', data=data_unigram_df)

## Bigram Analysis

In [None]:
# Frequency of every 2-gram in the corpus, keeping the 30 most common.
data_bigram = (
    pd.Series(nltk.ngrams(str_val, 2))
    .value_counts()
    .iloc[:30]
)

In [None]:
# Two-column frame for plotting: "key" is the n-gram tuple, "value" its count.
# The columns are built explicitly because reset_index() + rename({0: "value"})
# only works while the counts Series is unnamed; when value_counts() returns a
# named Series the rename matches nothing and the plot finds no "value" column.
data_bigram_df = pd.DataFrame({
    "key": list(data_bigram.index),
    "value": data_bigram.values,
})
data_bigram_df.head()

In [None]:
# Horizontal bars: one per bigram in data_bigram_df, bar length = its count.
plt.figure(figsize = (16,9))
sns.barplot(x='value',y='key', data=data_bigram_df)

## Trigram Analysis

In [None]:
# Frequency of every 3-gram in the corpus, keeping the 30 most common.
data_trigram = (
    pd.Series(nltk.ngrams(str_val, 3))
    .value_counts()
    .iloc[:30]
)

In [None]:
# Two-column frame for plotting: "key" is the n-gram tuple, "value" its count.
# The columns are built explicitly because reset_index() + rename({0: "value"})
# only works while the counts Series is unnamed; when value_counts() returns a
# named Series the rename matches nothing and the plot finds no "value" column.
data_trigram_df = pd.DataFrame({
    "key": list(data_trigram.index),
    "value": data_trigram.values,
})
data_trigram_df.head()

In [None]:
# Horizontal bars: one per trigram in data_trigram_df, bar length = its count.
plt.figure(figsize = (16,9))
sns.barplot(x='value',y='key', data=data_trigram_df)

#### Wow, "Donald Trump" appears an incredible number of times. What do you think, is this OK?

# Modeling

In [None]:
# The first model works on the stop-word-free text (before lemmatization).
model_data = stopword_combine_data.copy()

In [None]:
# Merge subject, title and body into one `combine_text` feature, then drop the
# source columns so only `target` and `combine_text` remain.
# NOTE(review): `subject` is part of the feature text here; the subject plot in
# Part 2 suggests it is strongly tied to the label - confirm before trusting
# this model's score.
source_columns = ['title', 'subject', 'date', 'text']
model_data = model_data.assign(
    combine_text=model_data['subject'] + " " + model_data['title'] + " " + model_data['text']
).drop(columns=source_columns)
model_data.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold-out split for the first model. No test_size is passed, so the library
# default proportion applies; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(model_data['combine_text'], model_data['target'], random_state=0)

# Bag of Words

---

<center><img style="width: 700px;" src="https://3.bp.blogspot.com/-4pxORQAgAFI/XMNZhEssXtI/AAAAAAAAGmA/SuQGsp-GyT4jKlUZieg_A5lnTza_GujfwCLcBGAs/s1600/bag_of_words.png"></center>

---
<i>Source: Image from Google</i>

## Vectorizing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the vocabulary from the training texts only and encode them as
# bag-of-words count vectors in the same step.
vec_train = CountVectorizer()
X_vec_train = vec_train.fit_transform(X_train)

In [None]:
# Encode the test texts with the vectorizer that was fitted on X_train.
X_vec_test = vec_train.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
# Logistic regression with default settings, fitted on the training count vectors.
model = LogisticRegression()
model.fit(X_vec_train, y_train)

In [None]:
# Class labels (not probabilities) predicted for the held-out texts.
predicted_value = model.predict(X_vec_test)

In [None]:
# The notebook stores and discusses this number as an accuracy, so compute
# plain accuracy (share of correctly classified test rows). roc_auc_score on
# hard class labels is a different quantity and was being reported under the
# wrong name. Imported here so the cell does not depend on another cell's
# import line.
from sklearn.metrics import accuracy_score

accuracy_value = accuracy_score(y_test, predicted_value)
print(accuracy_value)

### Look at this result: a score of 99.86 would suggest the title is almost true. But is it really? You may have spotted something wrong in how the model was set up. OK, let's adjust the modeling to make it more reliable.

# Modeling -2

In [None]:
# Second model: lemmatized text, and only title + body go into the feature
# (`subject` is dropped without being used). Only `target` and `combine_text`
# remain afterwards.
source_columns = ['title', 'subject', 'date', 'text']
model_2_data = lemmatize_data.assign(
    combine_text=lemmatize_data['title'] + " " + lemmatize_data['text']
).drop(columns=source_columns)
model_2_data.head()

In [None]:
# Hold-out split for the second model: 33% of rows go to the test set,
# random_state=0 fixes the shuffle. This overwrites the first model's split.
X_train, X_test, y_train, y_test = train_test_split(model_2_data['combine_text'], model_2_data['target'], test_size=0.33, random_state=0)

In [None]:
# Refit the bag-of-words vocabulary on the new training split, encode the
# training texts in the same step, then encode the test texts with it.
vec_train = CountVectorizer()
X_vec_train = vec_train.fit_transform(X_train)
X_vec_test = vec_train.transform(X_test)

In [None]:
# The notebook stores and discusses this number as an accuracy, so compute
# plain accuracy (share of correctly classified test rows). roc_auc_score on
# hard class labels is a different quantity and was being reported under the
# wrong name. Imported here so the cell does not depend on another cell's
# import line.
from sklearn.metrics import accuracy_score

# Same default logistic regression as before, trained on the second feature set.
model = LogisticRegression()
model.fit(X_vec_train, y_train)
predicted_value = model.predict(X_vec_test)
accuracy_value = accuracy_score(y_test, predicted_value)

In [None]:
# Show the score computed for the second model in the previous cell.
print(accuracy_value)

### Our new result is 99.66, not much different from the previous one. I haven't put much work into the modeling, but I can assure you this dataset will always give you above 90% accuracy.

### Why is that? Is it really so easy to tell which news is fake and which is real? I don't know. But I want to show you something about this dataset.

### This part is largely inspired by the notebook below, which you can also check out. I took some ideas from it to show you more about this dataset.
[https://www.kaggle.com/josutk/only-one-word-99-2](https://www.kaggle.com/josutk/only-one-word-99-2)

<h2 style="font-size: 30px;color: #ae2e28;">Part - 2</h2>

# Deep dive into this dataset

## Fact-1: Subject Distribution

In [None]:
# Working copy for the dataset checks, with the real-news label "politicsNews"
# renamed to "politics" so both classes share one category name.
# The replacement is limited to the `subject` column: a frame-wide replace
# would also scan every title and article body for that exact value.
ex_combine_data = combine_data.copy()
ex_combine_data['subject'] = ex_combine_data['subject'].replace('politicsNews', 'politics')
ex_combine_data.head()

In [None]:
# Row counts per `subject` after the politics rename, one bar colour per `target` value.
plt.figure(figsize=(15, 10))
sns.set(style="darkgrid")

color = sns.color_palette("Set2")
ax = sns.countplot(x="subject",  hue='target', data=ex_combine_data, palette=color)

# ax.set(xticklabels=['fake', 'real'])

# NOTE(review): the title is shared with the earlier per-label chart, although
# this figure is broken down by subject.
plt.title("Data distribution of fake and real data")

#### The subjects are not well distributed: the real data contains only two subjects and the fake data contains the remaining ones. Only politics is common to both.

#### So the question is: is this split justified? Otherwise we would be saying that any news we get about the US or the Middle East is completely fake. Does that make sense?

## Fact-2: Text Length

In [None]:
# Side-by-side distributions of the mean word length per article, real on the
# left and fake on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17, 8))

panels = (
    (1, ax1, 'blue', 'Real text'),
    (0, ax2, 'red', 'Fake text'),
)
for label, axis, shade, heading in panels:
    # Length of every whitespace-separated token in each article of this class.
    word = (
        ex_combine_data[ex_combine_data['target'] == label]['text']
        .str.split()
        .apply(lambda tokens: [len(token) for token in tokens])
    )
    sns.distplot(word.map(np.mean), ax=axis, color=shade)
    axis.set_title(heading)

fig.suptitle('Average word length in each text')

#### The average word length per text is not the same for the two classes, and the difference is remarkable. The values cannot be expected to match exactly, but this gap is huge. It may hamper model training.

## Fact-3: Unique Words

In [None]:
# Flatten every real article's whitespace-separated tokens and count them.
all_words_after = real_data['text'].str.split()
merged = list(chain.from_iterable(all_words_after))
d = Counter(merged)
# Take the 50 most frequent tokens straight from the Counter instead of
# building a one-row frame with one column per distinct token, transposing it
# and sorting the whole vocabulary. Columns: "index" (token), "count".
top_count_words = pd.DataFrame(d.most_common(50), columns=['index', 'count'])
top_count_words.head()

In [None]:
from collections import Counter
# Count the distinct lower-cased, whitespace-separated tokens in the real texts.
results = Counter()
for tokens in real_data['text'].str.lower().str.split():
    results.update(tokens)
real_unq_count = len(results)
print(real_unq_count)

In [None]:
# Count the distinct lower-cased, whitespace-separated tokens in the fake texts.
results = Counter()
for tokens in fake_data['text'].str.lower().str.split():
    results.update(tokens)
fake_unq_count = len(results)
print(fake_unq_count)

In [None]:
# Two bars comparing the number of distinct tokens: real at x=1, fake at x=2.
plt.figure(figsize=(8,8))
plt.bar([1, 2], [real_unq_count, fake_unq_count], color=['#72b6a1', '#e99675'])
plt.xticks([1,2], ('real', 'fake'))
plt.show()

#### Earlier we saw that the average word length of the fake texts is not very high, yet the fake data contains more unique words than the real data. That means the ratio of repeated words is very different between the two. What do you think, is this OK for prediction?

## So what is your finding: is this title real or fake?