# Exploring the Fake News Data

## Loading the data

In [None]:
import pandas as pd

# Load the first dataset (train.csv) into a DataFrame.
# The path is relative to the current working directory.
news_dataset = pd.read_csv("../input/fake-news-dataset/train.csv")

In [None]:
# Summarize columns, dtypes and non-null counts to spot malformed or missing data.
news_dataset.info()

In [None]:
# Preview the first rows of the raw data.
news_dataset.head()

In [None]:
# Count how often each label occurs; unexpected labels show up as extra entries.
news_dataset['class'].value_counts()

There seems to be a single invalid value in the `class` column: one row holds a date ('February 5, 2017') where the label should be.

In [None]:
# Look up the row(s) whose class value is the date string 'February 5, 2017'.
news_dataset[news_dataset['class'] == 'February 5, 2017']

In [None]:
# Check which values ended up in the extra 'Unnamed: 6' column.
news_dataset['Unnamed: 6'].value_counts()

The record seems to have been shifted one column to the right because the id value is repeated at the beginning of the row.

In [None]:
import numpy as np

# Repair row 504 (positional index) by moving each value one column to the left.
# The assignments must run left to right: each one reads the next column
# before that column is itself overwritten.
# NOTE(review): the shift starts at column 2, so column 1 of row 504 is left
# untouched while column 2's original value is overwritten — confirm that
# column 1 already holds the correct value for this row.
news_dataset.iloc[504, 2] = news_dataset.iloc[504, 3]
news_dataset.iloc[504, 3] = news_dataset.iloc[504, 4]
news_dataset.iloc[504, 4] = news_dataset.iloc[504, 5]
news_dataset.iloc[504, 5] = news_dataset.iloc[504, 6]
# The last column has nothing to its right, so mark it as missing.
news_dataset.iloc[504, 6] = np.nan

In [None]:
# Display row 504 to verify the values after the shift.
news_dataset.iloc[504]

In [None]:
# Remove the 'index' and 'Unnamed: 6' columns, modifying the DataFrame in place.
news_dataset.drop(columns=['index', 'Unnamed: 6'], inplace=True)

In [None]:
# Re-check the column summary after dropping the two columns.
news_dataset.info()

Saving the fixed dataset.

In [None]:
# Write the cleaned frame to disk; index=False keeps the DataFrame's row index out of the file.
news_dataset.to_csv('news_dataset_1.csv', index=False)

In [None]:
# Sanity check: reload the file that was just written and inspect its structure.
# Note that this rebinds news_dataset to the reloaded copy.
news_dataset = pd.read_csv('news_dataset_1.csv')
news_dataset.info()

In [None]:
# Preview the first rows of the reloaded data.
news_dataset.head()

## Exploring the features

In [None]:
# Print the first record field by field, each followed by a blank line.
first_row = news_dataset.iloc[0]
for col in news_dataset.columns:
    # Format instead of concatenating with `+` so that a non-string cell
    # (number, NaN) is printed rather than raising TypeError.
    print(f"{first_row[col]}\n")

In [None]:
# Count the number of articles per subject category.
news_dataset['subject'].value_counts()

In [None]:
# Fraction of all rows that carry each class label.
news_dataset['class'].value_counts().div(len(news_dataset))

Notes about the data:

* text: this dataset seems to have the apostrophe (single quote character) removed. But this is okay since all punctuation will probably be removed during data preparation. Text seems to have links and mentions (e.g. @RogerJStoneJr) that are probably useless.
* subject: I don't think I will use this column as a feature since I want the model to detect fakeness purely based on the title and text. Also most of the categories can be considered "politics" so I don't think it's going to be very useful.
* date: I won't use the date as a feature.
* class: This is the target the model should try to predict. It is a string that can be either 'Fake' or 'Real'. Needs to be converted to binary (1 or 0). The ratio of fake to real articles in the dataset is about 52% to 48%.

The dataset has no missing (nan) values.

## Visualizing the data

Standardize the text by removing links, mentions and punctuation and lowercasing it, then tokenize it and remove stopwords.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Tokenizer whose pattern matches runs of word characters (\w+).
tokenizer = RegexpTokenizer(r"\w+")

def tokenize_text(x):
    """
    Split every string of a Series into tokens with the module-level tokenizer.

    x: a Pandas Series of strings

    returns a Pandas Series holding one list of tokens (str) per input string
    """

    split_into_tokens = tokenizer.tokenize
    token_lists = x.apply(split_into_tokens)
    return token_lists

def standardize_text(x):
    """
    Clean raw text ahead of tokenization.

    x: a Pandas Series of strings

    returns a Pandas Series of lower-cased strings with links and @mentions
    removed and all characters outside the allowed set stripped
    """

    # regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
    # treats the pattern as a literal string by default, which would turn the
    # regex substitutions below into silent no-ops.
    x = x.str.replace(r"http\S+", "", regex=True)  # whole links
    x = x.str.replace(r"http", "", regex=False)  # leftover "http" fragments
    x = x.str.replace(r"@\S+", "", regex=True)  # @mentions
    # Keep only letters, digits, spaces, newlines and a few punctuation marks.
    x = x.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n\ ]", "", regex=True)
    # Any "@" that is still present is spelled out as "at".
    x = x.str.replace(r"@", "at", regex=False)
    x = x.str.lower()
    return x

english_stopwords = stopwords.words('english')
# Frozen copy used for membership tests: a set lookup is O(1), whereas
# scanning the list above costs one pass over all stopwords per token.
_english_stopword_set = frozenset(english_stopwords)

def remove_stopwords(token_list):
    """
    Drop English stopwords from a list of tokens.

    token_list: a list of tokens (str)

    returns a new list with the tokens that are not English stopwords,
    in their original order
    """
    return [tok for tok in token_list if tok not in _english_stopword_set]

def remove_stopwords_from_series(x):
    """
    Apply remove_stopwords to every token list of a Series.

    x: a Pandas Series of token lists

    returns a Pandas Series with the result of remove_stopwords per element
    """

    return x.apply(remove_stopwords)

def standard_tokens_from_text(x):
    """
    Run the full preprocessing pipeline on a Series of raw text:
    standardize, tokenize, then remove stopwords.

    x: a Pandas Series of strings

    returns the Series produced by remove_stopwords_from_series
    """

    cleaned_text = standardize_text(x)
    token_lists = tokenize_text(cleaned_text)
    return remove_stopwords_from_series(token_lists)

In [None]:
# Add a token-list column for each raw text column.
for source_col, token_col in (('title', 'title_tokens'), ('text', 'text_tokens')):
    news_dataset[token_col] = standard_tokens_from_text(news_dataset[source_col])

In [None]:
# Inspect the token lists produced for the titles.
news_dataset['title_tokens']

In [None]:
# Re-check the column summary now that the token columns exist.
news_dataset.info()

Find the most common unigrams, bigrams and trigrams in the titles of fake and real news.

In [None]:
# Split the articles by label so that each class can be analysed on its own.
fake_news = news_dataset.loc[news_dataset['class'].eq('Fake')]
real_news = news_dataset.loc[news_dataset['class'].eq('Real')]

In [None]:
# Rows whose title is at most 10 characters long.
news_dataset.loc[
    news_dataset['title'].apply(lambda title: len(title) <= 10)
]

In [None]:
from collections import Counter
from nltk.util import ngrams

def count_ngrams(token_list, counter, n):
    """
    Add the n-grams of a token list to a running tally.

    token_list: a list of tokens
    counter: a Counter that is updated in place
    n: the n-gram size

    Token lists with fewer than n tokens are skipped. Returns None.
    """
    if len(token_list) < n:
        return
    counter.update(ngrams(token_list, n))

# count unigrams in the titles of fake news
fake_title_unigram_counts = Counter()
for title_tokens in fake_news['title_tokens']:
    count_ngrams(title_tokens, fake_title_unigram_counts, 1)

# count unigrams in the titles of real news
real_title_unigram_counts = Counter()
for title_tokens in real_news['title_tokens']:
    count_ngrams(title_tokens, real_title_unigram_counts, 1)

In [None]:
# tally the bigrams found in fake-news titles
fake_title_bigram_counts = Counter()
for title_tokens in fake_news['title_tokens']:
    count_ngrams(title_tokens, fake_title_bigram_counts, 2)

# tally the bigrams found in real-news titles
real_title_bigram_counts = Counter()
for title_tokens in real_news['title_tokens']:
    count_ngrams(title_tokens, real_title_bigram_counts, 2)

In [None]:
# tally the trigrams found in fake-news titles
fake_title_trigram_counts = Counter()
for title_tokens in fake_news['title_tokens']:
    count_ngrams(title_tokens, fake_title_trigram_counts, 3)

# tally the trigrams found in real-news titles
real_title_trigram_counts = Counter()
for title_tokens in real_news['title_tokens']:
    count_ngrams(title_tokens, real_title_trigram_counts, 3)

In [None]:
import nltk
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# One frequency plot per class: (plot title, unigram counter, line colour).
unigram_plots = (
    ('Top 20 Unigrams in Fake News', fake_title_unigram_counts, 'r'),
    ('Top 20 Unigrams in Real News', real_title_unigram_counts, 'b'),
)
for plot_title, ngram_counts, line_color in unigram_plots:
    plt.title(plot_title)
    nltk.FreqDist(ngram_counts).plot(20, cumulative=False, color=line_color)

In [None]:
# One frequency plot per class: (plot title, bigram counter, line colour).
bigram_plots = (
    ('Top 20 Bigrams in Fake News', fake_title_bigram_counts, 'r'),
    ('Top 20 Bigrams in Real News', real_title_bigram_counts, 'b'),
)
for plot_title, ngram_counts, line_color in bigram_plots:
    plt.title(plot_title)
    nltk.FreqDist(ngram_counts).plot(20, cumulative=False, color=line_color)

In [None]:
# One frequency plot per class: (plot title, trigram counter, line colour).
trigram_plots = (
    ('Top 20 Trigrams in Fake News', fake_title_trigram_counts, 'r'),
    ('Top 20 Trigrams in Real News', real_title_trigram_counts, 'b'),
)
for plot_title, ngram_counts, line_color in trigram_plots:
    plt.title(plot_title)
    nltk.FreqDist(ngram_counts).plot(20, cumulative=False, color=line_color)