# Trying Different Models on the Fake News Data

## Loading the data

In [None]:
import pandas as pd

# load the first dataset (train.csv) into a DataFrame; the path is relative to the notebook's working directory
news_dataset = pd.read_csv("../input/fake-news-dataset/train.csv")

In [None]:
# column dtypes and non-null counts of the freshly loaded frame
news_dataset.info()

In [None]:
# peek at the first rows of the freshly loaded frame
news_dataset.head()

In [None]:
# distribution of the label column; any unexpected label value shows up here
news_dataset['class'].value_counts()

There seems to be a single wrong value in the `class` column.

In [None]:
# show the record(s) whose label holds the stray date string instead of a class name
news_dataset.loc[news_dataset['class'].eq('February 5, 2017')]

In [None]:
# non-null values in the overflow column 'Unnamed: 6' (value_counts skips NaN by default)
news_dataset['Unnamed: 6'].value_counts()

The record seems to have been shifted to the right due to the id value being repeated at the beginning.

In [None]:
import numpy as np

# shifting the column values in the respective places
# Row 504 is the record inspected above whose values sit one column too far right.
bad_row = 504

# Only shift while the overflow column (position 6) still holds a value. Without
# this guard, re-running the cell would shift the already-repaired row a second
# time and push NaN into the label column.
if pd.notna(news_dataset.iloc[bad_row, 6]):
    # NOTE(review): the shift starts at position 2, so position 1 is left
    # untouched -- confirm against the raw row that it does not need repairing too.
    for col in range(2, 6):
        # each column takes the value currently stored in its right-hand neighbour
        news_dataset.iloc[bad_row, col] = news_dataset.iloc[bad_row, col + 1]
    # the overflow column has been consumed; clear it
    news_dataset.iloc[bad_row, 6] = np.nan

In [None]:
# inspect row 504 after the shift to check where each value ended up
news_dataset.iloc[504]

In [None]:
# Drop the 'index' and overflow columns. Reassigning (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the columns
# are already gone; the in-place version raised KeyError on a second run.
news_dataset = news_dataset.drop(columns=['index', 'Unnamed: 6'], errors='ignore')

In [None]:
# re-check the columns and non-null counts after the drop
news_dataset.info()

Saving the fixed dataset.

In [None]:
# write the repaired frame to the working directory; index=False keeps the row index out of the file
news_dataset.to_csv('news_dataset.csv', index=False)

In [None]:
# sanity check: re-read the file written above and summarize it
# (this rebinds news_dataset to the reloaded copy, which the later cells use)
news_dataset = pd.read_csv('news_dataset.csv')
news_dataset.info()

In [None]:
# first rows of the reloaded frame
news_dataset.head()

## Preparing the Data for Machine Learning Models

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the two text columns; the target is 1 where the label is 'Fake', else 0.
features = news_dataset[['title', 'text']]
target = (news_dataset['class'] == 'Fake').astype(int)

# Stratified split holding out 20% for testing, with a fixed seed. Every piece
# gets a fresh 0..n-1 index so positional and label-based access agree.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=42
    )
)

In [None]:
# shapes of the train/test features and targets
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# raw title of the first training example, before any cleaning
print(x_train.iloc[0]['title'])

In [None]:
# standardize the title and text by removing links and punctuation, collapsing whitespace and converting to lowercase
def standardize_text(x):
    """
    Normalise raw strings ahead of tokenization.

    x: a Pandas Series of strings (texts)

    Returns a new Series in which
      * anything starting with "http" up to the next whitespace, and any
        remaining literal "http", is replaced by a space;
      * every character other than ASCII letters, digits, parentheses and the
        space is replaced by a space;
      * runs of whitespace are collapsed to a single space;
      * the text is lowercased.
    Leading/trailing spaces are not stripped.
    """

    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which silently turns the
    # patterns below into no-ops.
    x = x.str.replace(r"http\S+", " ", regex=True)
    x = x.str.replace(r"http", " ", regex=True)
    # This rule also removes '@', so a mention such as "@bob" keeps its handle
    # text ("bob"). The former '@' -> "at" and "@\S+" rules ran after this one
    # and could never match, so they were dropped without changing the output.
    x = x.str.replace(r"[^A-Za-z0-9()\ ]", " ", regex=True)
    x = x.str.replace(r"\s+", " ", regex=True)
    x = x.str.lower()
    return x

# quick check: the first training title after standardization
print(standardize_text(x_train.iloc[0:1]['title']).values)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Kept as a list (as returned by nltk) so other code can still use it as a sequence.
english_stopwords = stopwords.words('english')

def remove_stopwords(x):
    """
    Drop English stopwords from every text in a Series.

    x: a Pandas Series of strings (texts)

    Each text is split into runs of word characters, tokens found in
    `english_stopwords` are discarded, and the rest are re-joined with single
    spaces. Matching is exact (case-sensitive), so lowercase the text first if
    the stopword list is lowercase.
    """

    word_tokenizer = RegexpTokenizer(r"\w+")
    # Set membership is O(1); testing each token against the list scanned the
    # whole stopword list every time.
    stopword_set = set(english_stopwords)

    def drop_stopwords(text):
        kept = (tok for tok in word_tokenizer.tokenize(text) if tok not in stopword_set)
        return " ".join(kept)

    return x.apply(drop_stopwords)

# quick check: the first training title after standardization and stopword removal
print(remove_stopwords(standardize_text(x_train.iloc[0:1]['title'])).values)

In [None]:
from collections import Counter

class Vocab(object):
    """
    Two-way mapping between tokens and integer indices.

    Index 0 is always '<unk>' and index 1 is always '<pad>'. Any extra special
    tokens follow, then the corpus tokens in order of decreasing frequency.
    """

    def __init__(self, token_series, vocab_size=None, special=[]):
        """
        token_series: a Pandas Series of token lists

        vocab_size is the maximum length of the vocabulary (before adding the
        special tokens); None keeps every token seen.

        special: extra special tokens placed right after '<unk>' and '<pad>'.
        """
        counts = Counter()
        for tokens in token_series:
            counts.update(tokens)

        # dict.fromkeys de-duplicates while preserving order, so '<unk>'/'<pad>'
        # keep indices 0/1 even if the caller lists them again in `special`.
        reserved = list(dict.fromkeys(['<unk>', '<pad>', *special]))
        reserved_set = set(reserved)

        # Corpus tokens that collide with a reserved token are skipped;
        # otherwise they would get a second index and override the reserved
        # one in token_to_idx (e.g. '<pad>' would no longer map to self.pad).
        ranked = [tok for tok, _ in counts.most_common() if tok not in reserved_set]

        # truncate vocabulary
        if vocab_size is not None:
            ranked = ranked[:vocab_size]

        self.unk = 0
        self.pad = 1
        self.idx_to_token = reserved + ranked
        self.token_to_idx = {tok: i for i, tok in enumerate(self.idx_to_token)}

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """
        Map a token to its index (unknown tokens map to self.unk).

        A list/tuple is mapped element-wise, recursively, and returned as a list.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """
        Map an index back to its token.

        A list/tuple is mapped element-wise and returned as a list; nested
        lists are handled recursively, mirroring __getitem__.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(i) for i in indices]

In [None]:
import numpy as np
from sklearn.base import BaseEstimator
    
class Tokenizer(BaseEstimator):
    """
    Turns raw texts into sequences of vocabulary indices.

    fit() builds a Vocab from the tokens of the given texts; transform()
    encodes texts with that vocabulary; to_tokens() decodes index sequences.
    """

    def __init__(self, preprocess=None, vocab_size=None, special=[]):
        """
        preprocess: optional callable taking and returning a Pandas Series of
            strings, applied before tokenization (None = no preprocessing)
        vocab_size: maximum number of corpus tokens kept in the vocabulary
        special: extra special tokens forwarded to Vocab
        """
        # Constructor arguments are stored unmodified: BaseEstimator.get_params
        # and sklearn.base.clone rely on that. Wrapping `preprocess` in a lambda
        # here broke cloning and crashed on the default preprocess=None.
        self.preprocess = preprocess
        self.vocab_size = vocab_size
        self.special = special
        self.word_tokenizer = RegexpTokenizer(r"\w+")

    def _tokenize(self, X):
        """Preprocess the texts (if configured) and split each into a token list."""
        X = pd.Series(X)
        if self.preprocess is not None:
            X = self.preprocess(X)
        return X.apply(self.word_tokenizer.tokenize)

    @staticmethod
    def _as_object_array(rows):
        """
        Pack a list of variable-length lists into a 1-D object array.

        np.array(rows) raises on ragged input in recent NumPy and silently
        produces a 2-D array when all rows happen to have equal length; filling
        an object array element by element always yields one list per element.
        """
        packed = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            packed[i] = row
        return packed

    def fit(self, X, y=None):
        """
        Build the vocabulary from the tokens of X.

        X: a Pandas Series of strings
        """
        self.vocab = Vocab(self._tokenize(X), self.vocab_size, self.special)
        return self

    def transform(self, X):
        """
        Converts a Series of strings to a numpy array of lists of indices

        X: a Pandas Series of strings

        Requires fit() or fit_transform() to have been called first.
        """
        tokens = self._tokenize(X)
        return self._as_object_array([self.vocab[t] for t in tokens])

    def fit_transform(self, X, y=None):
        """
        Build the vocabulary from X and encode X, tokenizing only once.

        X: a Pandas Series of strings
        """
        tokens = self._tokenize(X)
        self.vocab = Vocab(tokens, self.vocab_size, self.special)
        return self._as_object_array([self.vocab[t] for t in tokens])

    def to_tokens(self, indices):
        """
        Converts a numpy array of lists of indices to a numpy array of tokens
        """
        # Uses this instance's vocabulary and the `indices` argument; the
        # previous body read the notebook globals `nostop_tokenizer` and
        # `example_indices` and ignored both.
        return self._as_object_array([self.vocab.to_tokens(list(x)) for x in indices])

In [None]:
# Tokenizer whose preprocessing standardizes the text and then strips stopwords;
# the vocabulary is capped at 10000 corpus tokens (special tokens come on top).
nostop_tokenizer = Tokenizer(lambda x: remove_stopwords(standardize_text(x)), vocab_size=10000)
# The vocabulary is fitted on every training title but only the first 5000 training texts.
nostop_tokenizer.fit(pd.concat([x_train['title'], x_train['text'][:5000]]))  # TODO: use all the text data

In [None]:
# encode the first two training titles as sequences of vocabulary indices
example_indices = nostop_tokenizer.transform(x_train.iloc[0:2]['title'])
print(example_indices)

In [None]:
# decode the example index sequences back to tokens and print one sentence per line
decoded_examples = nostop_tokenizer.to_tokens(example_indices)
for token_list in decoded_examples:
    print(' '.join(token_list))

In [None]:
# TODO: create a vocabulary and tokenizer for each of these cases:
#  1. with stopwords
#  2. without stopwords [done]
#  3. with special tokens such as <UNK>, <PAD>, <SOS>, <EOS> for deep learning models

# TODO: for deep learning models:
#  batch the text by using the <PAD> tokens
#  use word embeddings (pretrained as well as random)

## Evaluating Different Machine Learning Models

### Bag of Words + Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# TODO: create custom count vectorizer