# Fake News Detection - Embeddings + Neural Networks: Content

1. Initial Data Cleaning and Exploration
    * Checking for and removing duplicate news
    * Deciding which features to use for analysis by checking for relationships between features and labels.
    
    
2. Data Preprocessing
    * Removing punctuation and unneeded characters from news text.
    * Removing stop words
    * tokenization
    * stemming


3. Feature Extraction and Model Training
    * Using TF-IDF and basic classification algorithms(Naive Bayes and Logistic Regression)
    * Using Word embeddings from scratch + neural networks
    * Using pre-trained word embeddings(GloVe) + neural networks

In [None]:


import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
pip install BeautifulSoup4

In [None]:
# Load the fake-news articles and label every row with credibility 0.
fake_news = pd.read_csv(
    '../input/fake-and-real-news-dataset/Fake.csv'
).assign(credibility=0)
fake_news

In [None]:
# Load the real-news articles and label every row with credibility 1.
real_news = pd.read_csv(
    '../input/fake-and-real-news-dataset/True.csv'
).assign(credibility=1)
real_news

## Data Exploration

In [None]:
# checking for the content of some rows
pd.set_option('max_colwidth', None)
# Stack fake and real articles into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
all_news = pd.concat([fake_news, real_news], ignore_index=True)
all_news.sample(3)

In [None]:
# Summarise the combined frame: column names, non-null counts and dtypes.
all_news.info()

In [None]:
import seaborn as sns

# Bar chart of the number of articles per credibility label, used to
# check for class imbalance.
figure_size = (11, 5)
sns.set(rc={'figure.figsize': figure_size})
sns.countplot(data=all_news, x='credibility')

From the above, it is clear that the dataset is balanced for both fake and real news

In [None]:
# checking for duplicate text

from hashlib import sha256
from tqdm import tqdm

# Fingerprint every article body with SHA-256 so that identical texts end
# up with identical values in the 'hash' column.
all_news['hash'] = [
    sha256(body.encode('utf-8')).hexdigest()
    for body in tqdm(all_news['text'])
]
pd.reset_option('max_colwidth')
all_news

In [None]:
# Count the rows sharing each text hash, then keep only the hashes that
# occur more than once.
t = all_news.groupby(['hash']).size().reset_index(name='count')
duplicate = t.loc[t['count'] > 1]
print('there are ', len(duplicate), 'duplicate texts')

In [None]:
# Keep the first row for each text hash, renumber the rows from zero and
# discard the helper 'hash' column.
all_news = (
    all_news
    .drop_duplicates(subset='hash')
    .reset_index(drop=True)
    .drop('hash', axis=1)
)
all_news

In [None]:
# Re-draw the per-label article counts now that duplicates are gone.
figure_size = (11, 5)
sns.set(rc={'figure.figsize': figure_size})
sns.countplot(data=all_news, x='credibility')

After dropping duplicates, the count of fake news has reduced, meaning most of the duplicate texts were from fake news. However, the dataset is still balanced.

## Checking for Relationship between features(subject, date, title) and labels(credibility)

### Checking for relationship between news subject and news credibility

In [None]:
import seaborn as sns

# Article counts per subject, split by credibility label, to see whether
# the subject separates fake from real news.
figure_size = (11, 5)
sns.set(rc={'figure.figsize': figure_size})
sns.countplot(data=all_news, x='subject', hue='credibility')

* From the plot above, it is clear that real news are only centered around politicNews and worldnews subject areas, while fake news are centered around the other subject areas.
* This indicates that the subject area can help determine if news is fake or real

### Checking for relationship between news date and news credibility

In [None]:
#converting date string to datetime format

#removing url in date column
url_pattern = "http"
# regex=False treats the pattern as a literal substring; na=False maps a
# missing date to False so the mask stays boolean and `~` can negate it.
filter1 = all_news['date'].str.contains(url_pattern, regex=False, na=False)
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not act on a slice of another DataFrame.
all_news = all_news[~filter1].copy()
all_news

In [None]:
# removing other texts in date column
# Keep only rows whose date string contains an English month abbreviation.
date_pattern = "Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
# na=False: a missing date counts as "not a date", which keeps the mask
# purely boolean (a NaN in the mask cannot be used for row selection).
filter2 = all_news['date'].str.contains(date_pattern, na=False)
# The non-inplace reset_index returns a new, renumbered frame instead of
# mutating a filtered slice in place.
all_news = all_news[filter2].reset_index(drop=True)

In [None]:
# Parse the date strings on a copy of the data (all_news itself keeps the
# original strings), order the rows chronologically and renumber them.
all_news_copy = all_news.copy()
all_news_copy['date'] = pd.to_datetime(all_news_copy['date'])
all_news_copy = all_news_copy.sort_values(by=['date']).reset_index(drop=True)
pd.reset_option('max_rows')
all_news_copy

In [None]:
# creating a dataframe of fake news counts by date
fake = all_news_copy[all_news_copy['credibility'] == 0]
# size() counts the rows per date directly, so no dummy 'count' column has
# to be written into the filtered slice (which triggers pandas'
# SettingWithCopyWarning). The result is a one-column frame named 'count',
# indexed by date.
fake = fake.groupby(['date']).size().to_frame('count')
fake

In [None]:
# creating a dataframe of real news counts by date
real = all_news_copy[all_news_copy['credibility'] == 1]
# size() counts the rows per date directly, so no dummy 'count' column has
# to be written into the filtered slice (which triggers pandas'
# SettingWithCopyWarning). The result is a one-column frame named 'count',
# indexed by date.
real = real.groupby(['date']).size().to_frame('count')
real

In [None]:

# Overlay the per-date article counts of fake and real news on one figure.
figure_size = (11, 5)
sns.set(rc={'figure.figsize': figure_size})
sns.lineplot(y=fake['count'], x=fake.index)
sns.lineplot(y=real['count'], x=real.index)

From the plot above, it seems there is some correlation between the date a news article was created and its credibility. There was a sharp rise in fake news in later years, while real news dropped marginally.

### Checking for relationship between news title, news text and news credibility

In [None]:
# word cloud for real news title

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

# Lower-case each real-news title, collapse its whitespace to single
# spaces and concatenate all titles, each followed by one space.
real_titles = all_news.loc[all_news['credibility'] == 1, 'title']
real_words = "".join(
    " ".join(word.lower() for word in str(title).split()) + " "
    for title in real_titles
)

wordcloud_ = WordCloud(stopwords=stopwords).generate(real_words)
plt.figure(figsize=(12, 16), facecolor=None)
plt.axis('off')
plt.imshow(wordcloud_)

In [None]:
# word cloud for fake news title
# Lower-case each fake-news title, collapse its whitespace to single
# spaces and concatenate all titles, each followed by one space.
fake_titles = all_news.loc[all_news['credibility'] == 0, 'title']
fake_words = "".join(
    " ".join(word.lower() for word in str(title).split()) + " "
    for title in fake_titles
)

wordcloud_ = WordCloud(stopwords=stopwords).generate(fake_words)
plt.figure(figsize=(12, 16), facecolor=None)
plt.axis('off')
plt.imshow(wordcloud_)

Similar common words in both fake news and real news titles include: Trump, Obama, etc. But there are words like White House, US, North Korea, Russia, that are very common in real news titles but are not so common in fake news titles. On the other hand, there are words like Video, tweet, hillary, watch, gop, that are common in fake news titles, but are not so common in real news titles. This shows that there is some distinguishing feature between most real and fake news titles, and including titles in our analysis can add some information to our model

### Checking for relationship between news text and credibility

In [None]:
# word cloud for real news text
# Lower-case each real-news body, collapse its whitespace to single spaces
# and concatenate all bodies, each followed by one space.
real_bodies = all_news.loc[all_news['credibility'] == 1, 'text']
real_words = "".join(
    " ".join(word.lower() for word in str(body).split()) + " "
    for body in real_bodies
)

wordcloud_ = WordCloud(stopwords=stopwords).generate(real_words)
plt.figure(figsize=(12, 16), facecolor=None)
plt.axis('off')
plt.imshow(wordcloud_)

In [None]:
# word cloud for fake news text

# Lower-case each fake-news body, collapse its whitespace to single spaces
# and concatenate all bodies, each followed by one space.
fake_bodies = all_news.loc[all_news['credibility'] == 0, 'text']
fake_words = "".join(
    " ".join(word.lower() for word in str(body).split()) + " "
    for body in fake_bodies
)

wordcloud_ = WordCloud(stopwords=stopwords).generate(fake_words)
plt.figure(figsize=(12, 16), facecolor=None)
plt.axis('off')
plt.imshow(wordcloud_)

Although there are similar common words in both real news text and fake news text, there are still some distinguishing common words like people, featured image, percent, wednesday, thursday, tuesday, US, one, etc. This shows that the text of a news article is also a determining factor in its credibility.

# Data Preprocessing

In [None]:
# Merge title, body, subject and date into one space-separated text
# column, then drop the four source columns.
merged_columns = ['title', 'text', 'subject', 'date']
all_news['news_text'] = (
    all_news['title'] + ' ' + all_news['text']
    + ' ' + all_news['subject'] + ' ' + all_news['date']
)
all_news = all_news.drop(merged_columns, axis=1)
all_news

In [None]:
# Show full cell contents, put the text column before the label and
# display one random row.
pd.set_option('max_colwidth', None)
kept_columns = ['news_text', 'credibility']
all_news = all_news[kept_columns]
all_news.sample()

The news_text column contains characters like brackets, @ symbols, links, and a lot of other characters or texts that might not add much information to our model, so we have to clean and preprocess the data to remove such characters before we fit the text to our model.

In [None]:
import nltk
# Import the corpus reader under an alias: binding it to the bare name
# `stopwords` would overwrite the `stopwords` set that the word-cloud
# cells pass to WordCloud, breaking them if they are re-run.
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('words')
nltk.download('stopwords')
# English stop words as a set: remove_stopwords tests every token with
# `not in stop`, and set membership is constant-time.
stop = set(nltk_stopwords.words('english'))

In [None]:
import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Run the full cleaning pipeline on one article string.

    Steps, in order: strip HTML markup, drop [bracketed] spans, drop URLs,
    drop stop words, drop "(@handle)" mentions, drop remaining
    parenthesised spans. Brackets are removed before URLs so that a URL
    inside brackets cannot swallow the closing bracket.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    text = remove_twitter_handles(text)
    text = remove_parenthesis(text)
    return text


def strip_html(text):
    """Return the visible text of `text`, parsed as HTML by BeautifulSoup."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def remove_between_square_brackets(text):
    """Delete every "[...]" span, including the brackets themselves."""
    # Raw string: '\[' is an invalid escape in a normal string literal.
    return re.sub(r'\[[^]]*\]', '', text)


def remove_twitter_handles(text):
    """Delete Twitter handles written as "(@name)"."""
    return re.sub(r'\(@([A-Za-z0-9_]+)\)', '', text)


def remove_urls(text):
    """Delete every run of non-whitespace characters starting with "http".

    This function used to be a second definition of
    remove_between_square_brackets, which silently replaced the bracket
    remover above; it now has its own name so both steps run.
    """
    return re.sub(r'http\S+', '', text)


def remove_parenthesis(text):
    """Delete every innermost "(...)" span, including the parentheses."""
    return re.sub(r'\([^()]*\)', '', text)


def remove_stopwords(text, stop_words=None):
    """Return `text` without stop words, tokens joined by single spaces.

    Tokens are split on whitespace and compared case-insensitively.
    `stop_words` is any container supporting `in`; when omitted, the
    module-level `stop` collection is used.
    """
    if stop_words is None:
        stop_words = stop
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )


# Run the cleaning pipeline on every row of the news_text column.
all_news['news_text'] = all_news['news_text'].map(clean_text)
all_news.sample()

In [None]:

from nltk import word_tokenize

# Work on a copy so all_news keeps the cleaned but untokenized text, and
# turn every article into a list of word tokens.
all_news_1 = all_news.copy()
all_news_1['news_text'] = all_news_1['news_text'].map(
    lambda document: word_tokenize(str(document))
)
all_news_1.sample()

In [None]:
from nltk.stem import SnowballStemmer

# Replace every token of every article with its English Snowball stem.
snowball = SnowballStemmer(language='english')
all_news_1['news_text'] = all_news_1['news_text'].map(
    lambda tokens: list(map(snowball.stem, tokens))
)
all_news_1.sample()

In [None]:
# Turn each list of stemmed tokens back into one space-separated string.
all_news_1['news_text'] = all_news_1['news_text'].map(' '.join)
all_news_1.sample()

# Feature Extraction and Model Training

## Using TF-IDF

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the stemmed text, stratified so both parts keep the same
# fake/real ratio. random_state is fixed so the split (and therefore every
# score computed from it) is reproducible across runs, matching the
# random_state=1 used by the later split for the embedding models.
X_train, X_test, y_train, y_test = train_test_split(
    all_news_1['news_text'],
    all_news_1['credibility'],
    test_size=0.3,
    stratify=all_news_1['credibility'],
    random_state=1,
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training texts only.
train_vectors = vectorizer.fit_transform(X_train)
# Reuse the fitted vocabulary for the test texts. Calling fit_transform
# here would learn a second, unrelated vocabulary, so the test matrix
# would have a different width and its columns would stand for different
# terms than the ones the classifiers are trained on.
test_vectors = vectorizer.transform(X_test)

print("Train vector shape:",train_vectors.shape)
print("Test vector shape:", test_vectors.shape)


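* the train and test matrices must have the same number of columns, and each column must stand for the same term
* a mismatch in test vector shape and train vector shape means the two were built from different vocabularies; the shapes are made equal below, but only transforming the test set with the vectorizer fitted on the train set makes the columns line up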


In [None]:

import scipy
from scipy.sparse import csr_matrix

# Rebuild both feature matrices as CSR sparse matrices; the test matrix is
# declared with the training matrix's column count.
train_vectors = csr_matrix(train_vectors)
# NOTE(review): passing `shape` only declares the matrix dimensions; it
# presumably does not remap column indices, so the columns only match the
# training vocabulary if test_vectors already came from the vectorizer
# fitted on X_train - confirm.
test_vectors = csr_matrix(test_vectors, shape = (test_vectors.shape[0], train_vectors.shape[1])) 
# creates a sparse matrix with the given shape
test_vectors

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# report its accuracy on the held-out test set.
clf = MultinomialNB().fit(train_vectors, y_train)
y_pred = clf.predict(test_vectors)
nb_accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:", nb_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
# Train an L1-penalised logistic regression (liblinear solver, C=100) on
# the TF-IDF features and report its accuracy on the held-out test set.
lr_settings = dict(solver='liblinear', penalty='l1', C=100)
clf = LogisticRegression(**lr_settings).fit(train_vectors, y_train)
y_pred = clf.predict(test_vectors)
lr_accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:", lr_accuracy)

Note: In previous versions of this notebook, I vectorized the data before I did the train/test split and I got an accuracy of about 95%, but I received feedback that doing so before splitting the data causes information from the training set to mix with that of the test set. After doing the train/test split before vectorizing, the accuracy dropped drastically for each of the models used. This suggests that the initial 95% accuracy was not really representative of the actual model performance.

## Using Word Embeddings

### Creating Word Embedding from Scratch

In [None]:
# Start the embedding experiments from a copy of all_news, i.e. the text
# after clean_text but without the tokenizing/stemming applied to
# all_news_1.
all_news_2 = all_news.copy()
all_news_2.sample()

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split of the cleaned text and labels with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    all_news_2['news_text'],
    all_news_2['credibility'],
    random_state=1,
    test_size=0.30,
)

In [None]:
import tensorflow 
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train) 
# updates internal vocabulary with words in train set
# each word is represented with an integer based on the frequency of the word in the entire train data
# words with a higher frequency gets lower integer values


sequences = tokenizer.texts_to_sequences(X_train)
# creates a sequence of integers that represents each word in each row of train data


word_index = tokenizer.word_index 
# creates a dictionary of unique words and their integer values 

vocab_size = len(word_index)
print('Training vocabulary size: ', vocab_size)

# Encode the test texts with the tokenizer fitted on the TRAIN set, so an
# integer stands for the same word in both sets and never exceeds the
# Embedding layer's input size (vocab_size + 1). Words not seen during
# training are dropped. Encoding with a tokenizer fitted on the test set
# would assign unrelated integers to the same words.
test_sequences = tokenizer.texts_to_sequences(X_test)

# A separate tokenizer is still fitted on the test set, but only to report
# the size of the test vocabulary and to provide test_word_index.
test_tokens = Tokenizer()
test_tokens.fit_on_texts(X_test)
test_word_index = test_tokens.word_index
test_vocab_size = len(test_word_index)
print('Testing vocabulary size: ', test_vocab_size )

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Zero-pad the training sequences at the end up to the longest one.
X_train = pad_sequences(sequences, padding = 'post')
# Give the test sequences the same length as the training ones. Without
# maxlen they would be padded to the longest TEST article, so the amount
# of zero padding averaged in by GlobalAveragePooling1D would differ
# between training and validation. Longer test articles are cut at the end.
X_test = pad_sequences(test_sequences, padding = 'post', truncating = 'post',
                       maxlen = X_train.shape[1])

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

embedding_dim = 200

# Classifier with an embedding learned from scratch: word vectors are
# averaged over the sequence, passed through one hidden layer and reduced
# to a single output unit with no activation.
model = Sequential()
model.add(Embedding(vocab_size + 1, embedding_dim, name="embedding"))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(1))

model.summary()

The Embedding layer maps integer indices (which stand for specific words) to dense vectors (their embeddings). The dimensionality (or width) of the embedding is a parameter you can experiment with to see what works well for your problem.

In [None]:
# The final Dense(1) layer has no activation, so the model outputs logits:
# the loss is told so with from_logits=True, and accuracy must cut at
# logit 0.0 (probability 0.5). The string 'accuracy' would compare the raw
# logits against 0.5 instead. The metric keeps the name 'accuracy' so the
# history keys 'accuracy' / 'val_accuracy' are unchanged.
model.compile(optimizer='adam',
              loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tensorflow.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train for 15 epochs, evaluating on the test split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=15,
    validation_data=(X_test, y_test),
)


In [None]:
import matplotlib.pyplot as plt

# Training and validation accuracy per epoch for the from-scratch
# embedding model.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

### Using Pre-trained Word Embeddings: GloVe

In [None]:
glove_dir = '../input/glove6b100dtxt/glove.6B.100d.txt'
embedding_dimension = 100 

# Parse the GloVe file into {word: float32 vector}; every line holds a
# word followed by its vector components, separated by whitespace.
embeddings_index = {}
# The encoding is given explicitly so decoding does not depend on the
# platform default, and `with` closes the file even if parsing fails.
with open(glove_dir, encoding='utf-8') as f:
    print('Loading GloVe from:', glove_dir,'...', end='')
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n Proceeding with Embedding Matrix...", end="")


def _build_embedding_matrix(index_by_word):
    """Return a (len(index_by_word) + 1, embedding_dimension) matrix.

    Row i holds the GloVe vector of the word whose index is i. Rows without
    a GloVe vector, including row 0 which no word maps to, keep their
    uniform random initial values.
    """
    matrix = np.random.random((len(index_by_word) + 1, embedding_dimension))
    for token, row in index_by_word.items():
        vector = embeddings_index.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix


# for train data
embedding_matrix = _build_embedding_matrix(word_index)

# for test data
test_embedding_matrix = _build_embedding_matrix(test_word_index)
print(" Completed!")

In [None]:

# Classifier whose embedding layer starts from the GloVe matrix: word
# vectors are averaged over the sequence, passed through two hidden layers
# and reduced to a single output unit with no activation.
model = Sequential()
model.add(
    Embedding(vocab_size + 1, embedding_dimension,
              weights=[embedding_matrix], name="embedding")
)
model.add(GlobalAveragePooling1D())
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))

model.summary()


In [None]:
# The final Dense(1) layer has no activation, so the model outputs logits:
# the loss is told so with from_logits=True, and accuracy must cut at
# logit 0.0 (probability 0.5). The string 'accuracy' would compare the raw
# logits against 0.5 instead. The metric keeps the name 'accuracy' so the
# history keys 'accuracy' / 'val_accuracy' are unchanged.
model.compile(optimizer='adam',
              loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tensorflow.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train the GloVe-initialised model for 15 epochs, evaluating on the test
# split after every epoch.
history_glove = model.fit(
    x=X_train,
    y=y_train,
    epochs=15,
    validation_data=(X_test, y_test),
)

In [None]:
import matplotlib.pyplot as plt

# Training and validation accuracy per epoch for the GloVe-initialised
# model.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history_glove.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Next Step

Although the accuracies were high in both models, the validation accuracies did not improve much and were lower which shows both models were overfitting. The next step would be to improve the architecture of the neural networks to see if the validation accuracies improve.