# Read True and Fake CSV

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load True.csv from the Kaggle input directory and preview its first rows.
# The trailing note records the full frame size seen by the author, not the preview size.
true_df = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')
true_df.head()  # 21417 rows × 4 columns

In [None]:
# Load Fake.csv from the Kaggle input directory and preview its first rows.
# The trailing note records the full frame size seen by the author, not the preview size.
fake_df = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
fake_df.head()  # 23481 rows × 4 columns

# Add Label Column and create concat dataframe

In [None]:
## Assign labels for the news type: rows from True.csv get 1, rows from Fake.csv get 0
true_df['news_type'] = 1 
fake_df['news_type'] = 0

In [None]:
## Concatenate both dataframes and shuffle the rows
news_df = pd.concat([true_df, fake_df], ignore_index=True)
news_df = news_df.sample(frac=1).reset_index(drop=True)
# Keep an independent copy of the shuffled frame. `news_df['text']` is later
# overwritten in place with the preprocessed text; a plain alias
# (`train_df = news_df`) would be modified along with it and no untouched
# article text would remain for later use.
train_df = news_df.copy()
train_df.head()  # 44898 rows × 5 columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Label vector (the `news_type` column as a NumPy array), taken in the shuffled row order of news_df.
final_labels = np.array(news_df['news_type'])

# Preprocess Text

In [None]:
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

In [None]:
news_df.iloc[0]['text']

In [None]:
## Preprocess the Text

wordnet_lemma = WordNetLemmatizer()
porter_stemmer = PorterStemmer()  # built once and reused instead of once per token
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one article into a space-separated string of cleaned tokens.

    Steps: tokenize, lowercase, drop stop words, stem, lemmatize, keep only
    purely alphabetic or purely numeric tokens, drop stop words again.

    Stop words are removed *before* stemming because the stemmer changes the
    spelling of some of them (e.g. "was" -> "wa", "this" -> "thi"), so the
    stemmed forms no longer match the stop-word list. The second pass keeps
    the previous behaviour of also dropping tokens whose stem is a stop word.

    Args:
        text: Raw article text. Anything that is not a ``str`` (e.g. a NaN
            from a missing CSV cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Tokenization + lowercasing, with stop words removed in their original spelling
    tokens = [w.lower() for w in word_tokenize(text)]
    tokens = [w for w in tokens if w not in stop_words]

    # Stemming followed by lemmatization
    normalised = [wordnet_lemma.lemmatize(porter_stemmer.stem(w)) for w in tokens]

    # Keep alphabetic or numeric tokens only; this also discards punctuation
    # tokens, so no separate punctuation pass is needed.
    words = [w for w in normalised if w.isalpha() or w.isnumeric()]

    # Second stop-word pass for stems that collapse onto a stop word
    words = [w for w in words if w not in stop_words]

    return ' '.join(words)

preprocess_text(news_df.iloc[0]['text'])

In [None]:
%%time
news_df['text'] = news_df['text'].apply(preprocess_text)

In [None]:
news_df.head()

## WORDCLOUD for True News

In [None]:
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt 

# Word cloud over the text of all rows labelled news_type == 1.
plt.figure(figsize=(15, 15))

true_corpus = ' '.join(news_df.loc[news_df['news_type'] == 1, 'text'])
wc_true = WordCloud(stopwords=STOPWORDS, max_words=2000, width=1000, height=500)
wc_true = wc_true.generate(true_corpus)
plt.imshow(wc_true, interpolation='bilinear')

## WORDCLOUD for Fake News

In [None]:
# Word cloud over the text of all rows labelled news_type == 0.
plt.figure(figsize=(15, 15))

fake_corpus = ' '.join(news_df.loc[news_df['news_type'] == 0, 'text'])
wc_fake = WordCloud(stopwords=STOPWORDS, max_words=2000, width=1000, height=500)
wc_fake = wc_fake.generate(fake_corpus)
plt.imshow(wc_fake, interpolation='bilinear')

# Case 1 : Encoding words to numbers using Bag of Words with sklearn:SVM and sklearn:RandomForest Classifier

### Bag Of Words

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: counts of the 200 most frequent 1- to 3-grams,
# materialised as a dense array.
vectorize = CountVectorizer(ngram_range=(1, 3), max_features=200)
bow_counts = vectorize.fit_transform(news_df['text'])
data_1 = bow_counts.toarray()

# Report the feature matrix and the label vector together with their shapes.
for caption, value in (
    ("Data Case 1 : \n", data_1),
    ("Data Case 1 Shape : \n", data_1.shape),
    ("Label : \n", final_labels),
    ("Label Shape : \n", final_labels.shape),
):
    print(caption, value)

In [None]:
# Hold out 20% of the bag-of-words features for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(data_1,final_labels,test_size=0.2,random_state=777)

### sklearn : SVM Classifier

In [None]:
%%time
from sklearn.svm import LinearSVC

# Fit a linear SVM on the training split and report its test-set metrics.
print("\nSVM Classifier : \n")
model_svc = LinearSVC()
model_svc.fit(x_train, y_train)
y_pred = model_svc.predict(x_test)

svc_confusion = confusion_matrix(y_test, y_pred)
svc_report = classification_report(y_test, y_pred)
svc_accuracy = model_svc.score(x_test, y_test)

print("Confusion Matrix : \n", svc_confusion)
print("\n\nClassification Report : \n", svc_report)
print("\n\nAccuracy : ", svc_accuracy)

### sklearn : RandomForest Classifier

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Fit a 300-tree random forest on the training split and report its test-set metrics.
model_rfc = RandomForestClassifier(random_state=40, n_estimators=300)
model_rfc.fit(x_train, y_train)
y_pred = model_rfc.predict(x_test)

print("\nRandomForestClassifier : \n")
rfc_confusion = confusion_matrix(y_test, y_pred)
rfc_report = classification_report(y_test, y_pred)
rfc_accuracy = model_rfc.score(x_test, y_test)

print("Confusion Matrix : \n", rfc_confusion)
print("\nClassification Report : \n", rfc_report)
print("\nAccuracy : ", rfc_accuracy)

# Case 2 : TF-IDF with XGBoost and LightGBM Classifier

### TF-IDF

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the preprocessed text; the result of fit_transform is
# kept as returned (no dense conversion).
tfidf = TfidfVectorizer(stop_words='english', lowercase=False)
data_2 = tfidf.fit_transform(news_df['text'])

# Report the feature matrix and the label vector together with their shapes.
for caption, value in (
    ("Data Case 2 : \n", data_2),
    ("Data Case 2 Shape : \n", data_2.shape),
    ("Label : \n", final_labels),
    ("Label Shape : \n", final_labels.shape),
):
    print(caption, value)

In [None]:
# Split the TF-IDF features (data_2). Splitting data_1 here would silently
# train the Case 2 models on the Case 1 bag-of-words features instead.
x_train, x_test, y_train, y_test = train_test_split(data_2, final_labels, test_size=0.2, random_state=777)

### XGBoost Classifier

In [None]:
%%time
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# 'logloss' is a classification metric (the previous 'rmse' is a regression
# metric). The misspelled `use_label_encode` keyword was not a real
# XGBClassifier parameter and has been dropped.
model_xgb = XGBClassifier(eval_metric='logloss').fit(x_train, y_train)
y_pred = model_xgb.predict(x_test)

print("\nXGBoost Classifier : \n")
print("Confusion Matrix : \n",confusion_matrix(y_test,y_pred))
print("\nClassification Report : \n", classification_report(y_test,y_pred))
# scikit-learn metric signature is (y_true, y_pred)
print("\nAccuracy : ",accuracy_score(y_test,y_pred))

### LightGBM Classifier

In [None]:
%%time
from lightgbm import LGBMClassifier, early_stopping

# Carve a validation set out of the training data for early stopping.
# Separate names leave x_train / y_train untouched, so re-running this cell
# does not shrink the training set a second time.
x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=777)

print("\nLightGBM Classifier : \n")
# - stored under its own name so the SVM held in `model_svc` is not overwritten
# - 'binary_error' (1 - accuracy) is a built-in LightGBM metric; 'accuracy' is not
# - early stopping is requested through the callback rather than the
#   `early_stopping_rounds` fit keyword
model_lgbm = LGBMClassifier(n_estimators=300).fit(
    x_fit,
    y_fit,
    eval_set=[(x_val, y_val)],
    eval_metric='binary_error',
    callbacks=[early_stopping(stopping_rounds=100)],
)
y_pred = model_lgbm.predict(x_test)

print("Confusion Matrix : \n",confusion_matrix(y_test,y_pred))
print("\n\nClassification Report : \n", classification_report(y_test,y_pred))
# scikit-learn metric signature is (y_true, y_pred)
print("\n\nAccuracy : ",accuracy_score(y_test,y_pred))

# Case 3 :  Pre-trained GloVe Embedding and Tensorflow LSTM

In [None]:
import tensorflow as tf

# Turn on on-demand GPU memory allocation for every visible GPU. When no GPU
# is present the list is empty and the loop is skipped, instead of failing
# with an IndexError on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

### Tokenization, Padding

In [None]:
%%time
# Tokenization : Representing each word by a vector
from tensorflow.keras.preprocessing import text, sequence

max_features = 10000  # vocabulary size handed to the tokenizer (num_words)
maxlen = 300 # keep all text to 300, add padding for text len < 300 and truncating long ones

# Fit the tokenizer on the preprocessed articles, turn each article into a
# sequence of word ids, then pad/truncate every sequence to `maxlen`.
raw_texts = news_df['text'].values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(raw_texts)
X = tokenizer.texts_to_sequences(raw_texts)
data_3 = sequence.pad_sequences(X, maxlen=maxlen)

# Report the padded matrix and the label vector together with their shapes.
for caption, value in (
    ("Data Case 3 : \n", data_3),
    ("Data Case 3 Shape : \n", data_3.shape),
    ("Label : \n", final_labels),
    ("Label Shape : \n", final_labels.shape),
):
    print(caption, value)

### Introducing GloVe Embedding and creating Embedding Matrix

In [None]:
%%time

Embedding_file ='../input/glove-twitter/glove.twitter.27B.100d.txt'

def get_coeffs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields of the line, i.e. the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Parse the embedding file into {word: vector}. The context manager closes
# the file handle as soon as parsing is finished.
with open(Embedding_file, encoding="utf8") as glove_file:
    embed_id = dict(get_coeffs(*line.rstrip().rsplit(' ')) for line in glove_file)

# np.stack expects a real sequence; a dict view is not one, so materialise it.
embeds = np.stack(list(embed_id.values()))
emb_mean, emb_std = embeds.mean(), embeds.std()
embeds_len = embeds.shape[1]

word_index = tokenizer.word_index # mapping of original word to number
# The Keras tokenizer numbers words from 1 (0 is reserved), so a vocabulary of
# N words needs N + 1 rows for the largest index to be addressable.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors that match the pre-trained embeddings'
# mean / standard deviation, then overwrite the rows of every word that has a
# pre-trained vector.
embed_mat = np.random.normal(emb_mean, emb_std, (nb_words, embeds_len))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # index falls outside the matrix
    embed_vec = embed_id.get(word)
    if embed_vec is not None:
        embed_mat[i] = embed_vec

print("Embedding Matrix : \n",embed_mat)
print("Embedding Matrix size : ",embed_mat.shape)

### Reference : https://www.kaggle.com/madz2000/nlp-using-glove-embeddings-99-87-accuracy --- to understand glove embedding

### Build Model BiLSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers, Input, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Masking, Bidirectional, Activation, BatchNormalization
from tensorflow.keras.regularizers import l2, l1_l2
from tensorflow.keras.callbacks import EarlyStopping

# Adam with gradient-norm clipping. `learning_rate` is the supported keyword;
# `lr` is a deprecated alias.
optimizer = optimizers.Adam(clipnorm=0.25, learning_rate=0.0005)

def BiLSTM_Model():
    """Build and compile the two-layer bidirectional LSTM classifier.

    The embedding layer is initialised from the module-level ``embed_mat`` and
    frozen. Its input dimension is taken from ``embed_mat`` itself so the layer
    always agrees with the weight matrix, including when the fitted vocabulary
    is smaller than ``max_features``.

    Returns:
        A compiled Keras ``Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and accuracy metric, using the module-level
        ``optimizer``.
    """
    vocab_rows, embed_dim = embed_mat.shape
    model = Sequential([
        Embedding(vocab_rows, output_dim=embed_dim, weights=[embed_mat], input_length=maxlen, trainable=False),
        Bidirectional(LSTM(64, return_sequences=True, recurrent_regularizer=l2(2e-4))),
        Dropout(0.1),
        Bidirectional(LSTM(32, recurrent_regularizer=l2(2e-4), return_sequences=False)),
        Dense(16, activation='relu'),
        Dense(1, kernel_regularizer=l1_l2(1e-4, 2e-4), activation='sigmoid'),
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


model_bilstm = BiLSTM_Model()
model_bilstm.summary()
# A patience equal to the number of training epochs (10 in this notebook) can
# never trigger. Stop after 3 epochs without validation-loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto', restore_best_weights=True)

### Train Model

In [None]:
# Training hyper-parameters
epochs, batch_size = 10, 128

# 80/20 train/test split, then a further 80/20 train/validation split of the
# training part (same seed for both).
x_train, x_test, y_train, y_test = train_test_split(
    data_3, final_labels, test_size=0.2, random_state=777
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=777
)

history = model_bilstm.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
)

### Evaluate Model

In [None]:
# Loss and accuracy on the held-out test split (both printed scaled by 100).
test_metrics = model_bilstm.evaluate(x_test, y_test)
loss, acc = test_metrics

print("\n\nTest Data Loss : ", loss * 100)
print("\nTest Data Accuracy : ", acc * 100)

# Threshold the sigmoid outputs at 0.5 to obtain class predictions.
test_probabilities = model_bilstm.predict(x_test)
y_pred = (test_probabilities > 0.5).astype("int32")

bilstm_confusion = confusion_matrix(y_test, y_pred)
bilstm_report = classification_report(y_test, y_pred)
print("\nConfusion Matrix : \n", bilstm_confusion)
print("\nClassification Report : \n", bilstm_report)

# Case 4 : BERT and RoBERTa

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

print(device)

### Initialize Pretrained Models of BERT and RoBERTa

In [None]:
%%time
### Initialize Pretrained Models of BERT and RoBERTa

sentences = train_df['text'].values

from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AdamW

model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2,output_attentions=False,output_hidden_states=False)
# 'bert-base-uncased' : 12 layer BERT model with uncased vocab
# num_labels : 2 labels for binary classification
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
# Move the model to the device selected earlier (GPU if available, else CPU);
# an unconditional .cuda() fails on machines without CUDA.
model_bert.to(device)

print("\n\nBERT Model : \n\n",model_bert)

model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base',num_labels=2,output_attentions=False,output_hidden_states=False)
# 'roberta-base' : 12 layer, 768 hidden, 12 heads, 125M params RoBERTa using BERT-base architecture
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta.to(device)

print("\n\nRoBERTa Model : \n\n",model_roberta)

### Tokenization : encode_plus method from tokenizer_bert and tokenizer_roberta 

In [None]:
%%time
### Tokenization : encode_plus method from tokenizer_bert and tokenizer_roberta

# encode_plus : tokenize the sentence, prepend [CLS] to the start, append [SEP] to the end,
# map token to their ID, Pad or truncate the sentence to max_len, create attention masks for [PAD] tokens

inputID_bert = []
attentionMask_bert = []

inputID_roberta = []
attentionMask_roberta = []

# Shared encoding options:
#   max_length + padding='max_length' : pad every text to exactly 120 tokens
#   truncation=True                   : cut longer texts to 120 tokens, so all
#                                       rows have equal length for torch.cat
#   return_attention_mask             : construct attention masks
#   return_tensors='pt'               : return PyTorch tensors
# `padding` / `truncation` replace the deprecated `pad_to_max_length` flag.
encode_options = dict(
    add_special_tokens=True,
    max_length=120,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# The loop variable is called `sentence` so it does not shadow the Keras
# `text` module imported earlier in the notebook.
for sentence in sentences:

    enc_dict_bert = tokenizer_bert.encode_plus(sentence, **encode_options)
    enc_dict_roberta = tokenizer_roberta.encode_plus(sentence, **encode_options)

    inputID_bert.append(enc_dict_bert['input_ids'])
    inputID_roberta.append(enc_dict_roberta['input_ids']) # added encoded text as ID to the list

    attentionMask_bert.append(enc_dict_bert['attention_mask']) # added attention mask to the list
    attentionMask_roberta.append(enc_dict_roberta['attention_mask']) # that simply differs padding from non-padding

# convert lists to tensor

inputID_bert = torch.cat(inputID_bert,dim=0)
inputID_roberta = torch.cat(inputID_roberta,dim=0)
attentionMask_bert = torch.cat(attentionMask_bert,dim=0)
attentionMask_roberta = torch.cat(attentionMask_roberta,dim=0)

labels = torch.tensor(final_labels)
# Running sentence index 0..N-1, one per encoded sentence
sentenceID = torch.arange(len(sentences))

print('\nOriginal: \n', sentences[0])
print('\nToken IDs BERT: \n', inputID_bert[0])
print('\nToken IDs RoBERTa: \n', inputID_roberta[0])

### Reference : https://www.kaggle.com/jaskaransingh/fake-news-classification-bert-roberta --- for tokenization

### Create DataSet and DataLoader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Remove sentence id from TensorDataset after train validation test split
def sid_remove_from_tensordataset(datatensor):
    """Rebuild a dataset without its leading sentence-id field.

    Args:
        datatensor: An iterable dataset whose items are 4-tuples of tensors
            ``(sentence_id, input_ids, attention_mask, label)``.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask, label)`` holding
        the same rows in the same order.
    """
    id_rows, mask_rows, label_rows = [], [], []

    for record in datatensor:
        # record[0] is the sentence id, which is dropped
        _, token_ids, mask, target = record
        id_rows.append(token_ids.tolist())
        mask_rows.append(mask.tolist())
        label_rows.append(target.tolist())

    return TensorDataset(
        torch.tensor(id_rows),
        torch.tensor(mask_rows),
        torch.tensor(label_rows),
    )
    
# Get DataSetLoaders
def get_loaders(dataset, batch_size, b, seed=None):
    """Split a dataset 80/10/10 and wrap each part in a DataLoader.

    Args:
        dataset: The full dataset to split.
        batch_size: Batch size used by all three loaders.
        b: Pass ``1`` when the dataset items start with a sentence-id field
            that must be stripped after the split; any other value leaves the
            items unchanged.
        seed: Optional integer. When given, the random split is driven by a
            dedicated generator seeded with this value, so the same seed
            always yields the same train/validation/test membership. When
            ``None`` (the default) the global PyTorch RNG is used.

    Returns:
        ``(train_loader, valid_loader, test_loader)``. The training loader
        samples randomly; validation and test loaders iterate sequentially.
    """
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    # The test part takes the remainder so the three sizes always add up
    test_size = len(dataset) - train_size - val_size

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size], **split_kwargs
    )

    if b == 1:  # strip the sentence id (BERT dataset only)
        train_dataset = sid_remove_from_tensordataset(train_dataset)
        validation_dataset = sid_remove_from_tensordataset(validation_dataset)
        test_dataset = sid_remove_from_tensordataset(test_dataset)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
    valid_loader = DataLoader(validation_dataset, batch_size=batch_size, sampler=SequentialSampler(validation_dataset))
    test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=SequentialSampler(test_dataset))

    return train_loader, valid_loader, test_loader


### Train and Evaluate Model

In [None]:
def get_accuracy(y_pred, y_test):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        y_pred: 2-D array of per-class scores, one row per sample.
        y_test: Array of integer class labels.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(y_pred, axis=1).flatten()
    expected = y_test.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

In [None]:
def train_model(model, optimizer, train_loader):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: Sequence-classification model that, when called with
            ``labels`` and ``return_dict=False``, returns ``(loss, logits)``.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(train_loss, train_acc)``, each averaged over the number of batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in train_loader:
        # Move the whole batch to the active device
        iid, amask, labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        loss, outputs = model(
            iid,
            token_type_ids=None,
            attention_mask=amask,
            labels=labels,
            return_dict=False,
        )
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

        batch_logits = outputs.detach().cpu().numpy()
        batch_labels = labels.to('cpu').numpy()
        running_acc += get_accuracy(batch_logits, batch_labels)

    n_batches = len(train_loader)
    train_loss = running_loss / n_batches
    train_acc = running_acc / n_batches
    return train_loss, train_acc

In [None]:
def evaluate_model(model, loader):
    """Evaluate the model on a loader without updating its weights.

    Args:
        model: Sequence-classification model that, when called with
            ``labels`` and ``return_dict=False``, returns ``(loss, logits)``.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(loss, acc, y_pred, y_true)`` where ``loss`` and ``acc`` are averaged
        over the number of batches, ``y_pred`` is a list of per-batch logit
        arrays and ``y_true`` is a list of per-batch label arrays. Labels keep
        their original dtype, so downstream reports show the class ids as given.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():

        for iid, amask, labels in loader:
            iid, amask, labels = iid.to(device), amask.to(device), labels.to(device)

            loss, outputs = model(iid, token_type_ids=None, attention_mask=amask, labels=labels, return_dict=False)
            epoch_loss += loss.item()

            # Labels are used as-is: re-wrapping them with
            # torch.tensor(..., dtype=float32) only made a needless copy and
            # turned the class ids into floats.
            batch_logits = outputs.detach().cpu().numpy()
            batch_labels = labels.to('cpu').numpy()

            epoch_acc += get_accuracy(batch_logits, batch_labels)

            y_true.append(batch_labels)
            y_pred.append(batch_logits)

    loss = epoch_loss / len(loader)
    acc = epoch_acc / len(loader)
    return loss, acc, y_pred, y_true

### Run Model

In [None]:
def run_model(model, train_loader, validate_loader, test_loader, epochs, batch_size, optimizer):
    """Train for ``epochs`` epochs, then report test-set metrics.

    After every epoch the training and validation loss/accuracy are printed.
    Once training is finished the test loss/accuracy, confusion matrix and
    classification report are printed. ``batch_size`` is accepted but not
    used inside this function.
    """
    for epoch_number in range(1, epochs + 1):
        train_loss, train_acc = train_model(model, optimizer, train_loader)
        valid_loss, valid_acc = evaluate_model(model, validate_loader)[:2]

        print(f'Epoch: {epoch_number:02}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Final evaluation on the test set
    test_loss, test_acc, y_pred, y_true = evaluate_model(model, test_loader)
    print(f'\nTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Merge the per-batch arrays and turn logits into class predictions
    all_logits = np.concatenate(y_pred, axis=0)
    flat_pred = np.argmax(all_logits, axis=1).flatten()
    flat_true = np.concatenate(y_true, axis=0)

    print("\nConfusion Matrix : \n", confusion_matrix(flat_true, flat_pred))
    print("\nClassification Report : \n", classification_report(flat_true, flat_pred))
    
NUM_EPOCHS = 1
batch_size = 20

In [None]:
%%time

# RUN BERT Model
print("\n\nBERT Model :\n")
dataset_bert = TensorDataset(sentenceID, inputID_bert, attentionMask_bert, labels)
train_loader_bert, validation_loader_bert, test_loader_bert = get_loaders(dataset_bert, batch_size, 1)
# torch.optim.AdamW replaces the deprecated AdamW class shipped with
# transformers; weight_decay=0.0 matches that class's default.
optimizer_bert = torch.optim.AdamW(model_bert.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

run_model(model_bert, train_loader_bert, validation_loader_bert, test_loader_bert, NUM_EPOCHS, batch_size, optimizer_bert)

In [None]:
%%time

# RUN RoBERTa Model
print("\n\nRoBERTa Model :\n")
dataset_roberta = TensorDataset(inputID_roberta, attentionMask_roberta, labels)
train_loader_roberta, validation_loader_roberta, test_loader_roberta = get_loaders(dataset_roberta, batch_size, 0)
# torch.optim.AdamW replaces the deprecated AdamW class shipped with
# transformers; weight_decay=0.0 matches that class's default.
optimizer_roberta = torch.optim.AdamW(model_roberta.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

run_model(model_roberta, train_loader_roberta, validation_loader_roberta, test_loader_roberta, NUM_EPOCHS, batch_size, optimizer_roberta)

# Summary

Case 1 : 
Bag of Words : A set of vectors containing the counts of word occurrences; a simple and flexible approach to text classification.
Sklearn Library : SVM and RandomForest : Optimal Classifiers for Binary Classification.

Case 2 :
TF-IDF : It assigns a value to a term according to its importance in the text scaled by its importance across all the texts in the data. A popular approach in NLP.
XGBoost and LightGBM : Both are based on Gradient Boosted Decision Trees. In XGBoost, trees grow depth-wise and in LightGBM, trees grow leaf-wise. Both models had great success in enterprise applications and data science competitions. XGBoost is extremely powerful, though model training is faster in LightGBM.

Case 3 :
Pre-trained GloVe Embedding : GloVe = Global Vectors for word representation. It is an unsupervised algorithm developed at Stanford for generating word embeddings by aggregating a global word-word co-occurrence matrix from a corpus, which captures semantic relationships between words. Here, I have used the pre-trained Twitter word vectors (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB) from https://nlp.stanford.edu/projects/glove/
Tensorflow Framework : Bi-Directional LSTM : LSTM is classic model used for NLP tasks

Case 4 :
PyTorch Framework : HuggingFace transformers Library
BERT : Google's BERT (October-2018) is the transformer based method for NLP, outperforming state-of-the-art on several tasks such as QnA, language inference. It is a pre-trained deep Bi-directional Encoder Representation from transformer with Masked Language Modelling (MLM) and Next Sentence Prediction (NSP).
RoBERTa : Facebook's RoBERTa (July-2019), a robustly optimized BERT approach, advancing the state-of-the-art in self-supervised systems. To improve the training procedure, RoBERTa removes the Next Sentence Prediction (NSP) task from BERT's pre-training and introduces dynamic masking, so that the masked tokens change across training epochs.

The Most Preferred Model : From these 4 cases, currently the RoBERTa model is the most preferred one, as it is the optimized BERT approach.