# 6050 Project 



The GloVe embedding files used in this notebook can be downloaded from: http://nlp.stanford.edu/data/glove.6B.zip

In [1]:
# import stuff
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
#from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
import keras
import pickle
import random

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = 'drive/MyDrive/DS_6050_Project/train.csv'
df = pd.read_csv(path)

In [5]:
#Number of Null Values that need to be deleted
len(df[df['text'].isnull()])

39

All of the rows with missing text counted above were classified as fake news, so I want to make sure that deleting them won't cause a problem.

In [6]:
#Looking at the distribution of data
print(len(df[df['label']==1]))
print(len(df[df['label']==0]))

10413
10387


The two classes are nearly balanced (10,413 vs. 10,387 rows), so I don't think losing these 39 rows will be a problem; I am going to drop them.

In [7]:
#Removing the null values
# Keep only the rows that actually have article text.
df = df.dropna(subset=['text'])

In [8]:
# pull out the text and label, shuffles it and puts it in a list
def load_kagglefakenews(df, seed=None):
    """Extract the articles and labels from ``df`` and shuffle them together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column (articles) and a 'label' column (labels).
        The frame itself is not modified.
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        resulting order is reproducible. When None (the default) the
        module-level ``random`` state is used, exactly as before.

    Returns
    -------
    tuple of (list, list)
        ``(texts, labels)`` as two plain Python lists of equal length, where
        ``labels[i]`` still belongs to ``texts[i]`` after shuffling. Two empty
        lists are returned for an empty frame.
    """
    texts = df['text'].values.tolist()  # 'text' column contains articles
    labels = df['label'].values.tolist()  # 'label' column contains labels

    # Shuffle articles and labels as pairs so they stay aligned.
    combo = list(zip(texts, labels))
    if seed is None:
        random.shuffle(combo)
    else:
        random.Random(seed).shuffle(combo)

    # zip(*[]) yields nothing to unpack, so an empty frame is handled explicitly.
    if not combo:
        return [], []

    shuffled_texts, shuffled_labels = zip(*combo)
    # Plain list() instead of a NumPy round trip: np.asarray on the articles
    # would allocate a fixed-width unicode array sized by the longest article.
    return list(shuffled_texts), list(shuffled_labels)

In [9]:
train_data, train_labels = load_kagglefakenews(df)

In [10]:
# Tokenizer / embedding configuration.
MAX_NB_WORDS = 50000  # dictionary size
MAX_SEQUENCE_LENGTH = 1500  # max word length of each individual article
EMBEDDING_DIM = 300  # dimensionality of the embedding vector (50, 100, 200, 300)
# Raw string literal: the filter set contains a literal backslash, and '\]' inside
# a normal string literal is an invalid escape sequence (newer Python versions warn
# about it). The resulting filter characters are unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    """Fit the module-level ``tokenizer`` on ``texts`` and encode texts and labels.

    Parameters
    ----------
    texts : list of str
        Articles to fit the tokenizer on and convert to integer sequences.
    labels : list
        Class labels, one per article.

    Returns
    -------
    data
        Integer sequences padded/truncated to ``MAX_SEQUENCE_LENGTH``.
    labels
        One-hot encoded labels with one column per distinct label value.
    word_index : dict
        Word -> integer index mapping learned by the tokenizer.

    Side effects: fits the global ``tokenizer`` in place and pickles it to
    'tokenizer.p' in the current working directory.
    """
    tokenizer.fit_on_texts(texts)
    # Context manager so the pickle file is flushed and closed even if dumping fails.
    with open('tokenizer.p', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = tf.keras.utils.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Number of classes is inferred from the distinct label values present.
    labels = tf.keras.utils.to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index

#and run it
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)

Found 250670 unique tokens.


In [11]:
# 90% train / 5% test / 5% validation, cutting X and Y at the same row indices.
train_data, test_data, valid_data = np.split(X, [int(len(X)*0.9), int(len(X)*0.95)])
train_labels, test_labels, valid_labels = np.split(Y, [int(len(X)*0.9), int(len(X)*0.95)])

In [12]:
def load_embeddings(word_index, embeddingsfile='drive/MyDrive/DS_6050_Project/glove.6B.%id.txt' %EMBEDDING_DIM):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Parameters
    ----------
    word_index : dict
        Word -> integer index mapping; row ``i`` of the result holds the
        vector of the word whose index is ``i``.
    embeddingsfile : str
        Path to the embeddings file. Each line is expected to hold a word
        followed by its space-separated vector components.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, EMBEDDING_DIM)``. Rows for
        words missing from the embeddings file stay all-zero.
    """
    embeddings_index = {}
    # Context manager so the file is closed even if a line fails to parse.
    with open(embeddingsfile, 'r', encoding='utf8') as f:
        for line in f:
            #here we parse the data from the file
            values = line.split(' ') #split the line by spaces
            word = values[0] #each line starts with the word
            coefs = np.asarray(values[1:], dtype='float32') #the rest of the line is the vector
            embeddings_index[word] = coefs #put into embedding dictionary

    print('Found %s word vectors.' % len(embeddings_index))

    # One extra row because the matrix is indexed directly by the values in word_index.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
    
#and build the embedding matrix
embedding_matrix = load_embeddings(word_index)

Found 400000 word vectors.


In [13]:
embedding_matrix.shape

(250671, 300)

In [14]:
# 1-D convolutional text classifier on top of frozen pre-trained word embeddings.
# Output shapes noted below are the ones reported by model.summary() in this notebook.
model = keras.Sequential([

     # Embedding initialised from embedding_matrix and kept fixed (trainable=False) -> (None, 1500, 300)
     keras.layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length = MAX_SEQUENCE_LENGTH, trainable=False),
     # Three Conv1D stages with growing filter counts (64 -> 128 -> 256) and shrinking kernels (5 -> 3 -> 2)
     keras.layers.Conv1D(64, 5, activation='relu'),      # -> (None, 1496, 64)
     keras.layers.MaxPool1D(5),                          # -> (None, 299, 64)
     keras.layers.Conv1D(128, 3, activation='relu'), 
     keras.layers.MaxPool1D(5),                          # -> (None, 59, 128)
     keras.layers.Conv1D(256, 2, activation='relu'),     # -> (None, 58, 256)
     # Average over the remaining sequence positions to get one vector per article
     keras.layers.GlobalAveragePooling1D(),
     # Dense classification head with dropout after each hidden layer
     keras.layers.Dense(2048, activation='relu'),
     keras.layers.Dropout(0.5),
     keras.layers.Dense(512, activation='relu'),
     keras.layers.Dropout(0.5),
     # Two softmax outputs, one per class (labels are one-hot encoded)
     keras.layers.Dense(2, activation='softmax')
     ])


In [15]:
# The network consumes integer token ids of shape (batch, MAX_SEQUENCE_LENGTH).
# embedding_matrix.shape[1] is the embedding width (the Embedding layer's output
# size), not the model's input shape.
model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         75201300  
                                                                 
 conv1d (Conv1D)             (None, 1496, 64)          96064     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 299, 64)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 297, 128)          24704     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 59, 128)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 58, 256)           6

In [17]:
# The output layer is a 2-unit softmax and the labels are one-hot encoded, so the
# matching loss is categorical cross-entropy. Metrics are passed as a list.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Train for 25 epochs in mini-batches of 64, monitoring the held-out validation split.
# The cell's value (a keras History object) is displayed as the cell output.
model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=25, batch_size=64)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f3e1db53810>

In [19]:
model.evaluate(test_data, test_labels)



[0.19483265280723572, 0.9720616340637207]

**TRYING TO USE BERT**

In [None]:
######### Trying something new https://towardsdatascience.com/a-beginners-guide-to-use-bert-for-the-first-time-2e99b8c5423

In [21]:
#!pip install transformers

In [22]:
from transformers import DistilBertTokenizerFast, AutoTokenizer
# NOTE: this rebinds the global name `tokenizer`, which until now held the Keras
# Tokenizer that tokenize_trainingdata() reads. Re-running that function after this
# cell would pick up this object instead.
# NOTE(review): DistilBertTokenizerFast is not referenced in the cells shown here.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
#Resetting the train_data so they are no longer GloVe Encodings
train_data, train_labels = load_kagglefakenews(df)

In [30]:
# 90% train / 5% test / 5% validation split of the raw (untokenized) articles.
train_data_2, test_data, valid_data = (
    train_data[:int(len(train_data)*0.9)],
    train_data[int(len(train_data)*0.9):int(len(train_data)*0.95)],
    train_data[int(len(train_data)*0.95):],
)
# The labels are cut at the matching positions.
train_labels_2, test_labels, valid_labels = (
    train_labels[:int(len(train_labels)*0.9)],
    train_labels[int(len(train_labels)*0.9):int(len(train_labels)*0.95)],
    train_labels[int(len(train_labels)*0.95):],
)

In [31]:
train_encodings = tokenizer(train_data_2, truncation=True, padding=True)

In [32]:
val_encodings = tokenizer(valid_data, truncation=True, padding=True)
test_encodings = tokenizer(test_data, truncation=True, padding=True)

In [33]:
import torch

In [34]:
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then
        # attach the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = FakeNewsDataset(train_encodings, train_labels_2)
val_dataset = FakeNewsDataset(val_encodings, valid_labels)
test_dataset = FakeNewsDataset(test_encodings, test_labels)

In [None]:
####### Trying new method using https://developer.habana.ai/tutorials/pytorch/distilbert-sequence-classification-with-imdb-reviews/

In [35]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Parameters
    ----------
    eval_pred
        Object that unpacks into ``(logits, labels)``: per-class scores with
        the classes on the last axis, and the integer reference labels.

    Returns
    -------
    dict
        ``{"accuracy": float}``, the fraction of examples whose arg-max
        prediction equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so this function does not depend on a
    # global metric object that has to be created in some other cell first.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [36]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Fine-tuning configuration: a single epoch with batch size 16 for both training
# and evaluation. Checkpoints are written under output_dir.
args = TrainingArguments(
    output_dir="test-2",
    #evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model="accuracy"
)

# NOTE: this rebinds the global name `model` (previously the Keras CNN) to the
# DistilBERT sequence classifier.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics      # metric function applied to evaluation predictions
)

# Runs the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

Step,Training Loss
500,0.1477
1000,0.0231


Saving model checkpoint to test-2/checkpoint-500
Configuration saved in test-2/checkpoint-500/config.json
Model weights saved in test-2/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test-2/checkpoint-1000
Configuration saved in test-2/checkpoint-1000/config.json
Model weights saved in test-2/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1168, training_loss=0.07778831418246439, metrics={'train_runtime': 507.5915, 'train_samples_per_second': 36.809, 'train_steps_per_second': 2.301, 'total_flos': 2475020876488704.0, 'train_loss': 0.07778831418246439, 'epoch': 1.0})

In [38]:
#!pip install datasets

In [39]:
from datasets import load_metric
metric = load_metric("accuracy")

  


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [40]:
#Validation Accuracy
# With no argument, evaluate() uses the eval_dataset given to the Trainer (val_dataset).
metrics = trainer.evaluate()
# Count the dataset object that was actually evaluated, as the test cell does.
metrics["eval_samples"] = len(val_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** Running Evaluation *****
  Num examples = 1039
  Batch size = 16


***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.9971
  eval_loss               =     0.0102
  eval_runtime            = 0:00:09.21
  eval_samples            =       1039
  eval_samples_per_second =    112.707
  eval_steps_per_second   =      7.051


In [41]:
#Test Accuracy
# metric_key_prefix="test" makes the reported and saved keys read test_* rather
# than the default eval_*, so test results are not mislabelled as evaluation results.
metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
metrics["test_samples"] = len(test_dataset)
trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)

***** Running Evaluation *****
  Num examples = 1038
  Batch size = 16


***** test metrics *****
  epoch                   =        1.0
  eval_accuracy           =      0.999
  eval_loss               =     0.0036
  eval_runtime            = 0:00:09.08
  eval_samples            =       1038
  eval_samples_per_second =    114.214
  eval_steps_per_second   =      7.152
