# 6050 Project 



The GloVe embedding files used in this notebook can be downloaded here: http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# import stuff
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
#from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
import keras
import pickle
import random

In [None]:
# Mount Google Drive at /content/drive so the project files can be read.
# NOTE(review): the data paths used below are relative ('drive/MyDrive/...'), so they
# presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Train data: read the training CSV from the mounted Drive into `df`
path = 'drive/MyDrive/DS_6050_Project/train.csv'
df = pd.read_csv(path)

In [None]:
# Test data: read the labelled test workbook from the mounted Drive into `df2`
path2 = 'drive/MyDrive/DS_6050_Project/test_with_labels.xlsx'
df2 = pd.read_excel(path2)

In [None]:
# Stack the train and test frames into one corpus; it is re-split further down.
# The original row labels are kept (no ignore_index), so index values repeat in df3.
df3 = pd.concat([df, df2])

In [None]:
# How many training rows have no article text (these are the rows to be removed)
int(df['text'].isnull().sum())

39

All of the rows counted above were classified as fake news, so we want to make sure that deleting them won't cause a problem.

In [None]:
# Class balance of the training labels: rows labelled 1 first, then rows labelled 0
print(int((df['label'] == 1).sum()))
print(int((df['label'] == 0).sum()))

10413
10387


I don't think losing these 39 rows will be a problem so I am going to drop them.

In [None]:
# #Removing the null values for train
# df = df[df['text'].notnull()]

In [None]:
# #Removing the null values for test
# df2 = df2[df2['text'].notnull()]

In [None]:
# Drop every row of the combined (train + test) frame whose article text is missing
df3 = df3.dropna(subset=['text'])

In [None]:
# pull out the text and label, shuffles it and puts it in a list
def load_kagglefakenews(df):
    """Return the articles and labels of ``df`` as two jointly shuffled lists.

    Args:
        df: DataFrame with a 'text' column (articles) and a 'label' column.

    Returns:
        ``(texts, labels)``: two equal-length lists in which position ``i`` of
        each still refers to the same row. Both are empty when ``df`` has no rows.

    The order comes from ``random.shuffle``; seed the ``random`` module first
    if a reproducible order is needed.
    """
    pairs = list(zip(df['text'].values.tolist(), df['label'].values.tolist()))
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise.
        return [], []

    # Shuffle the (text, label) pairs together so the alignment is preserved.
    random.shuffle(pairs)
    texts, labels = zip(*pairs)

    # Plain lists are built directly: converting the articles to a NumPy string
    # array reserves a slot as wide as the longest article for every row.
    # str() keeps any non-string cell usable as text by the tokenizers.
    return [str(text) for text in texts], list(labels)

In [None]:
# train_data, train_labels = load_kagglefakenews(df)

In [None]:
# test_data, test_labels = load_kagglefakenews(df2)

In [None]:
train_data, train_labels = load_kagglefakenews(df3)

In [None]:
MAX_NB_WORDS=50000 #dictionary size
MAX_SEQUENCE_LENGTH=1500 #max word length of each individual article
EMBEDDING_DIM=300 #dimensionality of the embedding vector (50, 100, 200, 300)
# The backslash is written as '\\' so the literal holds the same characters as before
# without relying on the invalid escape sequence '\]'.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    """Fit the global ``tokenizer`` on ``texts`` and return padded model inputs.

    Args:
        texts: list of article strings.
        labels: list of class labels, one per article.

    Returns:
        ``(data, labels, word_index)``: the padded integer sequences (each of
        length ``MAX_SEQUENCE_LENGTH``), the one-hot encoded labels, and the
        tokenizer's word -> index mapping.

    Side effects:
        Fits the module-level ``tokenizer`` in place and pickles it to
        ``tokenizer.p`` in the working directory.
    """
    tokenizer.fit_on_texts(texts)
    # The context manager closes the file even if pickling fails.
    with open('tokenizer.p', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = tf.keras.utils.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # One column per distinct label value present in `labels`.
    labels = tf.keras.utils.to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index

#and run it
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)
#A, B, word_index2 = tokenize_trainingdata(test_data, test_labels)

Found 283796 unique tokens.


In [None]:
# #Use all of the train data
# train_data = X
# train_labels = Y

In [None]:
#Use all of the test data
# test_data = A
# test_labels = B

In [None]:
# Consecutive 70% / 20% / 10% slices of the padded corpus: train, test, validation
n_samples = len(X)
train_end = int(n_samples * 0.70)
test_end = int(n_samples * 0.90)

train_data, train_labels = X[:train_end], Y[:train_end]
test_data, test_labels = X[train_end:test_end], Y[train_end:test_end]
valid_data, valid_labels = X[test_end:], Y[test_end:]

In [None]:
# train_data = X[:int(len(X)*0.90)]
# train_labels = Y[:int(len(X)*0.90)]
# # test_data = X[int(len(X)*0.9):int(len(X)*0.95)]
# # test_labels = Y[int(len(X)*0.9):int(len(X)*0.95)]
# valid_data = X[int(len(X)*0.90):]
# valid_labels = Y[int(len(X)*0.90):]

In [None]:
def load_embeddings(word_index, embeddingsfile='drive/MyDrive/DS_6050_Project/glove.6B.%id.txt' %EMBEDDING_DIM):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Args:
        word_index: mapping of word -> integer index (as produced by the tokenizer).
        embeddingsfile: path to a space-separated file whose lines are a word
            followed by its vector components.

    Returns:
        Array of shape ``(len(word_index) + 1, EMBEDDING_DIM)`` whose row ``i``
        is the vector of the word with index ``i``; rows for words missing from
        the file (and row 0) stay all-zero.
    """
    embeddings_index = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(embeddingsfile, 'r', encoding='utf8') as glove_file:
        for line in glove_file:
            # each line is "<word> <v1> <v2> ..."; drop the trailing newline first
            values = line.rstrip().split(' ')
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
    
#and build the embedding matrix
embedding_matrix = load_embeddings(word_index)

Found 400000 word vectors.


In [None]:
embedding_matrix.shape

(250671, 300)

In [None]:
# 1-D CNN classifier over frozen pretrained embeddings, assembled layer by layer
model = keras.Sequential()

# Embedding table initialised from embedding_matrix and kept fixed (trainable=False)
model.add(keras.layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length = MAX_SEQUENCE_LENGTH, trainable=False))

# Three convolution stages with shrinking kernels; the first two are each followed by max-pooling
model.add(keras.layers.Conv1D(64, 5, activation='relu'))
model.add(keras.layers.MaxPool1D(5))
model.add(keras.layers.Conv1D(128, 3, activation='relu'))
model.add(keras.layers.MaxPool1D(5))
model.add(keras.layers.Conv1D(256, 2, activation='relu'))
model.add(keras.layers.GlobalAveragePooling1D())

# Dense head with dropout after each hidden layer
model.add(keras.layers.Dense(2048, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(512, activation='relu'))
model.add(keras.layers.Dropout(0.5))

# Two-way softmax output (alternative kept for reference: Dense(1, activation='sigmoid'))
model.add(keras.layers.Dense(2, activation='softmax'))


In [None]:
# The network consumes batches of padded token-id sequences, so the input shape is
# (batch, MAX_SEQUENCE_LENGTH); embedding_matrix.shape[1] is the embedding width,
# which is not an input dimension.
model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))

In [None]:
# values = []
# for v in test_labels:
#   values.append(v[1])


In [None]:
# print(values)

In [None]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1500, 300)         85139100  
                                                                 
 conv1d_12 (Conv1D)          (None, 1496, 64)          96064     
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 299, 64)          0         
 1D)                                                             
                                                                 
 conv1d_13 (Conv1D)          (None, 297, 128)          24704     
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 59, 128)          0         
 1D)                                                             
                                                                 
 conv1d_14 (Conv1D)          (None, 58, 256)          

In [None]:
# Labels are one-hot encoded with to_categorical and the network ends in a 2-unit
# softmax, so the matching loss is categorical cross-entropy; metrics is passed as a list.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the 70% slice for 25 epochs in batches of 64, passing the 10% validation
# slice as validation_data.
model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=25, batch_size=64)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f025e8b6750>

In [None]:
model.evaluate(test_data, test_labels)



[0.9029831290245056, 0.8576107621192932]

**TRYING TO USE BERT**

In [None]:
######### Trying something new https://towardsdatascience.com/a-beginners-guide-to-use-bert-for-the-first-time-2e99b8c5423

In [None]:
# !pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 59.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
# Load the pretrained 'distilbert-base-uncased' tokenizer.
# NOTE(review): this rebinds the global name `tokenizer`, which until this cell held the
# Keras Tokenizer that tokenize_trainingdata reads; calling that function again after
# this cell would use this object instead.
# DistilBertTokenizerFast is imported here but not referenced in the cells shown.
from transformers import DistilBertTokenizerFast, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Reset train_data/train_labels to the raw article strings and labels (they held the
# padded GloVe-model arrays until now). load_kagglefakenews shuffles again on every
# call, so the row order here is a new one, not the order used for the CNN split.
train_data, train_labels = load_kagglefakenews(df3)

In [None]:
# Consecutive 70% / 20% / 10% slices of the raw articles: train, test, validation.
# Each list is cut at positions derived from its own length.
text_train_end = int(len(train_data) * 0.7)
text_test_end = int(len(train_data) * 0.9)
label_train_end = int(len(train_labels) * 0.7)
label_test_end = int(len(train_labels) * 0.9)

train_data_2 = train_data[:text_train_end]
test_data = train_data[text_train_end:text_test_end]
valid_data = train_data[text_test_end:]

train_labels_2 = train_labels[:label_train_end]
test_labels = train_labels[label_train_end:label_test_end]
valid_labels = train_labels[label_test_end:]

In [None]:
# Encode the training articles with the DistilBERT tokenizer, with truncation and
# padding enabled (presumably so every encoded article has the same length — confirm).
train_encodings = tokenizer(train_data_2, truncation=True, padding=True)

In [None]:
# Encode the validation and test articles with the same tokenizer settings as the training set
val_encodings = tokenizer(valid_data, truncation=True, padding=True)
test_encodings = tokenizer(test_data, truncation=True, padding=True)

In [None]:
import torch

In [None]:
class FakeNewsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their class labels.

    ``encodings`` is a mapping of field name -> per-example sequences, and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then attach its label
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels as a FakeNewsDataset
train_dataset = FakeNewsDataset(train_encodings, train_labels_2)
val_dataset = FakeNewsDataset(val_encodings, valid_labels)
test_dataset = FakeNewsDataset(test_encodings, test_labels)

In [None]:
####### Trying new method using https://developer.habana.ai/tutorials/pytorch/distilbert-sequence-classification-with-imdb-reviews/

In [None]:
def compute_metrics(eval_pred):
    """Return the accuracy of the predictions in ``eval_pred``.

    Args:
        eval_pred: a ``(logits, labels)`` pair; ``logits`` has one row of class
            scores per example and ``labels`` holds the true class ids.

    Returns:
        ``{"accuracy": value}`` where value is the fraction of examples whose
        highest-scoring class equals the label.

    Accuracy is computed with NumPy directly, so this function does not depend
    on a global scorer object having been created before it is called.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# Fine-tune DistilBERT for sequence classification with the Hugging Face Trainer.
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# One epoch, batch size 16 for both training and evaluation; checkpoints go to "test-2".
# The evaluation-during-training options are left commented out.
args = TrainingArguments(
    output_dir="test-2",
    #evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model="accuracy"
)

# NOTE(review): this rebinds the global name `model`, which previously held the Keras CNN.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset (the 10% validation split)
    compute_metrics=compute_metrics      # metric callback defined in an earlier cell
)

trainer.train()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Step,Training Loss
500,0.3387
1000,0.2631


Saving model checkpoint to test-2/checkpoint-500
Configuration saved in test-2/checkpoint-500/config.json
Model weights saved in test-2/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test-2/checkpoint-1000
Configuration saved in test-2/checkpoint-1000/config.json
Model weights saved in test-2/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1136, training_loss=0.29337386346199146, metrics={'train_runtime': 492.4006, 'train_samples_per_second': 36.891, 'train_steps_per_second': 2.307, 'total_flos': 2406270296586240.0, 'train_loss': 0.29337386346199146, 'epoch': 1.0})

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 4.1 MB/s 
[?25hCollecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 4.9 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 38.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 43.5 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 58.2 MB/s 
Collecting multiprocess
  Downloading multi

In [None]:
# Load the "accuracy" scorer from the datasets package and bind it to the global name `metric`.
# NOTE(review): this cell comes after the training cell, so anything that reads `metric`
# only works once this cell has been run.
from datasets import load_metric
metric = load_metric("accuracy")

  


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Validation accuracy: evaluate() is called without a dataset, so it presumably scores
# the eval_dataset given to the Trainer (val_dataset) — confirm.
metrics = trainer.evaluate()
# Record the number of validation examples alongside the metrics
metrics["eval_samples"] = len(valid_data)
# Print the metrics and save them under the "eval" split name
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** Running Evaluation *****
  Num examples = 2596
  Batch size = 16


***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.9237
  eval_loss               =     0.2413
  eval_runtime            = 0:00:23.22
  eval_samples            =       2596
  eval_samples_per_second =    111.787
  eval_steps_per_second   =      7.019


In [None]:
# Test accuracy: score the held-out test split explicitly
metrics = trainer.evaluate(test_dataset)
# Record the number of test examples alongside the metrics
metrics["eval_samples"] = len(test_dataset)
# Print the metrics and save them under the "test" split name
trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)

***** Running Evaluation *****
  Num examples = 5190
  Batch size = 16


***** test metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.9162
  eval_loss               =     0.2535
  eval_runtime            = 0:00:44.98
  eval_samples            =       5190
  eval_samples_per_second =    115.365
  eval_steps_per_second   =      7.224


# **RNN MODEL**

In [None]:
# Keep only the rows that have a title.
# `df_test` is not assigned anywhere earlier in the notebook; the labelled test frame
# loaded from test_with_labels.xlsx is `df2`, so df_test is derived from it here.
df = df[df['title'].notnull()]
df_test = df2[df2['title'].notnull()]

In [None]:
# Renumber the rows 0..n-1 after filtering, because the loops below read titles by
# integer label (df['title'][i]).
df= df.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Read the stop-word list once and keep it as a set: calling stopwords.words()
# inside the comprehension re-reads the list for every single word.
english_stopwords = set(stopwords.words('english'))

# Lower-case each training title, drop stop words, stem what is left and re-join.
# Iterating over the column itself does not depend on the index being 0..n-1.
corpus = []
for raw_title in df['title']:
    # str() guards against a non-string title cell
    title_words = str(raw_title).lower().split()
    stemmed_words = [stemmer.stem(word) for word in title_words if word not in english_stopwords]
    corpus.append(' '.join(stemmed_words))

In [None]:
# Clean the test titles the same way as the training titles: lower-case, drop stop
# words, stem and re-join. The stop-word list is read once into a set rather than
# being re-read for every word.
test_stopwords = set(stopwords.words('english'))

test_corpus = []
for raw_title in df_test['title']:
    # str() guards against a non-string title cell (the test frame comes from a spreadsheet)
    title_words = str(raw_title).lower().split()
    stemmed_words = [stemmer.stem(word) for word in title_words if word not in test_stopwords]
    test_corpus.append(' '.join(stemmed_words))

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Encode every cleaned training title as a list of integer word ids drawn from a
# vocabulary of voc_size slots.
# NOTE(review): presumably one_hot assigns ids by hashing, so with only 500 slots
# different words can share an id, and ids may differ between sessions — confirm.
voc_size = 500
onehot_corpus = [one_hot(words, voc_size) for words in corpus]

In [None]:
onehot_test = [one_hot(words, voc_size) for words in test_corpus]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
sent_length = 20

In [None]:
# Pad/truncate every encoded training title to sent_length ids (padding added at the
# end, per padding="post") and take the labels as an array.
X_final=np.array(pad_sequences(onehot_corpus, padding="post", maxlen=sent_length))
y_final=np.array(df['label'])

In [None]:
# Same padding for the encoded test titles, with the test labels as an array
X_final_test=np.array(pad_sequences(onehot_test, padding="post", maxlen=sent_length))
y_final_test=np.array(df_test['label'])

In [None]:
# Train on the first 95% of the titles and validate on the remaining 5%.
# The validation slice is cut from the rows *after* the split point, so it no longer
# overlaps the training rows (previously it was a subset of them).
split_at = int(len(X_final) * 0.95)
x_valid, y_valid = X_final[split_at:], y_final[split_at:]
X_final, y_final = X_final[:split_at], y_final[:split_at]

In [None]:
import keras
# LSTM classifier over the hashed title ids.
# The embedding's vocabulary size and input length come from voc_size and sent_length,
# the same constants used to encode and pad the titles, so they cannot drift apart.

embedding_vector_features = 400  # width of each learned word vector
model = keras.Sequential([
    keras.layers.Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    keras.layers.Dropout(0.4),
    keras.layers.LSTM(100),
    keras.layers.Dropout(0.4),
    # single sigmoid unit: the labels are a 0/1 array, matching binary cross-entropy
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 400)           200000    
                                                                 
 dropout (Dropout)           (None, 20, 400)           0         
                                                                 
 lstm (LSTM)                 (None, 100)               200400    
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 400,501
Trainable params: 400,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_final,y_final,validation_data=(x_valid,y_valid),epochs=40,batch_size=64)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fab61500450>

In [None]:
model.evaluate(X_final_test,y_final_test)



[3.6327438354492188, 0.6298342347145081]