# FakeNewsSpacy.py

# -*- coding: utf-8 -*-
"""FakeNewsSpacy.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jQsmJLbo484eMNtG969dlGRd-7wavNFC
"""

# !rm -rf data
# !mkdir data
# !wget https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/fnews/v0.1/train.zip
# !wget https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/fnews/v0.1/val.zip
# !wget https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/fnews/v0.1/test.zip
# !unzip train.zip
# !unzip val.zip
# !unzip test.zip
# !mv train.csv data/train.csv
# !mv val.csv data/val.csv
# !mv test.csv data/test.csv

# !pip install -U spacy[cuda92]

import pandas as pd
import numpy as np
import re
from gensim.parsing import remove_stopwords
import spacy
from spacy.util import minibatch
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, roc_curve, auc
import random

#import nltk
#nltk.download('wordnet')
#from nltk.stem import WordNetLemmatizer

# NOTE - lemmatization decreases F1 score by large amount

# Locations of the training and validation CSV files.
train_path = "data/FakeNewsData/train.csv"
val_path = "data/FakeNewsData/val.csv"

# Load both splits into pandas DataFrames.
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# A bare expression only renders output inside a notebook cell; print the
# previews explicitly so they also show up when this file runs as a script.
print(train_df.head())

print(val_df.head())

# Class balance of the training labels.
print(train_df['label'].value_counts())

def clean_data(text):
    """Normalize one raw text before it is fed to the text classifier.

    Steps, in order: strip ``@mentions``, drop ``&amp;`` entities, remove a
    fixed set of punctuation characters, drop non-ASCII characters,
    lowercase, discard words of three characters or fewer, and finally
    remove stop-words with gensim's ``remove_stopwords``.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text.

    Returns:
        The cleaned, lowercased text, or ``""`` for a missing value.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r'@\w*', '', text)            # remove @user
    text = re.sub(r'&amp;', '', text)           # remove &amp;
    text = re.sub(r'[?!.;:,#@-]', '', text)     # remove special characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)   # remove non-ASCII characters
    # NOTE: a former `text.replace("[^A-Za-z#]", "")` step was dropped here.
    # str.replace treats its argument literally (not as a regex), and that
    # literal contains '#' and '-', both already stripped above, so the call
    # could never match anything.
    text = text.lower()  # make everything lowercase for uniformity
    # remove short words of length 3 or lower (e.g. hmm, oh); they add no value
    text = " ".join(w for w in text.split() if len(w) > 3)
    # remove stop-words, e.g. 'we', 'our', 'ours', 'ourselves', 'just', 'should'
    text = remove_stopwords(text)
    return text

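# split each tweet into words, then lemmatize each word and rejoin them to a sentence
# NOTE(review): as written, "".join below glues the lemmatized words together
# without spaces; " ".join is probably what was intended. Re-check the F1 note
# near the imports before re-enabling this.
# def lemmatize_text(text):
#     words = text.split()
#     lemm = WordNetLemmatizer()
#     lemmatized_words = [lemm.lemmatize(word) for word in words]    
#     return "".join(lemmatized_words)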


# drop_duplicates() returns a new DataFrame rather than modifying in place;
# assign the result back so the duplicate rows are actually removed.
train_df = train_df.drop_duplicates()
# Report how many missing values each column contains.
print(train_df.isna().sum())

texts_train = train_df['text']
labels_train = train_df['label']

# clean the texts
texts_train = texts_train.apply(clean_data)
# texts_train = texts_train.apply(lambda x : lemmatize_text(x))

# Ask spaCy to use the GPU when possible.
spacy.prefer_gpu()

# Start from an empty English pipeline.
nlp = spacy.blank("en")

# Build a TextCategorizer with mutually exclusive classes, using the
# "simple_cnn" architecture.
textcat = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "simple_cnn"},
)

# Register the two classes: "real" first, then "fake".
textcat.add_label("real")
textcat.add_label("fake")

# Append the categorizer as the final pipeline component.
nlp.add_pipe(textcat, last=True)

# Build the (text, annotation) pairs used for training.
texts_train = texts_train.values
labels_cats = [
    {'cats': {"real": label == 'real', "fake": label == 'fake'}}
    for label in labels_train
]

data = list(zip(texts_train, labels_cats))

# Show the first training example as a sanity check.
print(data[:1])

# ***************  train the model ********************************************
# spacy.prefer_gpu()

print("Begining training.....")

# Fix the random seed before initialising the model weights.
spacy.util.fix_random_seed(1)

optimizer = nlp.begin_training()

# Two passes over the data, reshuffled each pass, in minibatches of 500.
for epoch in range(2):
    losses = {}
    random.shuffle(data)
    for batch in minibatch(data, size=500):
        # Split the batch of pairs into parallel tuples of texts and labels.
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    # Loss accumulated over this epoch.
    print(losses)

# *****************  evaluate on validation set  ************************************

print("Evaluating on validation set.....")


texts_val = val_df['text']

labels_val_org = val_df['label']

# Encode gold labels as integers: real -> 0, anything else (fake) -> 1.
# This script assumes the score columns returned by the categorizer follow
# the order in which the labels were added (real, then fake).
labels_val = [0 if lbl == 'real' else 1 for lbl in labels_val_org]

# clean the texts
texts_val = texts_val.apply(clean_data)
# texts_val = texts_val.apply(lambda x : lemmatize_text(x))

# convert validation texts to a list of nlp docs
val_docs = list(nlp.pipe(texts_val))

# get the text-categorizer pipe
textcat = nlp.get_pipe('textcat')

scores, _ = textcat.predict(val_docs)

# Hard class decision per document (index of the highest-scoring label).
predicted_classes = scores.argmax(axis=1)

# Continuous score of the positive ("fake") class, i.e. column 1.
# Ranking metrics (ROC AUC, ROC curve, average precision) need these scores;
# feeding them the thresholded 0/1 predictions collapses the curve to a
# single operating point.
fake_scores = scores[:, 1]

correct_predictions = predicted_classes==labels_val

accuracy = correct_predictions.mean()

print("Accuracy=", accuracy)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F1 score=", f1_score(labels_val, predicted_classes))

print("ROC AUC score = ", roc_auc_score(labels_val, fake_scores))

false_positive_rate, true_positive_rate, thresholds = roc_curve(labels_val, fake_scores)
print("auc=", auc(false_positive_rate, true_positive_rate))
print("average precision score=", average_precision_score(labels_val, fake_scores))

#%%
# test set

# Drop references to training/validation objects so their memory can be reclaimed.
train_df = val_df = None
texts_train = labels_train = labels_cats = data = None
optimizer = None
texts_val = labels_val = labels_val_org = val_docs = None
predicted_classes = correct_predictions = None

print("Test set.....")

test_path = "data/FakeNewsData/test.csv"
test_df = pd.read_csv(test_path)

print(test_df.head())

# Apply the same cleaning that was used for the training texts.
texts_test = test_df['text'].apply(clean_data)

# Run the cleaned test texts through the pipeline to get docs.
test_docs = list(nlp.pipe(texts_test))

# Fetch the text-categorizer component.
textcat = nlp.get_pipe('textcat')
#%%
scores = predicted_classes = None

scores, _ = textcat.predict(test_docs)
#%%
# Index of the highest-scoring label for each doc.
predicted_classes = scores.argmax(axis=1)

# Map class index 0 to 'real' and everything else to 'fake', then write the
# result as a single-column CSV with the header 'label'.
submission = pd.DataFrame(
    ['real' if class_idx == 0 else 'fake' for class_idx in predicted_classes]
)
submission.to_csv('submission.csv', header=['label'], index=False)

# download the generated csv file
# try:
#   from google.colab import files
#   files.download('submission.csv')
# except ImportError as e:
#   print("Only for Colab")