<a href="https://colab.research.google.com/github/stavIatrop/Fake-News-Detection/blob/master/one_layer_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import and split the data

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Load the PolitiFact dataset. `sep` is passed by keyword: pandas 2.0 made every
# read_csv argument after the path keyword-only, so a positional "," fails there.
data = pd.read_csv("drive/My Drive/datasets/politifact.csv", sep=",")
data_labels = data['label'].values
data = data['text'].values

# Stratified 80/20 shuffle split with a fixed seed so the split is reproducible.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# All five splits are generated, but every iteration overwrites the previous
# assignment, so only the last split is kept for the rest of the notebook.
for train_index, test_index in sss.split(data, data_labels):
    X_train, X_test = data[train_index], data[test_index]
    Y_train, Y_test = data_labels[train_index], data_labels[test_index]

print("Train shape : ",X_train.shape)
print("Test shape : ",X_test.shape)

Train shape :  (559,)
Test shape :  (140,)


Preprocess the data.

In [0]:
import re

def remove_non_ascii(X):
  """Replace every run of non-ASCII characters in each text of X with a space.

  Each text is split on whitespace, the substitution is applied per word and
  the words are re-joined with single spaces (so whitespace is normalised).

  Args:
    X: mutable, indexable sequence of strings (e.g. a list or an object array).
      It is modified in place.

  Returns:
    The same X object, with every entry rewritten. Empty or whitespace-only
    texts become the empty string.
  """
  # Compiled once: matches one or more characters outside the ASCII range.
  non_ascii = re.compile('[^\u0000-\u007F]+', re.UNICODE)

  for i in range(len(X)):
    # Building the joined text directly from this row's words guarantees that a
    # row without words yields '' instead of reusing the previous row's text.
    X[i] = ' '.join(non_ascii.sub(" ", word) for word in X[i].split())
  return X

def toLowerCase(X):
  """Lower-case every text in X, word by word.

  Each text is split on whitespace, every word is lower-cased and the words
  are re-joined with single spaces.

  Args:
    X: mutable, indexable sequence of strings. It is modified in place.

  Returns:
    The same X object. Empty or whitespace-only texts become the empty string.
  """
  for i in range(len(X)):
    # Joined per row, so a row without words gives '' rather than inheriting
    # the text computed for the previous row.
    X[i] = ' '.join(word.lower() for word in X[i].split())

  return X

import string

def _split_punctuation(word, inner_table):
  """Split one whitespace-free word into punctuation and word tokens, in reading order."""
  # Count the leading punctuation characters.
  start = 0
  while start < len(word) and word[start] in string.punctuation:
    start += 1

  # Count the trailing punctuation characters (never crossing `start`).
  end = len(word)
  while end > start and word[end - 1] in string.punctuation:
    end -= 1

  tokens = list(word[:start])
  if start < end:
    # Punctuation left inside the word (e.g. the apostrophe in "don't") is
    # turned into a space.
    tokens.append(word[start:end].translate(inner_table))
  tokens.extend(word[end:])
  return tokens


def handle_punctuation(X):
  """Separate punctuation from words in every text of X.

  For each whitespace-delimited word, every leading and trailing punctuation
  character becomes its own token, and punctuation inside the word is replaced
  by a space. Tokens are emitted in reading order, so "hello," becomes
  "hello ," and "(don't)" becomes "( don t )".

  Args:
    X: mutable, indexable sequence of strings. It is modified in place.

  Returns:
    The same X object. Empty or whitespace-only texts become the empty string.
  """
  # Built once: maps every punctuation character to a space.
  inner_table = str.maketrans(dict.fromkeys(string.punctuation, " "))

  for i in range(len(X)):
    tokens = []
    for word in X[i].split():
      tokens.extend(_split_punctuation(word, inner_table))
    # Joined per row so a row without words cannot reuse a previous row's text.
    X[i] = ' '.join(tokens)

  return X

def clean_numbers(X):
  """Mask multi-digit numbers in every text of X with '#' placeholders.

  Runs of four or more digits become ' ### ', runs of exactly three digits
  become ' ## ' and pairs of digits become ' # '. Single digits are kept.

  Args:
    X: mutable, indexable sequence of strings. It is modified in place.

  Returns:
    The same X object.
  """
  # Applied in this order so that the longest digit runs are masked first.
  masks = (
    ('[0-9]{4,}', ' ### '),
    ('[0-9]{3}', ' ## '),
    ('[0-9]{2}', ' # '),
  )
  for i, text in enumerate(X):
    for pattern, placeholder in masks:
      text = re.sub(pattern, placeholder, text)
    X[i] = text
  return X

def preprocess_text(X):
  """Run the full text-cleaning pipeline over X and return the result.

  The steps run in this order: strip non-ASCII characters, lower-case,
  separate punctuation, mask numbers. Each step receives the output of the
  previous one.
  """
  pipeline = (
    remove_non_ascii,    # remove non ascii characters
    toLowerCase,         # transform to lower case
    handle_punctuation,  # handle punctuation
    clean_numbers,       # clean numbers
  )
  for step in pipeline:
    X = step(X)
  return X

In [0]:
# Clean the train and test texts with the same pipeline.
# The preprocessing helpers assign back into X[i], so the arrays passed in are
# modified in place as well as returned.
X_train = preprocess_text(X_train)
X_test = preprocess_text(X_test)

Load GloVe embeddings and create word index

In [0]:
import numpy as np
def load_glove_index(embedding_file="/content/drive/My Drive/GloVe/glove.6B.50d.txt"):
    """Parse a GloVe-style text file into embedding and index lookups.

    Every non-empty line is expected to be a token followed by its vector
    components, separated by single spaces.

    Args:
        embedding_file: path of the embeddings text file. Defaults to the
            50-dimensional GloVe file on the mounted Drive.

    Returns:
        A tuple (embeddings_index, word2idx):
        embeddings_index maps each token to a float64 NumPy vector;
        word2idx maps each token to the zero-based number of its line.
    """
    word2idx = dict()
    embeddings_index = dict()
    idx = 0
    # The context manager closes the file even if parsing fails; the encoding
    # is explicit so the result does not depend on the platform default.
    with open(embedding_file, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            parts = line.split(" ")
            word = parts[0]
            word2idx[word] = idx
            idx += 1
            # The builtin float (float64) replaces the `np.float` alias, which
            # was removed from NumPy.
            embeddings_index[word] = np.asarray(parts[1:], dtype=float)
    return embeddings_index, word2idx

In [0]:
# Parse the GloVe file once; keeps both the word -> vector map and the
# word -> line-index map for the cells below.
embeddings_index, word2idx = load_glove_index()

In [35]:
embeddings_index[list(word2idx)[-1]]  # last token is the unknown token

array([-0.12920076, -0.28866628, -0.01224866, -0.05676644, -0.20210965,
       -0.08389011,  0.33359843,  0.16045167,  0.03867431,  0.17833012,
        0.04696583, -0.00285802,  0.29099807,  0.04613704, -0.20923874,
       -0.06613114, -0.06822549,  0.07665912,  0.3134014 ,  0.17848536,
       -0.1225775 , -0.09916983, -0.07495987,  0.06413227,  0.14441176,
        0.60894334,  0.17463093,  0.05335403, -0.01273871,  0.03474107,
       -0.8123879 , -0.04688699,  0.20193407,  0.2031118 , -0.03935686,
        0.06967544, -0.01553638, -0.03405238, -0.06528071,  0.12250231,
        0.13991883, -0.17446303, -0.08011883,  0.0849521 , -0.01041659,
       -0.13705009,  0.20127155,  0.10069407,  0.00653003,  0.01685157])

Build the training vocab.

In [0]:
def build_vocab(X):
  """Count how often each whitespace-delimited token occurs across all texts in X.

  Returns a plain dict mapping token -> occurrence count, with keys in order
  of first appearance.
  """
  vocab = {}
  for row in X:
    for token in row.split():
      vocab[token] = vocab.get(token, 0) + 1
  return vocab

In [8]:
# Token counts over the preprocessed training texts.
vocab = build_vocab(X_train)
# Show the first ten entries (dict order is first-seen order).
print(dict(list(vocab.items())[:10]))

{'george': 240, '.': 51896, 'w': 68, 'bush': 399, 'has': 3036, 'lobbed': 3, 'thinly': 3, 'veiled': 5, 'critiques': 2, 'of': 21340}


Build the embedding weight matrix, whose row indices follow the word order, using either only the training-vocabulary words or all of the pretrained embeddings.

In [0]:
import numpy as np

def build_embedding_weight_matrix(vocab, embeddings_index, usevoc = True):
  """Build a 2-D weight matrix of pretrained embeddings.

  Args:
    vocab: iterable of words (e.g. the vocabulary dict); row i of the result
      belongs to the i-th word in iteration order. Only used when usevoc is
      true.
    embeddings_index: dict mapping word -> embedding vector. All vectors are
      expected to have the same length.
    usevoc: if true, build one row per vocabulary word; otherwise build one
      row per entry of embeddings_index, in its iteration order.

  Returns:
    A float NumPy array of shape (number of rows, embedding dimension).
    With usevoc true, words missing from embeddings_index receive the
    "unknown_emb" vector; if that entry is absent too, the row stays zero.
  """
  DEFAULT_EMBEDDING_DIM = 50
  # Take the dimension from the embeddings themselves so files other than the
  # 50-d one work; fall back to 50 only when there is nothing to inspect.
  first_vector = next(iter(embeddings_index.values()), None)
  embedding_dim = len(first_vector) if first_vector is not None else DEFAULT_EMBEDDING_DIM

  if usevoc:
    weight_matrix = np.zeros((len(vocab), embedding_dim), dtype=float)
    unknown_vector = embeddings_index.get("unknown_emb")
    for idx, word in enumerate(vocab):
      vector = embeddings_index.get(word, unknown_vector)
      if vector is not None:
        weight_matrix[idx] = vector
  else:
    # np.zeros allocates the (rows, dim) matrix; np.array on the shape tuple
    # would only produce a two-element vector holding the shape itself.
    weight_matrix = np.zeros((len(embeddings_index), embedding_dim), dtype=float)
    for idx, vector in enumerate(embeddings_index.values()):
      weight_matrix[idx] = vector

  return weight_matrix

In [0]:
# One embedding row per training-vocabulary word.
emb_weight_matrix = build_embedding_weight_matrix(vocab, embeddings_index, usevoc=True)
emb_weight_matrix[:5]

Define neural network architecture.

In [0]:
import torch
import torch.nn as nn

class OneLayerModel(nn.Module):

  def __init__(self):
    super(OneLayerModel, self).__init__()
    self.embeddings = nn.Embedding(emb_weight_matrix.shape[0], emb_weight_matrix.shape[1])