In [17]:
# Additional libraries apart from the stanard ML libraries
!pip install pytorch-nlp
!pip install torchvision
!pip install torch-summary

In [18]:
import time
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import re
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import torch
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable

# Read Data

In [19]:
df = pd.read_csv('crafted_features.csv')

In [3]:
print(list(df.columns))

['UserID', 'createdAt', 'CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets', 'LengthOfScreenName', 'LengthOfDescriptionInUserProfile', 'userType', 'LinkRatio', 'uniqueLinksRatio', 'AtRatio', 'uniqueATRatio', 'averageWords', 'averageChars', 'averageUppercase', 'averageLowercase', 'averagePunctuation', 'LogNumberOfFollowings', 'LogNumberOfFollowers', 'LogNumberOfTweets', 'ratioFollowings_Followers', 'SeriesOfNumberOfFollowings', 'rate_change', 'rate_change_std', 'mean_followers', 'maxTweetDay', 'averageTweetDay', 'sequence', 'sequenceEntropy', 'SequenceRate_Change', 'RatioReplySequence', 'LastTweet', 'LastReply', 'entropyReplyTweetInterval', 'LinkCompressionLength', 'CompressLength', 'first_30_days', 'averageTweetInterval', 'startEndDiff', 'largestFollowerChange', 'longevity', 'retweetCount', 'uniqueHashtags', 'HashtagCount', 'emojis', 'abbrivations', 'textSim', 'hashtagsEntropy', 'HashtagSim', 'LangCount', 'SentimentPositive', 'SentimentNeutral', 'SentimentNegativ

The list above contains the features used within this neural network. Initially, there was another dataset containing each user's tweets; however, for easier processing for the word embeddings, the "cleanTweets" column includes all tweets by a single user concatenated with each other. This concatenation process occurs for each of the users. Furthermore, the text within "cleanTweets" has undergone preprocessing.

# Import GloVe Word embedding

In [20]:
def load_embedding(path):
    embeddings_index = dict()
    f = open(path, 'r' ,encoding="utf8")

    for line in f:
        try:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except:
            f.__next__()

    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


def create_embedding_matrix(size_of_vocabulary, pretrained_embedding_dim, tokenizer, embeddings_index):
    # Create empty matrix
    embedding_weights = torch.Tensor(size_of_vocabulary, pretrained_embedding_dim)
    # Add relevant word into matrix
    # If not included add a random intialisation based on a normal distribution
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_weights[i] = embedding_vector
        else:
            embedding_weights[i] = torch.from_numpy(np.random.normal(scale=0.6, size=(pretrained_embedding_dim, )))
    return embedding_weights
    

In [5]:
pretrained_embedding_dim = 300
embeddings_index = load_embedding('glove.840B.300d.txt')


Loaded 2195864 word vectors.


# Pre-Process Data

In [6]:
X = df.copy()
# These features are all non-numeric
del X['UserID']
del X['createdAt']
del X['CollectedAt']
del X['userTweets']
del X['SeriesOfNumberOfFollowings']
del X['sequence']
y = X['userType']
del X['userType']

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

columns = [col for col in X_train.columns if col != 'cleanTweets']

# Min-Max Normalisation
X_train[columns]=(X_train[columns]-X_train[columns].min())/(X_train[columns].max()-X_train[columns].min())
X_test[columns]=(X_test[columns]-X_test[columns].min())/(X_test[columns].max()-X_test[columns].min())

X2_train = X_train[columns].values
X2_test = X_test[columns].values

The above code has ensured that all the numerical features within the dataset have been normalised for both the training and testing set. They were then set as "X2_train/test" as this would be the second input for the neural network.

# Text Tokenization & Word Embedding

In [7]:
def vocab_size(df):
    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=5000)
    # Preparing vocabulary on whole dataset
    tokenizer.fit_on_texts(df['cleanTweets'].tolist())
    return len(tokenizer.word_index) + 1

# Tokenize the sentences
tokenizer = Tokenizer(num_words=5000)

# Preparing vocabulary
tokenizer.fit_on_texts(X_train['cleanTweets'].tolist())

# Convert Text to integer sequences
X1_train = tokenizer.texts_to_sequences(X_train['cleanTweets'].tolist()) 
X1_test  = tokenizer.texts_to_sequences(X_test['cleanTweets'].tolist())

# Max length of padding
maxlen = 150

# Padding to ensure all entries are the same length
X1_train  = torch.Tensor(pad_sequences(X1_train, maxlen=maxlen, padding='post'))
X1_test = torch.Tensor(pad_sequences(X1_test, maxlen=maxlen, padding='post'))

# Get word vocabulary size
size_of_vocabulary = vocab_size(df)

# Create word embedding
embedding_weights = create_embedding_matrix(size_of_vocabulary, pretrained_embedding_dim, tokenizer, embeddings_index)

The above code creates a vocabulary list where each word is assigned a number. Each text is then converted to an integer sequence. To ensure all text sequences are the same length, they are padded to a set length of 150. This then creates the "X1_train/test" variables, which is the first input for the neural network. It then creates the embedding weights based on the text in dataset using the GloVe pretrained word embedding.

# Creating Neural Network

In [8]:
class Two_Input_Net(nn.Module):
    def __init__(self,hidden_size,lin_size, embedding_matrix=embedding_weights):
        super(Two_Input_Net, self).__init__()

        # Layer 1a: Embedding Layer (Input)
        self.embedding = nn.Embedding(size_of_vocabulary, pretrained_embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False

        # Layer 1b: Bidirectional LSTM
        self.lstm = nn.LSTM(pretrained_embedding_dim, maxlen, bidirectional=True, batch_first=True)
        
        # Layer 1c: MaxPooling, BatchNormalisation and Dropout
        self.globalMaxPool = nn.AdaptiveMaxPool1d(1)
        self.batchnorm = nn.BatchNorm1d(maxlen)
        self.dropout = nn.Dropout2d(0.5)
        
        # Layer 2a: Dense Layer (Input)
        self.linear_one = nn.Linear(X2_train.shape[1], 10)
        self.relu = nn.ReLU()
        
        # Layer 2b: Dense Layer
        self.linear_two = nn.Linear(10, lin_size)
        self.relu = nn.ReLU()
        
        # Layer 3a: Concatation Section which is described in forward().
        
        # Layer 3b: Dense Layer
        self.linear = nn.Linear(300, 10)
        self.relu = nn.ReLU()

        # Layer 4: Output Layer return probability value from 0 to 1.
        self.out = nn.Linear(10, 1)
        self.outFunc = nn.Sigmoid()

    def forward(self, x):
        # x[0] is the text feautes
        # x[1] is the numerical profile features
        
        # LSTM Layer
        h_embedding = self.embedding(x[0].long())
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
        x[0], _  = self.lstm(h_embedding)
        
        # MaxPooling, BatchNormalisation and Dropout Layer
        x[0] = self.globalMaxPool(x[0])
        x[0] = self.batchnorm(x[0])
        x[0] = self.dropout(x[0])

        # Ensure x[1] is tensor format as float
        # 2 Dense Layers
        x[1] = torch.tensor(x[1], dtype=torch.float)
        x[1] = self.relu(self.linear_one(x[1]))
        x[1] = self.relu(self.linear_two(x[1]))

        # Layer 3a: Concatenate the outputs of x[0] and x[1]
        # x[0] has the 3rd dimension removed as its in the form [??,??,1] (ensures both x[0] and x[1] are same shape)
        x[0] = x[0][:, :, -1]
        conc = torch.cat((x[0],x[1]), 1)

        # Concatenated tensor passed through Dense Layer
        conc = self.relu(self.linear(conc))
        # Dense Layer which uses sigmoid activation function to return value between 0 and 1.
        out = self.out(conc)
        outF = self.outFunc(out)
        return outF

In [16]:
# Intialise Model
model = Two_Input_Net(X2_train.shape[1], X1_train.shape[1])

# Convert profile features to Variable format
X2_train = Variable(torch.Tensor(X2_train).float())
X2_test = Variable(torch.Tensor(X2_test).float())

# # Convert to Variable format
y_train = np.array(y_train)
y_train = Variable(torch.LongTensor(y_train))
y_test = np.array(y_test)
y_test = Variable(torch.LongTensor(y_test))

In [22]:
from torchsummary import summary
summary(model)

Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         (201,900)
├─LSTM: 1-2                              542,400
├─AdaptiveMaxPool1d: 1-3                 --
├─BatchNorm1d: 1-4                       300
├─Dropout2d: 1-5                         --
├─Linear: 1-6                            940
├─ReLU: 1-7                              --
├─Linear: 1-8                            1,650
├─Linear: 1-9                            3,010
├─Linear: 1-10                           11
├─Sigmoid: 1-11                          --
Total params: 750,211
Trainable params: 548,311
Non-trainable params: 201,900


Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         (201,900)
├─LSTM: 1-2                              542,400
├─AdaptiveMaxPool1d: 1-3                 --
├─BatchNorm1d: 1-4                       300
├─Dropout2d: 1-5                         --
├─Linear: 1-6                            940
├─ReLU: 1-7                              --
├─Linear: 1-8                            1,650
├─Linear: 1-9                            3,010
├─Linear: 1-10                           11
├─Sigmoid: 1-11                          --
Total params: 750,211
Trainable params: 548,311
Non-trainable params: 201,900

# Creating Custom Dataset & Data Loaders

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self,X1,X2,Y):
        self.X1 = X1
        self.X2 = X2
        self.Y = Y
    def __len__(self):
        return len(self.X1)
    def __getitem__(self, idx):
        text_data = self.X1[idx]
        meta_data = self.X2[idx]
        label = self.Y[idx]
        return text_data, meta_data, label

In [12]:
# Create the Custom Dataset for both training and validation set
training_dataset = CustomDataset(X1_train, X2_train, y_train)
eval_dataset = CustomDataset(X1_test, X2_test, y_test)

In [13]:
# Create the data loaders for both the training and validation set
train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=128, shuffle=True)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=128, shuffle=False)

# Main Model Run

In [14]:
def calculate_accuracy(y_pred, y):
    binaryConverted = torch.round(y_pred)
    correct = (binaryConverted == y).sum() 
    return correct.float() / y.shape[0]

def model_train(epoch):
    train_loss = 0
    train_acc = 0
    
    # MODEL: TRAIN
    model.train()
    
    print(f"---------------------Starting Epoch.. {(epoch+1)} ---------------------")
    progress_bar = tqdm(train_loader, desc='Processing Epoch {:1d}'.format((epoch+1)), leave=False, disable=False)
    for text_data, meta_data, labels in progress_bar:
        # Clear the gradients
        optimizer.zero_grad()
        # Compute the model output
        yhat = model([text_data, meta_data])
        # Calculate loss
        labels = labels.unsqueeze(1)
        loss = criterion(yhat, labels.float())
        # Calculate accuracy
        acc = calculate_accuracy(yhat, labels.float())
        # Credit assignment
        loss.backward()
        # Update model weights
        optimizer.step()
        # Total training loss and accuracy
        train_loss += loss.item()
        train_acc += acc.item()
        
    return (train_loss/len(train_loader)), (train_acc/len(train_loader))

def model_eval(epoch):
        # MODEL: EVAL
        model.eval()

        eval_loss = 0
        eval_acc = 0

        with torch.no_grad():
            for text_data, meta_data, labels in eval_loader:
                yhat = model([text_data, meta_data])
                # calculate loss
                labels = labels.unsqueeze(1)
                loss = criterion(yhat, labels.float())

                # Calculate accuracy
                acc = calculate_accuracy(yhat, labels.float())
                
                # Total training loss and accuracy
                eval_loss += loss.item()
                eval_acc += acc.item()
                                           
        return (eval_loss/len(eval_loader)), (eval_acc/len(eval_loader))
    
def output(epoch, time, train_loss, train_acc, eval_loss, eval_acc):
    print(f'Epoch: {(epoch+1)} | Epoch Duration (Seconds): {time}')
    print(f'\tTrain Loss: {train_loss:.5f} | Train Accuracy: {train_acc:.5f}')
    print(f'\t Val. Loss: {eval_loss:.5f} |  Val. Accuracy: {eval_acc:.5f}')
    print(f"-------------------------------------------------------------")
    

In [15]:
TOTAL_EPOCH = 8

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

best_accuracy = -float('inf')

for epoch in range(TOTAL_EPOCH):
    # Model TRAIN
    start_time = time.time()
    train_loss, train_acc = model_train(epoch)
    end_time = time.time()
    
    # Model Eval
    eval_loss, eval_acc = model_eval(epoch)
    
    # Model output details
    output(epoch, (end_time-start_time), train_loss, train_acc, eval_loss, eval_acc)
    
    
    # Save Model with best accuracy
    if (eval_acc > best_accuracy):
        print("Updated Best Saved Model With Accuracy", eval_acc)
        best_accuracy = eval_acc
        torch.save(model, 'two_input_NN_best_model.pt')
        

---------------------Starting Epoch.. 1 ---------------------


Processing Epoch 1:   0%|          | 0/260 [00:00<?, ?it/s]



Epoch: 1 | Epoch Duration (Seconds): 261.3110659122467
	Train Loss: 0.18284 | Train Accuracy: 0.92526
	 Val. Loss: 0.13081 |  Val. Accuracy: 0.95422
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.954221299978403
---------------------Starting Epoch.. 2 ---------------------


Processing Epoch 2:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 2 | Epoch Duration (Seconds): 252.4938929080963
	Train Loss: 0.11738 | Train Accuracy: 0.95693
	 Val. Loss: 0.10875 |  Val. Accuracy: 0.96164
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.9616380150501544
---------------------Starting Epoch.. 3 ---------------------


Processing Epoch 3:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 3 | Epoch Duration (Seconds): 253.65692019462585
	Train Loss: 0.09230 | Train Accuracy: 0.96599
	 Val. Loss: 0.08776 |  Val. Accuracy: 0.96605
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.9660499224295983
---------------------Starting Epoch.. 4 ---------------------


Processing Epoch 4:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 4 | Epoch Duration (Seconds): 248.55972504615784
	Train Loss: 0.07804 | Train Accuracy: 0.97200
	 Val. Loss: 0.07070 |  Val. Accuracy: 0.97325
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.9732468916819645
---------------------Starting Epoch.. 5 ---------------------


Processing Epoch 5:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 5 | Epoch Duration (Seconds): 249.85890197753906
	Train Loss: 0.06496 | Train Accuracy: 0.97503
	 Val. Loss: 0.07067 |  Val. Accuracy: 0.97285
-------------------------------------------------------------
---------------------Starting Epoch.. 6 ---------------------


Processing Epoch 6:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 6 | Epoch Duration (Seconds): 238.0498423576355
	Train Loss: 0.06847 | Train Accuracy: 0.97411
	 Val. Loss: 0.09833 |  Val. Accuracy: 0.96349
-------------------------------------------------------------
---------------------Starting Epoch.. 7 ---------------------


Processing Epoch 7:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 7 | Epoch Duration (Seconds): 238.15432691574097
	Train Loss: 0.06590 | Train Accuracy: 0.97545
	 Val. Loss: 0.06968 |  Val. Accuracy: 0.97378
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.9737774381270775
---------------------Starting Epoch.. 8 ---------------------


Processing Epoch 8:   0%|          | 0/260 [00:00<?, ?it/s]

Epoch: 8 | Epoch Duration (Seconds): 232.66365694999695
	Train Loss: 0.05470 | Train Accuracy: 0.97888
	 Val. Loss: 0.06490 |  Val. Accuracy: 0.97550
-------------------------------------------------------------
Updated Best Saved Model With Accuracy 0.9754953384399414
