<a href="https://colab.research.google.com/github/JasonArmitage-res/Lab_Multimodal_ML/blob/master/Multimodal_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import json
import pandas as pd
from PIL import Image
import nltk
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import numpy as np
import random


'''
USAGE:

data = Read(folder_path)
ViewDataDistribution(data)
data.to_pickle("./mm_imdb.pkl")

'''
def Read(path) :
    """Load movie metadata and posters from a folder into one DataFrame.

    Every ``<fileID>.json`` file in ``path`` becomes one row. The poster is
    looked up by name (``<fileID>.jpeg``), so metadata and image always belong
    to the same movie regardless of the order in which the OS lists files.

    Args:
        path: Folder containing the ``.json`` metadata files and the
            ``.jpeg`` posters. May be absolute or relative.

    Returns:
        DataFrame with columns ``genres`` (value of the JSON ``genres`` key),
        ``plot`` (first element of the JSON ``plot`` key), ``fileID`` (file
        name without the ``.json`` suffix) and ``image`` (PIL image resized
        to 256x256 and converted to RGB, or ``None`` when the folder has no
        matching ``.jpeg``). Rows are ordered by file name. ``.jpeg`` files
        without a matching ``.json`` are ignored.
    """
    json_suffix = '.json'
    # Sorted so the row order is deterministic; os.listdir order is arbitrary.
    file_names = sorted(os.listdir(path))
    available_images = {name for name in file_names if name.endswith(".jpeg")}

    genres_col, plot_col, file_ids, images = [], [], [], []
    for name in file_names:
        if not name.endswith(json_suffix):
            continue

        # Slice the suffix off: str.rstrip(".json") strips any trailing run of
        # the characters '.', 'j', 's', 'o', 'n' and would mangle some IDs.
        file_id = name[:-len(json_suffix)]

        # Paths are joined explicitly instead of changing the working
        # directory, so relative paths work and the cwd is never left altered.
        with open(os.path.join(path, name)) as json_file:
            json_text = json.load(json_file)

        genres_col.append(json_text['genres'])
        plot_col.append(json_text['plot'][0])
        file_ids.append(file_id)

        image_name = file_id + ".jpeg"
        if image_name in available_images:
            # The context manager closes the underlying file once the
            # resized/converted copy has been produced.
            with Image.open(os.path.join(path, image_name)) as raw_image:
                images.append(raw_image.resize((256, 256)).convert('RGB'))
        else:
            images.append(None)

    # Fill an object array element by element so each image stays a single
    # cell value rather than being interpreted as nested array data.
    image_col = np.empty(len(images), dtype=object)
    for position, image in enumerate(images):
        image_col[position] = image

    return pd.DataFrame(
        {'genres': genres_col, 'plot': plot_col, 'fileID': file_ids, 'image': image_col},
        columns=['genres', 'plot', 'fileID', 'image'],
    )

In [0]:
def ViewDataDistribution(data):
    """Show a horizontal bar chart of how many movies carry each genre.

    Args:
        data: DataFrame whose ``genres`` column holds one iterable of genre
            names per row.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Flatten in a single pass; sum(series, []) copies the growing list on
    # every addition, which is quadratic in the total number of labels.
    all_genres = [genre for movie_genres in data["genres"] for genre in movie_genres]
    genre_counts = nltk.FreqDist(all_genres)
    all_genres_df = pd.DataFrame({'Genre': list(genre_counts.keys()),
                                  'Count': list(genre_counts.values())})

    plt.figure(figsize=(15, 12))
    ax = sns.barplot(data=all_genres_df, x="Count", y="Genre")

    # Write each bar's count just past the end of the bar.
    for bar in ax.patches:
        bar_end = (bar.get_x() + bar.get_width(), bar.get_y() + 0.5)
        ax.annotate("%d" % bar.get_width(), bar_end, xytext=(5, 0), textcoords='offset points')

    plt.show()


def RemoveGenresFromData(data, genres_to_remove=("Adult", "News", "Talk-Show", "Reality-TV")):
    """Drop every movie that is tagged with at least one excluded genre.

    Args:
        data: DataFrame whose ``genres`` column holds one iterable of genre
            names per row.
        genres_to_remove: Genre names that disqualify a movie. Defaults to
            the four genres this function has always removed.

    Returns:
        The rows of ``data`` whose genres share nothing with
        ``genres_to_remove``, with their original index labels.
    """
    excluded = set(genres_to_remove)
    # dtype=bool matters for an empty frame: np.array([]) defaults to float64,
    # which can neither be inverted nor used as a boolean mask.
    keep_mask = np.array(
        [excluded.isdisjoint(movie_genres) for movie_genres in data["genres"]],
        dtype=bool,
    )
    return data[keep_mask]




def MoviesPerGenre(data):
    """Group movie file IDs by genre.

    Each row of ``data`` is repeated once per entry in its ``genres`` list,
    so that every repeated row carries a single genre; the ``fileID`` values
    are then collected per genre.

    Args:
        data: DataFrame with a ``genres`` column (one list of genre names per
            row) and a ``fileID`` column.

    Returns:
        DataFrame with one row per genre and the columns ``genres`` and
        ``fileIDs`` (list of file IDs tagged with that genre).
    """
    genres_per_movie = data["genres"].str.len()

    # Repeat every non-genre column so it lines up with the flattened genres.
    repeated_columns = {}
    for column in data.columns.drop("genres"):
        repeated_columns[column] = np.repeat(data[column].values, genres_per_movie)

    one_genre_per_row = pd.DataFrame(repeated_columns)
    one_genre_per_row = one_genre_per_row.assign(genres=np.concatenate(data["genres"].values))
    # Restore the original column order.
    one_genre_per_row = one_genre_per_row[data.columns]

    ids_by_genre = one_genre_per_row.groupby("genres")["fileID"].apply(list)
    return ids_by_genre.reset_index(name='fileIDs')

In [0]:
'''
USAGE:

samples = SamplingByCount(data,338)
ViewDataDistribution(samples)
samples.to_pickle("./mm_imdb_sampled.pkl")

'''
def SamplingByCount (data, count = 330) :
    """Randomly pick up to ``count`` movies per genre.

    Excluded genres are removed first (``RemoveGenresFromData``), then file
    IDs are sampled without replacement from each remaining genre's list
    (``MoviesPerGenre``).

    Args:
        data: DataFrame with ``genres`` and ``fileID`` columns.
        count: Number of file IDs to draw per genre. A genre with fewer
            movies than ``count`` contributes all of its movies.

    Returns:
        The rows of the filtered data whose ``fileID`` was drawn for at least
        one genre. A movie drawn for several genres appears once.
    """
    data_subset = RemoveGenresFromData(data)
    fileID_by_genre = MoviesPerGenre(data_subset)

    samples_fileIDs = set()
    for genre_fileIDs in fileID_by_genre["fileIDs"]:
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request for small genres.
        sample_size = min(count, len(genre_fileIDs))
        samples_fileIDs.update(random.sample(genre_fileIDs, sample_size))

    return data_subset[data_subset["fileID"].isin(samples_fileIDs)]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

'''
USAGE:

Data_train, Data_test, Labels_train_tensor, Labels_test_tensor, Label_names = Train_Test_Split(data)

'''

def Train_Test_Split(data , test_data_fraction = 0.2, random_state = None) :
    """Split the data into train/test parts and one-hot encode the genres.

    Args:
        data: DataFrame with a ``genres`` column holding one iterable of
            genre names per row.
        test_data_fraction: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split``; set it to get the same
            split on every call. ``None`` keeps the split random.

    Returns:
        Tuple ``(Data_train, Data_test, Labels_train, Labels_test,
        Label_names)``. The two data parts are row subsets of ``data``, the
        two label parts are tensors built from the one-hot rows of the same
        movies, and ``Label_names`` is ``mlb.classes_`` (the genre name for
        each label column).
    """
    # Imported here because the notebook imports torch in a later cell than
    # the one defining this function.
    import torch

    mlb = MultiLabelBinarizer()
    one_hot = mlb.fit_transform(data['genres'])
    Label_names = mlb.classes_
    # Share the data's index so each label row is identified by the same
    # label as the movie it belongs to.
    data_genres_one_hot_encoding = pd.DataFrame(one_hot, columns=Label_names, index=data.index)

    Data_train, Data_test, Labels_train, Labels_test = train_test_split(
        data,
        data_genres_one_hot_encoding,
        test_size=test_data_fraction,
        random_state=random_state,
    )
    Labels_train = torch.tensor(Labels_train.values)
    Labels_test = torch.tensor(Labels_test.values)

    return (Data_train, Data_test, Labels_train, Labels_test, Label_names)

In [0]:
import nltk
from transformers import BertTokenizer
from nltk import tokenize
#nltk.download('punkt')


'''
USAGE:

bert_input = BertInput()
input_ids = list(Data_train['plot'].apply(lambda x: bert_input.GenerateBertInput(x)))
input_ids = pd.DataFrame(input_ids,columns=['indexed_tokens','segment_ids','masked_ids'])
Data_train_tensor = torch.tensor(input_ids['indexed_tokens'])
torch.save(Data_train_tensor, '/content/drive/My Drive/Dataset/Data_train_tensor.pt')


input_ids = list(Data_test['plot'].apply(lambda x: bert_input.GenerateBertInput(x)))
input_ids = pd.DataFrame(input_ids,columns=['indexed_tokens','segment_ids','masked_ids'])
Data_test_tensor = torch.tensor(input_ids['indexed_tokens'])
torch.save(Data_test_tensor, '/content/drive/My Drive/Dataset/Data_test_tensor.pt')

'''

#Generates formatted input (for Bert) from text
#Generates formatted input (for Bert) from text
class BertInput () :
    """Turn raw text into fixed-length BERT input lists.

    ``GenerateBertInput`` is the entry point; the other methods are the
    individual steps and work on the instance's ``indexed_tokens``,
    ``segment_ids`` and ``masked_ids`` lists.
    """

    def __init__(self, max_input_length = 512):
        """Create a converter producing lists of ``max_input_length`` items."""
        self.indexed_tokens = []
        self.segment_ids = []
        self.masked_ids = []
        self.max_input_length = max_input_length
        # Loaded on first use and then reused; loading per text is wasteful.
        self._tokenizer = None

    def GetIndexedTokens(self, text):
        """Tokenize ``text``, wrap it in [CLS] ... [SEP] and store the ids."""
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self._tokenizer = tokenizer

        tokenized_text = tokenizer.tokenize(text)
        tokenized_text.append("[SEP]")
        tokenized_text.insert(0,"[CLS]")
        self.indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    def GetSegmentIds(self) :
        """Store one segment id of 1 per current token."""
        # NOTE(review): single-sequence BERT inputs conventionally use
        # segment id 0 - confirm that 1 is intended here.
        self.segment_ids = [1] * len(self.indexed_tokens)

    def GetMaskedIds(self) :
        """Store one mask value of 1 per current token."""
        self.masked_ids = [1] * len(self.indexed_tokens)

    def Padding(self) :
        """Pad with zeros, or truncate, all three lists to ``max_input_length``.

        When truncating, the last kept token id is replaced by the original
        final id, so a sequence that ended with [SEP] still ends with it.
        """
        missing = self.max_input_length - len(self.indexed_tokens)
        if missing > 0 :
            padding = [0] * missing
            self.indexed_tokens += padding
            self.segment_ids += padding
            self.masked_ids += padding
            return

        if missing < 0 :
            final_token = self.indexed_tokens[-1]
            del self.indexed_tokens[self.max_input_length:]
            # Guard against max_input_length == 0, which leaves nothing to overwrite.
            if self.indexed_tokens :
                self.indexed_tokens[-1] = final_token

        del self.segment_ids[self.max_input_length:]
        del self.masked_ids[self.max_input_length:]

    def GenerateBertInput(self, text) :
        """Return ``[indexed_tokens, segment_ids, masked_ids]`` for ``text``.

        Each of the three lists has exactly ``max_input_length`` items.
        """
        self.GetIndexedTokens(text)
        self.GetSegmentIds()
        self.GetMaskedIds()
        self.Padding()

        return [self.indexed_tokens, self.segment_ids, self.masked_ids]

In [0]:
#ref: https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
from transformers import BertModel
from torch import nn

class BertMultiLabelClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear label head.

    In training mode ``forward`` returns raw logits; in evaluation mode it
    returns the logits passed through a sigmoid.
    """

    def __init__(self, config, num_labels = 23, dropout = 0.1):
        """Build the model.

        Args:
            config: Configuration handed to ``BertModel.from_pretrained``;
                its ``hidden_size`` sets the input width of the label head.
            num_labels: Number of output labels.
            dropout: Dropout probability applied before the label head.
        """
        super(BertMultiLabelClassifier, self).__init__()
        self.num_labels = num_labels
        self.base_model = BertModel.from_pretrained('bert-base-uncased', config=config)
        self.hidden_layer = nn.Linear(config.hidden_size, num_labels)
        self.output_layer = nn.Sigmoid()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, indexed_tokens, segment_ids=None, masked_ids=None):
        """Run the encoder and the label head.

        Args:
            indexed_tokens: Token ids passed to the encoder as its input.
            segment_ids: Optional segment ids, passed as ``token_type_ids``.
            masked_ids: Optional mask, passed as ``attention_mask``.

        Returns:
            Logits when ``self.training`` is true, sigmoid outputs otherwise.
        """
        # Keywords are required here: the encoder's second positional
        # parameter is the attention mask, not the segment ids.
        encoder_outputs = self.base_model(
            indexed_tokens,
            attention_mask=masked_ids,
            token_type_ids=segment_ids,
        )
        # Element 1 of the encoder output is used as the pooled representation.
        dropped_layer = self.dropout(encoder_outputs[1])
        logits = self.hidden_layer(dropped_layer)
        if self.training :
            return logits
        return self.output_layer(logits)

    def freeze_bert_encoder(self):
        """Stop gradient computation for every encoder parameter."""
        for param in self.base_model.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient computation for every encoder parameter."""
        # parameters() yields the tensors themselves; named_parameters()
        # yields (name, tensor) tuples, which have no requires_grad to set.
        for param in self.base_model.parameters():
            param.requires_grad = True
    

In [0]:
import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
#!pip install transformers
from transformers import BertConfig, AdamW
from sklearn import metrics
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import pandas as pd
import time
import datetime


class Training_Testing_Bert():
    """Train and evaluate ``BertMultiLabelClassifier`` on pre-tokenized data.

    ``Train_Bert`` runs the training epochs, saves the weights and plots the
    per-epoch loss; ``Test_Bert`` thresholds the model outputs on the test
    set and reports precision/recall/F-score averaged over test batches.
    """

    def __init__(self, Data_train_tensor, Labels_train_tensor, Data_test_tensor, Labels_test_tensor,
                 Label_names = None, batch_size = 16, epochs = 5, freeze_bert = True, sigmoid_thresh = 0.2,
                 optim_lr = 2e-5, num_labels = 23,
                 model_save_path = "/content/drive/My Drive/Dataset/bert_unimodal.pt"):
        """Build the model, optimizer and data loaders.

        Args:
            Data_train_tensor: Training inputs (token id tensor).
            Labels_train_tensor: Training labels, one row per training input.
            Data_test_tensor: Test inputs (token id tensor).
            Labels_test_tensor: Test labels, one row per test input.
            Label_names: Optional label names; stored, not otherwise used here.
            batch_size: Batch size of both data loaders.
            epochs: Number of training epochs.
            freeze_bert: When true, the BERT encoder's parameters are frozen.
            sigmoid_thresh: Outputs above this value count as a predicted label.
            optim_lr: Learning rate of the optimizer.
            num_labels: Number of labels; also sizes the classifier head.
            model_save_path: Where ``Train_Bert`` saves the model state dict.
        """
        # Resolve the device first so the model can be placed on it; this
        # also lets the class run without a GPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.config = BertConfig.from_pretrained('bert-base-uncased')
        # num_labels is forwarded so the head width matches the reshape
        # applied to the logits in Train_Bert.
        self.bert = BertMultiLabelClassifier(self.config, num_labels=num_labels).to(self.device)

        #do not train the Bert model
        if freeze_bert :
            self.bert.freeze_bert_encoder()

        self.label_names = Label_names
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.optim_lr = optim_lr
        self.epochs = epochs
        self.sigmoid_thresh = sigmoid_thresh
        self.model_save_path = model_save_path
        self.optimizer = self.SetOptimizer()
        self.results = self._empty_results()
        self.epoch_loss_set = []
        self.train_dataloader = self.SetTrainDataloader(Data_train_tensor, Labels_train_tensor)
        self.test_dataloader = self.SetTestDataloader(Data_test_tensor, Labels_test_tensor)

    @staticmethod
    def _empty_results():
        """Return a zero-filled metrics table (metric rows x averaging columns)."""
        return pd.DataFrame(0, index=['Recall','Precision','F_Score'], columns=['micro', 'macro', 'weighted', 'samples']).astype(float)

    def SetOptimizer(self) :
        """Create the AdamW optimizer over all parameters of the model."""
        optimizer = AdamW(self.bert.parameters(), self.optim_lr, eps = 1e-6)
        return optimizer

    def Get_Metrics(self, actual, predicted) :
        """Add one batch's precision, recall and F-score to ``self.results``.

        Args:
            actual: True label indicator array for the batch.
            predicted: Predicted label indicator array for the batch.
        """
        averages = ('micro', 'macro', 'weighted', 'samples')
        for average in averages:
            precision, recall, fscore, _ = metrics.precision_recall_fscore_support(actual, predicted, average=average)
            # Single .loc write: chained indexing (frame[col][row] += x) may
            # update a temporary copy instead of the table.
            self.results.loc['Recall', average] += recall
            self.results.loc['Precision', average] += precision
            self.results.loc['F_Score', average] += fscore

    #source: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    def Plot_Training_Epoch_Loss(self) :
        """Plot the average training loss recorded for each epoch."""
        sns.set(style='darkgrid')
        sns.set(font_scale=1.5)
        plt.rcParams["figure.figsize"] = (12,6)
        plt.plot(self.epoch_loss_set, 'b-o')
        plt.title("Training loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.show()

    #source: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    def format_time(self, elapsed):
        '''
        Takes a time in seconds and returns a string hh:mm:ss
        '''
        # Round to the nearest second.
        elapsed_rounded = int(round((elapsed)))
        return str(datetime.timedelta(seconds=elapsed_rounded))

    def SetTrainDataloader(self, Data_train_tensor, Labels_train_tensor) :
        """Return a randomly sampled DataLoader over the training tensors."""
        train_dataset = TensorDataset(Data_train_tensor, Labels_train_tensor)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size = self.batch_size)
        return train_dataloader

    def SetTestDataloader(self, Data_test_tensor, Labels_test_tensor) :
        """Return a sequentially sampled DataLoader over the test tensors."""
        test_dataset = TensorDataset(Data_test_tensor, Labels_test_tensor)
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size = self.batch_size)
        return test_dataloader

    def Train_Bert(self) :
        """Train for ``self.epochs`` epochs, then save the weights and plot the loss.

        The average batch loss of every epoch is appended to
        ``self.epoch_loss_set``. The state dict is saved once, after the last
        epoch, to ``self.model_save_path``.
        """
        # The loss module holds no per-batch state, so build it once.
        loss_fct = BCEWithLogitsLoss()

        for _ in trange(self.epochs, desc="Epoch"):

            self.bert.train()
            epoch_loss = 0

            # Measure how long the training epoch takes.
            t0 = time.time()

            for step_num, batch_data in enumerate(self.train_dataloader):

                # Progress update every 30 batches.
                if step_num % 30 == 0 and step_num != 0:
                    elapsed = self.format_time(time.time() - t0)
                    print('  Batch : ',step_num, ' , Time elapsed : ',elapsed)

                token_ids, labels = tuple(t.to(self.device) for t in batch_data)
                self.optimizer.zero_grad()
                logits = self.bert(token_ids)
                batch_loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1, self.num_labels).float())
                batch_loss.backward()
                self.optimizer.step()
                epoch_loss += batch_loss.item()

            # Per-epoch bookkeeping belongs inside the epoch loop so that
            # every epoch contributes one point to the loss curve.
            avg_epoch_loss = epoch_loss / max(len(self.train_dataloader), 1)
            print("\nTrain loss for epoch: ",avg_epoch_loss)
            print("\nTraining epoch took: {:}".format(self.format_time(time.time() - t0)))
            self.epoch_loss_set.append(avg_epoch_loss)

        torch.save(self.bert.state_dict(), self.model_save_path)
        self.Plot_Training_Epoch_Loss()

    def Test_Bert(self) :
        """Evaluate on the test set and report batch-averaged metrics.

        Returns:
            ``self.results``: the metrics table holding, for each averaging
            mode, the mean of the per-batch recall, precision and F-score.
        """
        # Put model in evaluation mode to evaluate loss on the test set
        self.bert.eval()
        # Start from zero so a repeated call does not add to earlier results.
        self.results = self._empty_results()

        for batch_data in self.test_dataloader:

            token_ids, labels = tuple(t.to(self.device) for t in batch_data)

            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                output = self.bert(token_ids)

            threshold = torch.Tensor([self.sigmoid_thresh]).to(self.device)
            predictions = (output > threshold).int()

            # Move preds and labels to CPU
            predictions = predictions.detach().cpu().numpy()
            labels = labels.to('cpu').numpy()

            self.Get_Metrics(labels, predictions)

        num_batches = len(self.test_dataloader)
        if num_batches:
            self.results = self.results / num_batches
        print("Test data metrics : \n")
        # A bare expression only renders as the last line of a notebook cell;
        # print explicitly and hand the table back to the caller.
        print(self.results)
        return self.results
    