## Preprocessing

In [1]:
# import all the required libraries

import numpy as np
import pandas as pd
import glob
import cv2
import requests  
import re  
from urllib.request import urlretrieve 
import datetime

In [2]:
# read IMDB dataset information from csv file 
movie = pd.read_csv("./MovieGenre.csv", encoding = "ISO-8859-1")
movie.dropna(inplace=True)

# save required columns in a dataframe
df_poster = movie[['imdbId','Poster']]

movie.head(2)

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...


In [36]:
# global variables

img_loc = "original_1000_images/"
SIZE=(150, 101, 3)
num_labels = 10

In [8]:
df_movietotal = movie

# list of all the genres in original dataset

label_dict = {"word2idx": {}, "idx2word": []}
idx = 0
genre_per_movie = df_movietotal["Genre"].apply(lambda x: str(x).split("|"))
for l in [g for d in genre_per_movie for g in d]:
    if l in label_dict["idx2word"] or l=='':
        pass
    else:
        label_dict["idx2word"].append(l)
        label_dict["word2idx"][l] = idx
        idx += 1
n_classes = len(label_dict["idx2word"])
print(label_dict)

{'word2idx': {'Animation': 0, 'Adventure': 1, 'Comedy': 2, 'Action': 3, 'Family': 4, 'Romance': 5, 'Drama': 6, 'Crime': 7, 'Thriller': 8, 'Fantasy': 9, 'Horror': 10, 'Biography': 11, 'History': 12, 'Mystery': 13, 'Sci-Fi': 14, 'War': 15, 'Sport': 16, 'Music': 17, 'Documentary': 18, 'Musical': 19, 'Western': 20, 'Short': 21, 'Film-Noir': 22, 'Talk-Show': 23, 'News': 24, 'Adult': 25, 'Reality-TV': 26, 'Game-Show': 27}, 'idx2word': ['Animation', 'Adventure', 'Comedy', 'Action', 'Family', 'Romance', 'Drama', 'Crime', 'Thriller', 'Fantasy', 'Horror', 'Biography', 'History', 'Mystery', 'Sci-Fi', 'War', 'Sport', 'Music', 'Documentary', 'Musical', 'Western', 'Short', 'Film-Noir', 'Talk-Show', 'News', 'Adult', 'Reality-TV', 'Game-Show']}


In [12]:
# remove poster genres with less data (preserve most common genres only - final popular 10 genres)

df_movietotal['Genre'].replace('Game-Show', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Musical', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Reality-TV', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Adult', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('News', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Talk-Show', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Film-Noir', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Short', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Western', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Sport', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Documentary', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Music', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('War', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('History', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Biography', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Crime', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Fantasy', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('Family', '', inplace=True, regex=True)
df_movietotal['Genre'].replace('', np.nan, inplace=True)

df_movietotal.dropna(inplace=True)


In [13]:
df_movietotal_copy = df_movietotal.copy()

In [14]:
# dictionary to store final genre class names and their corresponding ids

label_dict = {"word2idx": {}, "idx2word": []}
idx = 0
genre_per_movie = df_movietotal_copy["Genre"].apply(lambda x: str(x).split("|"))
for l in [g for d in genre_per_movie for g in d]:
    if l in label_dict["idx2word"] or l=='':
        pass
    else:
        label_dict["idx2word"].append(l)
        label_dict["word2idx"][l] = idx
        idx += 1
n_classes = len(label_dict["idx2word"])
print(label_dict)

{'word2idx': {'Animation': 0, 'Adventure': 1, 'Comedy': 2, 'Action': 3, 'Romance': 4, 'Drama': 5, 'Thriller': 6, 'Horror': 7, 'Mystery': 8, 'Sci-Fi': 9}, 'idx2word': ['Animation', 'Adventure', 'Comedy', 'Action', 'Romance', 'Drama', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi']}


In [16]:
# calculate number of dataset images for each genre class 

def genre_count(df, label_dict):
    max_genre = 0
    for label in label_dict["idx2word"]:
        occurrences = len((df[df['Genre'].str.contains(label)]))
        print(label, occurrences)
        if occurrences > max_genre:
            max_genre = occurrences
    return max_genre

In [17]:
# list of all genres and their corresponding dataset image count

max_genre = genre_count(df_movietotal_copy, label_dict)
print("max count of a label : ",max_genre)

Animation 1704
Adventure 3806
Comedy 12562
Action 5307
Romance 6188
Drama 19850
Thriller 4797
Horror 3981
Mystery 2387
Sci-Fi 2002
max count of a label :  19850


In [19]:
df_movietotal_copy = df_movietotal

In [20]:
# oversampling almost doubles the dataset, not enough processing power to handle that

# # IMBALANCE: OVERSAMPLING SOLUTION
# df_movietotal_copy_2 = df_movietotal_copy
# df_movietotal_copy_2 = df_movietotal_copy_2[~df_movietotal_copy_2["Genre"].str.contains("Comedy")]
# df_movietotal_copy_2 = df_movietotal_copy_2[~df_movietotal_copy_2["Genre"].str.contains("Drama")]
   
# for label in label_dict["idx2word"]:
#     if label not in ["Drama", "Comedy"]:
#         len_genre = len(df_movietotal_copy[df_movietotal_copy['Genre'].str.contains(label)])
#         df_genre = df_movietotal_copy_2[df_movietotal_copy_2['Genre'].str.contains(label)]
#         #df_genre['genres'] = [label+"|" for i in range (0, len(df_genre))]    
#         if (len_genre) > 0:
#             if len_genre > 5000:
#                 param = 1000
#             elif len_genre > 3000:
#                 param = 3000
#             elif len_genre > 1000:
#                 param = 6000
#             else:
#                 param = 6000
#             df_class_over = df_genre.sample(param, replace=True)
#             df_movietotal_copy = pd.concat([df_movietotal_copy, df_class_over], axis=0)


In [21]:
# undersampling of dataset : approximately 1000 images per genre selected (one image contains multiple genres)

df_1000_new = pd.DataFrame()
i=0
for label in label_dict["idx2word"]:
    df_genre = df_movietotal_copy[df_movietotal_copy['Genre'].str.contains(label)]
    if(i%2==0):
        df_class_over = df_genre.iloc[0:1000]
    else:
        df_class_over = df_genre.iloc[-1000:]
    df_1000_new = pd.concat([df_1000_new, df_class_over], axis=0)
    i+=1


In [22]:
df_1000_new.shape

(10000, 6)

In [24]:
print('Random under-sampled Image count :')
print(genre_count(df_1000_new, label_dict))

Random under-sampled Image count :
Animation 1459
Adventure 2091
Comedy 3019
Action 2478
Romance 1720
Drama 4067
Thriller 2175
Horror 2024
Mystery 1528
Sci-Fi 1566
4067


In [25]:
df_movietotal = df_1000_new
#df_movietotal

In [50]:
# download posters using the poster links given in the dataset

not_found = []
http_404 = []
for index, row in df_movietotal.iterrows():
    url = row['Poster']
    #print(index,url)
    if "https://images-na.ssl-images-amazon.com/" in str(url):
        id = row['imdbId']
        jpgname = img_loc+str(id)+'.jpg'
        try:
            urlretrieve(url, jpgname)
        except:
            http_404.append(index)
            pass
        
    else:
        not_found.append(index)

In [51]:
# len(http_404)
# len(not_found)

In [26]:
# store path of each poster and get ids from their path name

image_glob = glob.glob(img_loc + "*.jpg")
img_dict = {}

def get_id(filename):
    index_s = filename.rfind("/") + 1
    index_f = filename.rfind(".jpg")
    return filename[index_s:index_f]

In [30]:
# APPROACH 1 : NORMAL POSTERS - run this block of code for approach 1

for fn in image_glob:
    try:
        img_dict[get_id(fn)] = cv2.imread(fn)
    except:
        pass

In [None]:
# APPROACH 2 : CENTER EXTRACTED POSTERS - run this block of code for approach 2

for fn in image_glob:
    try:
        img_read = cv2.imread(fn)
        img_final = img_read[int(SIZE[0]/8):SIZE[0]-int(SIZE[0]/8),int(SIZE[1]/8):SIZE[1]-int(SIZE[1]/8)]
        cv2.imwrite(center_loc+get_id(fn)+".jpg", img_final)
        img_dict[get_id(fn)] = img_final
    except:
        pass

In [None]:
# APPROACH 3 : OBJECT BOUNDED POSTERS - run this block of code for approach 3

# standard object detection used to extract the portion of poster with objects
import cv2
import matplotlib.pyplot as plt
import cvlib as cv
from cvlib.object_detection import draw_bbox
import numpy as np

def object_detector(im,SIZE=(150, 101, 3)):
    # all bounding boxes coordinates extracted from posters
    bbox, label, conf = cv.detect_common_objects(im)
    # if no object found, return the original poster
    if(bbox==[]):
        return im
    
    minimum = np.min(bbox,axis=0)
    maximum = np.max(bbox,axis=0)
    
    # calculate minimum x,y and maximum x,y from all the bounding box coordinates
    f_x_min = np.min([minimum[0],minimum[2]])
    f_y_min = np.min([minimum[1],minimum[3]])
    if(f_x_min<0):
        f_x_min=0
    if(f_y_min<0):
        f_y_min=0

    f_x_max = np.max([maximum[0],maximum[2]])
    f_y_max = np.max([maximum[1],maximum[3]])
    
    # using these new maximum and minimum x and y values, extract objects from poster
    #(crop that specific portion from the poster)
    im=im[f_y_min:f_y_max,f_x_min:f_x_max]
    return im


for fn in image_glob:
    try:
        img_read = cv2.imread(fn)
        img_final = object_detector(img_read,SIZE)
        # if no object detected, use border cropped posters
        if len(img_final)==0:
            img_final=img_read[int(SIZE[0]/8):SIZE[0]-int(SIZE[0]/8),int(SIZE[1]/8):SIZE[1]-int(SIZE[1]/8)]
        cv2.imwrite(obj_loc+get_id(fn)+".jpg", img_final)
        img_dict[get_id(fn)] = img_final
    except:
        pass

In [43]:
# image preprocessing - resizing and normalization
import skimage
def preprocess(img, size=(150, 101, 3)):
    img = skimage.transform.resize(img, size)
    img = img.astype(np.float32)
    img = (img / 127.5) - 1.
    return img

In [44]:
# prepare the images for convolutional neural networks - generate label array for each poster using one hot encoding
# and preprocess the images 

def prepare_data(data, img_dict, label_dict, size=(150, 101, 3)):
    print("Generation dataset...")
    dataset = []
    y = []
    ids = []
    n_samples = len(img_dict)
    print("got {} posters".format(n_samples))
    for k in img_dict:
        #print(data['imdbId'].values)
        if int(k) in data["imdbId"].values:
            G = data[data["imdbId"] == int(k)]["Genre"].values
            #print(G)
            for g in G:
                g = g.split("|")
                
                img = preprocess(img_dict[k], size)
                if img.shape != (150, 101, 3):
                    continue
                l = np.sum([np.eye(n_classes, dtype="uint8")[label_dict["word2idx"][s]] for s in g if s !=''], axis=0)
                y.append(l)
                dataset.append(img)
                ids.append(k)
    print("DONE")
    print(len(dataset))
    return dataset, y, ids

In [None]:
# data, labels and ids generated for entire dataset

df_movietotal = df_movietotal[['Genre', 'imdbId', 'Title']]
dataset, y, ids =  prepare_data(df_movietotal, img_dict, label_dict, size=SIZE)

In [None]:
# data and labels for all 3 approaches saved - used in testing 

import pickle
dbfile_1=open('datasetFile_object','ab')
dbfile_2=open('yFile_object','ab')
pickle.dump(y,dbfile_2)
pickle.dump(dataset,dbfile_1)
dbfile_1.close()
dbfile_2.close()

## Training 

Training was done using High Performance Research Computing resources. The code is given below.

In [77]:
from sklearn.model_selection import train_test_split
data_train, data_test, y_train, y_test = train_test_split(dataset, y, test_size=0.10, random_state=42)

In [32]:
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Convolutional Neural Network

# Sequential model used
model = Sequential()

# CONV => CONV => MAX POOL
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                 input_shape=(SIZE[0], SIZE[1], 3)))
model.add(BatchNormalization())
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# CONV => CONV => MAX POOL
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# FLATTEN => DENSE => DENSE
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(num_labels, activation='sigmoid'))

In [81]:
# binary crossentropy used for loss calculation
# Stochastic Gradient Descdent used for optimization

model.compile(loss='binary_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])


In [38]:
# hyperparameter tuning performed to determine optimal epochs and batch size for each approach

epochs = [50,75,100]
batches = [10,20]

for e in epochs:
    for b in batches:
        start = datetime.datetime.now()
        model.fit(np.array(data_train), np.array(y_train), batch_size=b, epochs=e, validation_split=0.1)
        end = datetime.datetime.now()
        
        # each model is saved - used later for testing
        model.save("model_"+str(e)+"_"+str(b)+".model")