In [3]:
import pandas as pd
import numpy as np
#download image from url
import urllib.request
#multi-label hot encoder
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
import os

In [5]:
#collect movie data set
df = pd.read_csv('MovieGenre.csv', encoding = "ISO-8859-1")
df = df[['imdbId', 'Title','Genre', 'Poster']]
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40108 entries, 0 to 40107
Data columns (total 4 columns):
imdbId    40108 non-null int64
Title     40108 non-null object
Genre     39963 non-null object
Poster    39383 non-null object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [8]:
#remove any data that has 'Nan'
df_preprocessed = df.dropna()
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39263 entries, 0 to 40106
Data columns (total 4 columns):
imdbId    39263 non-null int64
Title     39263 non-null object
Genre     39263 non-null object
Poster    39263 non-null object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


In [9]:
#Collect Poster images from URL link
#input: poster links. outputs: index value of invalid link-no poster generated.
def link_to_poster (posters):
    counter = 0
    missing_item = []
    for i in range(len(posters)):
        try:
            urllib.request.urlretrieve(str(posters[i]), str(i) + ".jpg")
        except:
            counter += 1
            missing_item.append(i)
            continue
    
    print("Missing poster: " + str(counter))
    return missing_item


In [14]:
#Sample of preprocessed dataset after removing a broken link, which is at index 7
example = df_preprocessed[:10]
posters = example['Poster']
titles = example['Title']
missing_item = link_to_poster(posters)
example.drop(labels=missing_item)

Missing poster: 1


Unnamed: 0,imdbId,Title,Genre,Poster
0,114709,Toy Story (1995),Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,Jumanji (1995),Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...
5,113277,Heat (1995),Action|Crime|Drama,https://images-na.ssl-images-amazon.com/images...
6,114319,Sabrina (1995),Comedy|Drama,https://images-na.ssl-images-amazon.com/images...
8,114576,Sudden Death (1995),Action|Crime|Thriller,https://images-na.ssl-images-amazon.com/images...
9,113189,GoldenEye (1995),Action|Adventure|Thriller,https://images-na.ssl-images-amazon.com/images...


In [18]:
#Sample of preprocessed data with labels attatched
f = lambda x: np.char.split(x, sep='|')
y = example['Genre'].apply(f)
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()
# One-hot encode data
label = one_hot.fit_transform(y)
column_label = one_hot.classes_
df_multi_encoder = pd.DataFrame(label,columns=column_label)
pd.concat([example, df_multi_encoder], axis = 1)

Unnamed: 0,imdbId,Title,Genre,Poster,Action,Adventure,Animation,Comedy,Crime,Drama,Family,Romance,Thriller
0,114709,Toy Story (1995),Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...,0,1,1,1,0,0,0,0,0
1,113497,Jumanji (1995),Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...,1,1,0,0,0,0,1,0,0
2,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...,0,0,0,1,0,0,0,1,0
3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,0,0,0,1,0,1,0,1,0
4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...,0,0,0,1,0,0,1,1,0
5,113277,Heat (1995),Action|Crime|Drama,https://images-na.ssl-images-amazon.com/images...,1,0,0,0,1,1,0,0,0
6,114319,Sabrina (1995),Comedy|Drama,https://images-na.ssl-images-amazon.com/images...,0,0,0,1,0,1,0,0,0
7,112302,Tom and Huck (1995),Adventure|Comedy|Drama,https://images-na.ssl-images-amazon.com/images...,0,1,0,1,0,1,0,0,0
8,114576,Sudden Death (1995),Action|Crime|Thriller,https://images-na.ssl-images-amazon.com/images...,1,0,0,0,1,0,0,0,1
9,113189,GoldenEye (1995),Action|Adventure|Thriller,https://images-na.ssl-images-amazon.com/images...,1,1,0,0,0,0,0,0,1


In [None]:
#Import poster images of the full data set
#Full = link_to_poster(df_preprocessed['Poster'])

In [22]:
cd PosterImages/

/Users/Sam/Desktop/CS230 Project/PosterImages


In [23]:
#Create a dataframe of the full movie with valid posters

files = os.listdir()
poster_list = []
for file in files:
    if file.endswith('.jpg'):
        poster_list.append(int(file[:-4]))
df_full = df_preprocessed.ix[poster_list].sort_index()
df_full

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,imdbId,Title,Genre,Poster
0,114709,Toy Story (1995),Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,Jumanji (1995),Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...
39258,960143,Love's Unfolding Dream (2007),Drama|Family|Western,https://images-na.ssl-images-amazon.com/images...
39259,1269560,Love Takes Wing (2009),Family|Western,https://images-na.ssl-images-amazon.com/images...
39260,1307064,Love Finds a Home (2009),Drama|Family|Western,https://images-na.ssl-images-amazon.com/images...
39261,1684907,Love Begins (2011),Family|Western,https://images-na.ssl-images-amazon.com/images...


In [25]:
# Labels of the full data with valid poster link
genres = df_full['Genre']
poster_genre = genres.apply(f)
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()
# One-hot encode data
label = one_hot.fit_transform(poster_genre)
column_label = one_hot.classes_
df_multi_encoder = pd.DataFrame(label,columns=column_label)
df_multi_encoder

Unnamed: 0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36893,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
36894,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
36895,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
36896,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [21]:
column_label

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport',
       'Talk-Show', 'Thriller', 'War', 'Western'], dtype=object)

In [23]:
df_multi_encoder.sum()

Action          4976
Adult              8
Adventure       3594
Animation       1576
Biography       1864
Comedy         11797
Crime           4935
Documentary     3419
Drama          18699
Family          1912
Fantasy         1893
Film-Noir        381
Game-Show          1
History         1330
Horror          3752
Music           1208
Musical          775
Mystery         2237
News              77
Reality-TV         2
Romance         5824
Sci-Fi          1870
Short            843
Sport            653
Talk-Show          4
Thriller        4506
War             1107
Western          790
dtype: int64

In [26]:
# Label with top 15 movie genres
top_genre = ['Action', 'Adventure', 'Biography', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']
df_multi_encoder = df_multi_encoder[top_genre]
df_multi_encoder

Unnamed: 0,Action,Adventure,Biography,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36893,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
36894,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
36895,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
36896,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [27]:
# Save multi encoder to csv file 
#df_multi_encoder.to_csv('Movie_Lables.csv')