# Analyse IMDB Movie Data for Title Segmentation & Recommendation
[blog title](blog link)

## Introduction
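This notebook downloads the public IMDb title datasets, cleans and one-hot encodes them, and then uses Amazon SageMaker's PCA and K-Means algorithms to group similar movies into clusters that can serve as a basis for recommendations.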

## Dependencies
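The cells below install and import the libraries used throughout this notebook: pandas, NumPy, Matplotlib, seaborn, scikit-learn, requests, MXNet, boto3 and the SageMaker Python SDK.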

In [None]:
pip install pandas

In [None]:
pip install boto3

In [None]:
pip install sagemaker

In [None]:
pip install mxnet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import requests
import io
import gzip
import mxnet as mx
import os
import boto3
import sagemaker

from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sagemaker import get_execution_role
from sagemaker import PCA
from sagemaker import KMeans

%matplotlib inline
plt.style.use('seaborn')
warnings.filterwarnings('ignore')

## Data Loading
Load the basic title info, alternate title info and title rating info into data frames

In [None]:
def remoteImdbToDf(dataset):
    """Download an IMDb dataset, unpack it locally and load it as a DataFrame.

    The archive ``<dataset>.tsv.gz`` is fetched from datasets.imdbws.com and
    saved to the working directory, decompressed next to it as
    ``<dataset>.tsv``, and that file is then parsed as tab-separated values.

    Args:
        dataset: Dataset name without extension, e.g. ``'title.basics'``.

    Returns:
        pandas.DataFrame holding the contents of the extracted TSV file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import shutil  # local import: only this helper needs it

    url = "https://datasets.imdbws.com/" + dataset + ".tsv.gz"
    archive_path = dataset + '.tsv.gz'
    tsv_path = dataset + '.tsv'

    response = requests.get(url)
    # Fail fast rather than saving an error page as if it were the archive.
    response.raise_for_status()

    # Context managers guarantee the file handles are closed, even on error.
    with open(archive_path, 'wb') as archive_file:
        archive_file.write(response.content)

    # Decompress in chunks so the full uncompressed dataset is never held
    # in memory at once.
    with gzip.open(archive_path, 'rb') as compressed, \
            open(tsv_path, 'wb') as extracted:
        shutil.copyfileobj(compressed, extracted)

    return pd.read_csv(tsv_path, sep='\t')
    

In [None]:
df_titleakas = remoteImdbToDf('title.akas')

In [None]:
df_titlebasics = remoteImdbToDf('title.basics')

In [None]:
df_titleratings = remoteImdbToDf('title.ratings')

## Exploratory data analysis (EDA) – Data cleaning and exploration

In [None]:
df_titlebasics.info()

In [None]:
# Replace the column labels with short lower-case names. The assignment is
# positional, so it relies on the frame having exactly these nine columns in
# this order; note that it also renames columns (e.g. the third one becomes
# 'title'), it does not merely lower-case them.
df_titlebasics.columns = ['titleid','type','title','originaltitle','isadult','startyear','endyear','length','genres']

In [None]:
df_titlebasics.head()

In [None]:
# we're probably only going to want titles that are movies for the recommendation, let's check this field does that
df_titlebasics[df_titlebasics['type']=='movie'].sample(5)

In [None]:
# Replace the column labels with short lower-case names. The assignment is
# positional, so it relies on the frame having exactly these eight columns in
# this order; the first column is named 'titleid' so it can be used as the
# merge key with the basic title info.
df_titleakas.columns = ['titleid','ordering','title','region','language','types','attributes','isoriginaltitle']

In [None]:
df_titleakas.sample(5)

In [None]:
# Join the basic title info with the alternate-title rows on their shared
# 'titleid' key (default inner join). Column names present in both frames,
# such as 'title', receive the default _x/_y suffixes.
df_titlesfull = df_titlebasics.merge(df_titleakas, on='titleid')

In [None]:
# we don't need this data anymore, all required info is in the new data frame
del df_titlebasics
del df_titleakas

In [None]:
df_titlesfull.head(15)

In [None]:
df_titleratings.info()

In [None]:
# make the column names lower case to make it easier to work with
df_titleratings.columns = ['titleid', 'averagerating', 'numvotes']

In [None]:
plt.hist(df_titleratings['averagerating'], edgecolor='black')

In [None]:
df_titleratings.sample(5)

In [None]:
# Attach the ratings to the combined title info via the shared 'titleid' key
# (default inner join, so titles without a ratings row are dropped).
df_titles = df_titlesfull.merge(df_titleratings, on='titleid')

In [None]:
# we don't need this data anymore, all required info is in the new data frame
del df_titleratings
del df_titlesfull

In [None]:
df_titles.head(15)

In [None]:
df_titles.shape

In [None]:
# A title appears once per language/region variant after the merge. Keep a
# single row per titleid so that titles with many variants are not over-counted.
# NOTE(review): drop_duplicates keeps the FIRST row found for each titleid; the
# 'isoriginaltitle' column is not consulted, so the surviving row (and hence its
# 'language' value) is not guaranteed to be the original title - confirm this
# is acceptable.
df_titles = df_titles.drop_duplicates(subset=['titleid'])

In [None]:
df_titles.head(15)

In [None]:
df_titles.shape

In [None]:
# now let's get only movies
df_titles = df_titles[df_titles['type']=='movie']

In [None]:
df_titles.shape

In [None]:
df_titles.head()

In [None]:
# Discard the columns that are not used as clustering features.
# ('region' may be worth revisiting later; it would need one-hot encoding and
# its \N values removed first.)
df_titles.drop(
    columns=[
        'originaltitle', 'ordering', 'title_y', 'type',
        'types', 'region', 'attributes',
        'isoriginaltitle', 'endyear',
    ],
    inplace=True,
)

In [None]:
df_titles.shape

In [None]:
df_titles.head()

In [None]:
# Give the nine remaining columns their final names. The assignment is
# positional, so it depends on the column order left by the merges and the drop
# above; it strips the merge suffix from the title column and shortens the
# start-year column to 'year'.
df_titles.columns = ['titleid', 'title', 'isadult',
                     'year','length','genres',
                     'language','averagerating',
                     'numvotes']

In [None]:
df_titles.head()

In [None]:
df_titles.info()

In [None]:
# Keep only the rows whose year is known; a missing value is stored as the
# literal string '\N'.
df_titles = df_titles.loc[df_titles['year'] != '\\N']

In [None]:
#  convert the year value (string) to integer for easy scaling and comparison
df_titles['year'] = df_titles['year'].astype(int)

In [None]:
# Number of titles per release year. Both the x positions and the bar heights
# are taken from the same year-sorted Series so that every bar is paired with
# its own year (unique() returns years in order of first appearance, which does
# not line up with the sorted counts).
titles_per_year = df_titles.year.value_counts().sort_index()
plt.bar(titles_per_year.index, titles_per_year.values)

In [None]:
# Average rating against number of votes, one point per title.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_titles, x='numvotes', y='averagerating')
plt.xlabel('number of votes')
plt.ylabel('average rating of movie')

In [None]:
# Number of votes against release year, one point per title.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_titles, x='year', y='numvotes')
plt.ylabel('number of votes')
plt.xlabel('year')

## Data modelling

The 'genres' column contains non-numerical, comma-separated values listing zero, one or multiple genres for each title. To make this information comparable (and useful for clustering), it needs to be converted into a numerical flag (0 or 1) for each genre. To do this, we use a process called 'one-hot encoding'.

In [None]:
df_titles.genres.unique()

In [None]:
df_titles.shape

In [None]:
# Split each comma-separated genre string into a list of genres and keep it
# in a helper column.
df_titles['genres_list'] = df_titles['genres'].str.split(',')
df_titles.head()

In [None]:
df_titles.shape

In [None]:
# One-hot encode the genres: re-join every title's genre list with '|' (the
# separator get_dummies splits on) and emit one 0/1 column per distinct genre,
# each prefixed with 'genre_'. The resulting table is relatively sparse.
genres_one_hot_encoded = (
    df_titles['genres_list']
    .str.join('|')
    .str.get_dummies(sep='|')
    .add_prefix('genre_')
)
genres_one_hot_encoded.head()

In [None]:
# let's add these new columns to the original full data
df_titles = pd.concat([df_titles, genres_one_hot_encoded], axis=1, sort=False)

In [None]:
# One-hot encode the language: one indicator column per distinct value,
# each prefixed with 'language_'.
language_one_hot_encoded = pd.get_dummies(
    df_titles['language'],
    prefix='language',
)
language_one_hot_encoded.head()

In [None]:
# let's add these new columns to the original full data
df_titles = pd.concat([df_titles, language_one_hot_encoded], axis=1, sort=False)

In [None]:
# The text columns have been replaced by their one-hot encodings, so remove
# them, together with the indicator columns that stand for the null marker \N.
df_titles.drop(
    columns=['genres', 'language', 'genres_list', 'genre_\\N', 'language_\\N'],
    inplace=True,
)

In [None]:
df_titles.titleid.unique().shape

In [None]:
df_titles.titleid.shape

In [None]:
df_titles.title.unique().shape

In [None]:
df_titles.title.shape

In [None]:
# Label every row with "<titleid> <title>" and then remove those two columns,
# so that only numerical feature columns remain for each title.
df_titles.index = df_titles['titleid'] + " " + df_titles['title']
df_titles.drop(columns=['titleid', 'title'], inplace=True)

In [None]:
# Discard titles whose length is missing (stored as the literal string '\N')...
df_titles = df_titles.loc[df_titles['length'] != '\\N']

# ...then cast the remaining lengths to integers so they can be scaled and compared.
df_titles['length'] = df_titles['length'].astype(int)

In [None]:
# Narrow the data set to titles that are 31-359 minutes long, have more than
# 50,000 votes, were released after 1970 and are rated above 6.0.
df_titles = df_titles[
    df_titles['length'].gt(30)
    & df_titles['length'].lt(360)
    & df_titles['numvotes'].gt(50000)
    & df_titles['year'].gt(1970)
    & df_titles['averagerating'].gt(6.0)
]

### Feature engineering

In [None]:
# Data scaling: distance-based methods need every feature on a common scale,
# otherwise columns with large ranges dominate the distances. MinMaxScaler
# rescales each column to the 0-1 range; the result is wrapped back into a
# DataFrame that keeps the original column names and row labels.
scaler = MinMaxScaler()
df_titles_scaled = pd.DataFrame(
    scaler.fit_transform(df_titles),
    columns=df_titles.columns,
    index=df_titles.index,
)

In [None]:
df_titles_scaled.describe()

In [None]:
df_titles_scaled.shape

In [None]:
# Get the current service/execution role (ensure it has Sagemaker execute permissions)
role = get_execution_role()

In [None]:
# Settings for the PCA training job. Change bucket_name to an S3 bucket you
# own; the trained model is written beneath its 'titles/' prefix.
bucket_name = 'simontest-2020-10-24'
num_components = 95

pca_SM = PCA(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path='s3://' + bucket_name + '/titles/',
    num_components=num_components,
)

In [None]:
train_data = df_titles_scaled.values.astype('float32')

In [None]:
%%time
pca_SM.fit(pca_SM.record_set(train_data))

In [None]:
# Locate the artefact written by the PCA training job, download it from S3
# and unpack it in the working directory.
job_name = pca_SM._current_job_name
model_key = "titles/" + job_name + "/output/model.tar.gz"

model_bucket = boto3.resource('s3').Bucket(bucket_name)
model_bucket.download_file(model_key, 'model.tar.gz')

os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

In [None]:
pca_model_params = mx.ndarray.load('model_algo-1')

In [None]:
# Wrap two arrays from the loaded PCA model in DataFrames for easier slicing.
# NOTE(review): 's' and 'v' are presumably the singular values and the
# principal-component matrix - confirm against the SageMaker PCA model format.
s=pd.DataFrame(pca_model_params['s'].asnumpy())
v=pd.DataFrame(pca_model_params['v'].asnumpy())

In [None]:
# Share of the total sum of squared `s` values contributed by rows 75 onward.
# Presumably this is the fraction of variance explained by those components - TODO confirm.
s.iloc[75:,:].apply(lambda x: x*x).sum()/s.apply(lambda x: x*x).sum()

In [None]:
# Keep rows 75 onward of `s` and the matching columns of `v`, then relabel
# those twenty columns 0..19.
s_20 = s.iloc[75:, :]
v_20 = v.iloc[:, 75:]
v_20.columns = list(range(20))

In [None]:
# Show the ten features that carry the largest absolute weight in one of the
# retained components. component_num selects the component; it is looked up as
# column (20 - component_num) of v_20.
component_num = 18

first_comp = v_20[20 - component_num]
comps = pd.DataFrame(
    list(zip(first_comp, df_titles_scaled.columns)),
    columns=['weights', 'features'],
)
comps['abs_weights'] = comps['weights'].abs()

top_features = comps.sort_values('abs_weights', ascending=False).head(10)
ax = sns.barplot(data=top_features, x="weights", y="features", palette="Blues_d")
ax.set_title("PCA Component Makeup: #" + str(component_num))
plt.show()

In [None]:
# Human-readable names for the twenty retained PCA components, in column order.
# These were chosen by inspecting the component make-up and must be reviewed
# whenever the underlying data changes.
# NOTE(review): 'Sci-Fi/Mistery' looks like a typo for 'Mystery'; it is kept
# unchanged here because it is used as a column label at runtime.
PCA_list = [
    'Romance/Drama',
    'Crime/Action/Thriller',
    'Adventure/Drama',
    'Horror/Thriller',
    'Action/Romance',
    'Sci-Fi/Horror/Mystery',
    'Sci-Fi/Mistery',
    'Romance/Mystery/Crime',
    'Mystery/Drama/Comedy',
    'Sci-Fi/Fantasy',
    'EN-Lang/Mystery/Fantasy',
    'Foreign/Older/Mystery',
    'Recent/Bio/Adventure',
    'Older/EN-Lang/Adventure',
    'Popular/Fantasy/Animation',
    'Older/Bio/Animation',
    'TR-Lang/Popular/Recent',
    'FR-Lang/Popular',
    'JA-Lang/Music/Sport',
    'JA-Lang/FR-Lang/Popular',
]

In [None]:
%%time
pca_predictor = pca_SM.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

In [None]:
%%time
result = pca_predictor.predict(train_data)
df_titles_transformed=pd.DataFrame()
for a in result:
    b=a.label['projection'].float32_tensor.values
    df_titles_transformed=df_titles_transformed.append([list(b)])
df_titles_transformed.index=df_titles_scaled.index
df_titles_transformed=df_titles_transformed.iloc[:,75:]
df_titles_transformed.columns=PCA_list

In [None]:
df_titles_transformed.head()

In [None]:
train_data = df_titles_transformed.values.astype('float32')

In [None]:
# Configure the K-Means training job. A larger number of clusters gives
# smaller groups, which will hopefully lead to better recommendations.
num_clusters = 25
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path='s3://' + bucket_name + '/titles/',
    k=num_clusters,
)

In [None]:
%%time
kmeans.fit(kmeans.record_set(train_data))

In [None]:
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

In [None]:
%%time
result=kmeans_predictor.predict(train_data)

In [None]:
# Extract the closest-cluster id from every prediction record, then count how
# many titles were assigned to each cluster.
cluster_labels = [
    record.label['closest_cluster'].float32_tensor.values[0]
    for record in result
]
pd.DataFrame(cluster_labels)[0].value_counts()

In [None]:
# Histogram of how many titles fall into each cluster.
# plt.subplots returns a (figure, axes) tuple, so unpack it and draw on the
# axes explicitly instead of binding the tuple to `ax` and overwriting it.
fig, ax = plt.subplots(figsize=(6, 3))
ax = sns.distplot(cluster_labels, kde=False, ax=ax)
title = "Histogram of Cluster Counts"
ax.set_title(title, fontsize=12)
plt.show()

## Drawing conclusions from our modelling

In [None]:
# Fetch and unpack the trained K-Means model so its centroids can be inspected.
# To look at an earlier training run instead, set job_name to that SageMaker
# K-Means job's name.
job_name = kmeans._current_job_name
model_key = "titles/" + job_name + "/output/model.tar.gz"

model_bucket = boto3.resource('s3').Bucket(bucket_name)
model_bucket.download_file(model_key, 'model.tar.gz')

os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

Kmeans_model_params = mx.ndarray.load('model_algo-1')

In [None]:
# Put the first array of the loaded K-Means model into a DataFrame whose
# columns carry the same names as the transformed title data.
cluster_centroids = pd.DataFrame(
    Kmeans_model_params[0].asnumpy(),
    columns=df_titles_transformed.columns,
)
cluster_centroids

In [None]:
# Heat map of the centroids: one column per cluster, one row per component.
plt.figure(figsize=(16, 6))
ax = sns.heatmap(cluster_centroids.T, cmap='YlGnBu')
ax.set_title("Attribute Value by Centroid")
ax.set_xlabel("Cluster")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()

In [None]:
# Store each title's cluster id, converted to a plain integer, next to its
# component values.
df_titles_transformed['labels'] = [int(label) for label in cluster_labels]
df_titles_transformed.head()

In [None]:
cluster=df_titles_transformed[df_titles_transformed['labels']==20]
cluster.sample(10)

In [None]:
df_titles_transformed

In [None]:
# Delete both inference endpoints now that the analysis is finished.
session = sagemaker.Session()
for predictor in (pca_predictor, kmeans_predictor):
    session.delete_endpoint(predictor.endpoint)