# Import the necessary libraries

In [15]:
from babelfish import *
from subliminal import *
from imdb import IMDb
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.context import SparkContext
import pysrt
import pickle

# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

# Mapping functions for extracting the age rating

These functions map a movie title to its Swedish age rating: title → IMDb ID → parents-guide certifications → Swedish rating.

In [3]:
def movie_name_to_id(movie):
    """Return the IMDb ID of the first search hit for a movie title.

    Parameters
    ----------
    movie : str
        Free-text movie title handed to the IMDb search.

    Returns
    -------
    The value of ``getID()`` on the first search result.

    Raises
    ------
    IndexError
        If the search yields no results for ``movie``.
    """
    # Local import kept from the original; presumably so the function is
    # self-contained when it runs inside a Spark task -- TODO confirm.
    from imdb import IMDb
    ia = IMDb()
    results = ia.search_movie(movie)
    if not results:
        # Same exception type that indexing an empty result list produced
        # before, but with a message that names the offending title.
        raise IndexError("no IMDb search results for movie %r" % (movie,))
    return results[0].getID()
    
def id_to_parents_guide(movie_id):
    """Look up the certification entries in a movie's IMDb parents guide.

    Parameters
    ----------
    movie_id
        IMDb movie ID, as returned by ``movie_name_to_id``.

    Returns
    -------
    The ``['data']['certification']`` entry of the parents-guide response.
    """
    from imdb import IMDb
    client = IMDb()
    guide = client.get_movie_parents_guide(movie_id)
    return guide['data']['certification']

def get_swedish_rating(ratings):
    """Extract the Swedish rating from a list of certification strings.

    Each entry is expected to look like ``"Country:rating[:...]"``. The first
    entry containing ``"Sweden"`` that also carries a ``:``-separated value
    determines the result.

    Parameters
    ----------
    ratings : iterable of str
        Certification strings, e.g. ``["USA:R", "Sweden:15"]``.

    Returns
    -------
    str or int
        The text between the first and second ``:`` of the matching entry,
        or the sentinel ``-1`` when no usable Swedish entry exists.
    """
    for rating in ratings:
        if "Sweden" not in rating:
            continue
        fields = rating.split(':')
        # An entry that mentions Sweden but has no ':' carries no rating
        # value; skip it instead of failing on fields[1].
        if len(fields) > 1:
            return fields[1]
    return -1

# Map movie name to subtitle vector

These functions map a movie name to the subtitle-based feature vector used to train the model.

In [None]:
def get_movie_for_subtitle(movie_name):
    """Build the ``Video`` object used for the subtitle lookup.

    Parameters
    ----------
    movie_name : str
        Name passed straight to ``Video.fromname``.

    Returns
    -------
    The object returned by ``Video.fromname(movie_name)``.
    """
    return Video.fromname(movie_name)
    
def map_to_subtitle(video):
    """Download the first English subtitle listed for a video and return its text.

    Parameters
    ----------
    video
        Video object, as produced by ``get_movie_for_subtitle``.

    Returns
    -------
    The ``text`` attribute of the downloaded subtitle.

    Raises
    ------
    IndexError
        If no English subtitles are listed for ``video``.
    """
    subtitles = list_subtitles([video], {Language('eng')})
    candidates = subtitles[video]
    if not candidates:
        # Same exception type as indexing the empty list, but the message
        # identifies which video had no subtitles.
        raise IndexError("no English subtitles found for %r" % (video,))
    subtitle = candidates[0]
    # Fetches the subtitle content so that subtitle.text is populated.
    download_subtitles([subtitle])
    return subtitle.text

def trim_srt(subtitle_text):
    """Parse SRT-formatted text with pysrt and return its ``text`` attribute.

    Parameters
    ----------
    subtitle_text : str
        Raw subtitle content in SRT format.

    Returns
    -------
    The ``text`` attribute of the parsed subtitle file.
    """
    import pysrt
    parsed = pysrt.from_string(subtitle_text)
    return parsed.text

def sub_to_vec(subtitle, n_features=2**4):
    """Hash a subtitle text into a fixed-width feature vector.

    Parameters
    ----------
    subtitle : str
        Subtitle text to vectorize; it is treated as a single document.
    n_features : int, optional
        Width of the hashed feature space. Defaults to ``2**4``, the value
        that was previously hard-coded. Training and prediction must use the
        same value, otherwise the vector widths will not match.

    Returns
    -------
    The result of ``HashingVectorizer.fit_transform`` on the one-document
    list ``[subtitle]``; callers in this notebook densify it with
    ``.toarray()[0]``.
    """
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(n_features=n_features)
    return vectorizer.fit_transform([subtitle])

# Mapping procedure

The movies in the `movie_list` array are used for gathering training data. Some movies are not available in the API and may cause an error; in that case, use a different movie name.

In [6]:
# Titles used to gather training data.
movie_list = ['matrix', 'lion king', 'pulp fiction', 'titanic', 'toy story', 'deadpool', 'bad boys']

# NOTE(review): `ia` is not referenced by the pipeline below; movie_name_to_id
# and id_to_parents_guide each construct their own IMDb client.
ia = IMDb()

# Each step maps a (rating-side, subtitle-side) pair one stage further:
#   name -> (imdb id, video)
#        -> (certification list, video)
#        -> (swedish rating, raw subtitle text)
#        -> (swedish rating, subtitle text parsed by pysrt)
#        -> (swedish rating, hashed feature vector)
movie_list_parallel = sc.parallelize(movie_list)
movie_ids = movie_list_parallel.map(lambda x: (movie_name_to_id(x), get_movie_for_subtitle(x)))
movie_parent_guides = movie_ids.map(lambda x: (id_to_parents_guide(x[0]), x[1]))
sweden_rating = movie_parent_guides.map(lambda x: (get_swedish_rating(x[0]), map_to_subtitle(x[1])))
sweden_rating_clean = sweden_rating.map(lambda x: (x[0], trim_srt(x[1])))
sweden_rating_vectorized = sweden_rating_clean.map(lambda x: (x[0], sub_to_vec(x[1])))

# Data transformation

Collect the data on the master node and transform it into input data for the model.

In [9]:
# Mark the RDD for caching, then pull every (rating, vector) pair to the driver.
sweden_rating_vectorized.persist()
data = sweden_rating_vectorized.collect()

# Build the feature matrix X and target list y for the regressor.
X = []
y = []
for row in data:
    # get_swedish_rating returns the sentinel -1 when no Swedish rating was
    # found; using it as a regression target would corrupt the model.
    if row[0] == -1:
        print("Skipping row without a Swedish rating")
        continue
    try:
        rating = int(row[0])
    except (TypeError, ValueError):
        # Non-numeric certification text cannot be used as a numeric target.
        print("Skipping row with non-numeric rating: %r" % (row[0],))
        continue
    X.append(row[1].toarray()[0])
    y.append(rating)

# Model training

In this section we train a simple `RandomForestRegressor` using the data collected in the previous steps. 

It should be noted that training is not the main part of the project, and the model can be provided from any external source. The current model might not be the best-optimized model for this data.

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Shallow 100-tree forest with a fixed random seed, fitted on the collected X and y.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)
regr.fit(X, y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

# Example

A simple example to test the model

In [13]:
test_str = 'uncle'

# Vectorize the sample text exactly as the training data was vectorized,
# keeping only the single row as a 1-D array.
vector = sub_to_vec(test_str).toarray()[0]

# predict() expects a 2-D (samples x features) input, so add a leading axis.
print(regr.predict(vector[None, :]))

[11.92]


# Save the model

In this cell we save the model to be used for the next part of the project.

In [16]:
filename = 'model.sav'
# Use a context manager so the file is flushed and closed once the model has
# been written; the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)