In [2]:
import numpy as np
import pandas as pd
import string
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings



### Prepare dataframe
##### Reading the dataset

In [3]:
df = pd.read_csv("meta_clean.csv")
df.head(10)

Unnamed: 0,belongs_to_collection,genres,imdb_id,original_language,overview,production_countries,release_date,runtime,spoken_languages,title,Netflix,Hulu,Prime Video,Disney+
0,10194.0,"[16, 35, 10751]",tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",['US'],1995-10-30,81.0,['en'],Toy Story,,,,
1,,"[12, 14, 10751]",tt0113497,en,When siblings Judy and Peter discover an encha...,['US'],1995-12-15,104.0,"['en', 'fr']",Jumanji,,,,
2,119050.0,"[10749, 35]",tt0113228,en,A family wedding reignites the ancient feud be...,['US'],1995-12-22,101.0,['en'],Grumpier Old Men,,,,
3,,"[35, 18, 10749]",tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",['US'],1995-12-22,127.0,['en'],Waiting to Exhale,,,,
4,96871.0,[35],tt0113041,en,Just when George Banks has recovered from his ...,['US'],1995-02-10,106.0,['en'],Father of the Bride Part II,,,,
5,,"[28, 80, 18, 53]",tt0113277,en,"Obsessive master thief, Neil McCauley leads a ...",['US'],1995-12-15,170.0,"['en', 'es']",Heat,,,,
6,,"[35, 10749]",tt0114319,en,An ugly duckling having undergone a remarkable...,"['DE', 'US']",1995-12-15,127.0,"['fr', 'en']",Sabrina,,,,
7,,"[28, 12, 18, 10751]",tt0112302,en,"A mischievous young boy, Tom Sawyer, witnesses...",['US'],1995-12-22,97.0,"['en', 'de']",Tom and Huck,0.0,0.0,0.0,1.0
8,,"[28, 12, 53]",tt0114576,en,International action superstar Jean Claude Van...,['US'],1995-12-22,106.0,['en'],Sudden Death,,,,
9,645.0,"[12, 28, 53]",tt0113189,en,James Bond must unmask the mysterious head of ...,"['GB', 'US']",1995-11-16,130.0,"['en', 'ru', 'es']",GoldenEye,,,,


In [4]:
df.set_index("imdb_id", inplace=True)
print(df.info())
df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 44413 entries, tt0114709 to tt6980792
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4427 non-null   float64
 1   genres                 44413 non-null  object 
 2   original_language      44413 non-null  object 
 3   overview               44413 non-null  object 
 4   production_countries   44413 non-null  object 
 5   release_date           44413 non-null  object 
 6   runtime                44413 non-null  float64
 7   spoken_languages       44413 non-null  object 
 8   title                  44413 non-null  object 
 9   Netflix                2205 non-null   float64
 10  Hulu                   2205 non-null   float64
 11  Prime Video            2205 non-null   float64
 12  Disney+                2205 non-null   float64
dtypes: float64(6), object(7)
memory usage: 4.7+ MB
None


Unnamed: 0_level_0,belongs_to_collection,genres,original_language,overview,production_countries,release_date,runtime,spoken_languages,title,Netflix,Hulu,Prime Video,Disney+
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
tt0114709,10194.0,"[16, 35, 10751]",en,"Led by Woody, Andy's toys live happily in his ...",['US'],1995-10-30,81.0,['en'],Toy Story,,,,
tt0113497,,"[12, 14, 10751]",en,When siblings Judy and Peter discover an encha...,['US'],1995-12-15,104.0,"['en', 'fr']",Jumanji,,,,
tt0113228,119050.0,"[10749, 35]",en,A family wedding reignites the ancient feud be...,['US'],1995-12-22,101.0,['en'],Grumpier Old Men,,,,
tt0114885,,"[35, 18, 10749]",en,"Cheated on, mistreated and stepped on, the wom...",['US'],1995-12-22,127.0,['en'],Waiting to Exhale,,,,
tt0113041,96871.0,[35],en,Just when George Banks has recovered from his ...,['US'],1995-02-10,106.0,['en'],Father of the Bride Part II,,,,
tt0113277,,"[28, 80, 18, 53]",en,"Obsessive master thief, Neil McCauley leads a ...",['US'],1995-12-15,170.0,"['en', 'es']",Heat,,,,
tt0114319,,"[35, 10749]",en,An ugly duckling having undergone a remarkable...,"['DE', 'US']",1995-12-15,127.0,"['fr', 'en']",Sabrina,,,,
tt0112302,,"[28, 12, 18, 10751]",en,"A mischievous young boy, Tom Sawyer, witnesses...",['US'],1995-12-22,97.0,"['en', 'de']",Tom and Huck,0.0,0.0,0.0,1.0
tt0114576,,"[28, 12, 53]",en,International action superstar Jean Claude Van...,['US'],1995-12-22,106.0,['en'],Sudden Death,,,,
tt0113189,645.0,"[12, 28, 53]",en,James Bond must unmask the mysterious head of ...,"['GB', 'US']",1995-11-16,130.0,"['en', 'ru', 'es']",GoldenEye,,,,


In [5]:
df["release_date"] = pd.to_numeric(pd.to_datetime(df["release_date"]))
df.fillna({ "belongs_to_collection": -1 }, inplace=True)
df["belongs_to_collection"] = df["belongs_to_collection"].astype("category")

In [6]:
streaming_services = ["Netflix", "Hulu", "Prime Video", "Disney+"]

num_attribs = df.drop(columns=streaming_services).select_dtypes(include='number').columns.to_list()
cat_attribs = ["belongs_to_collection", "original_language"]
multi_cat_attribs = ["genres", "production_countries", "spoken_languages"]
paragraph_attribs = ["overview", "title"]

print("===== Attributes =====")
print("Numerical: ", num_attribs)
print("Single Categorical: ", cat_attribs)
print("Multi-valued Categorical: ", multi_cat_attribs)
print("Paragraph: ", paragraph_attribs)

===== Attributes =====
Numerical:  ['release_date', 'runtime']
Single Categorical:  ['belongs_to_collection', 'original_language']
Multi-valued Categorical:  ['genres', 'production_countries', 'spoken_languages']
Paragraph:  ['overview', 'title']


There are several entries where the overview is not null but are in whitespaces only, as shown below. Dropping them is necessary for the TfIdfVectorizer step.

In [7]:
for cat in paragraph_attribs:
    res = (df.loc[df[cat].str.isspace()])[cat]
    print(f"{res} results: {res.shape}")

imdb_id
tt0212517     
tt0098347     
tt0094076     
tt1309409     
tt0034886     
Name: overview, dtype: object results: (5,)
Series([], Name: title, dtype: object) results: (0,)


In [8]:
df.drop(df.loc[df["overview"].str.isspace()].index, inplace=True)
for cat in paragraph_attribs:
    res = (df.loc[df[cat].str.isspace()])[cat]
    print(f"{res} results: {res.shape}")

Series([], Name: overview, dtype: object) results: (0,)
Series([], Name: title, dtype: object) results: (0,)


##### Splitting by streaming services

In [9]:
all_df = df.drop(columns=streaming_services)
netflix_df = df[df["Netflix"] == 1].drop(columns=streaming_services)
hulu_df = df[df["Hulu"] == 1].drop(columns=streaming_services)
prime_df = df[df["Prime Video"] == 1].drop(columns=streaming_services)
disney_df = df[df["Disney+"] == 1].drop(columns=streaming_services)

print("===== Dataset Shapes =====")
print("All: ", all_df.shape)
print("Netflix: ", netflix_df.shape)
print("Hulu: ", hulu_df.shape)
print("Prime: ", prime_df.shape)
print("Disney+: ", disney_df.shape)

===== Dataset Shapes =====
All:  (44408, 9)
Netflix:  (600, 9)
Hulu:  (310, 9)
Prime:  (1062, 9)
Disney+:  (315, 9)


### Vectorization Pipeline

In [10]:
class AttributeSelector(BaseEstimator):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

In [11]:
class MultiClassEncoder(BaseEstimator):
    def __init__(self):
        super().__init__()
        self.mlb = CountVectorizer(analyzer=set)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.T
        res = []
        for col in X:
            res.append(self.mlb.fit_transform(col).toarray())
        X = np.hstack(res)
        return X

To reduce dimensionality (and save memory usage), frequency constraints were put in place for the TfIdfVectorizer.
- Filtering out stop words (e.g. "and", "is", "to") is a necessary step. However, the stop words collection provided by sklearn is known to produce inconsistent results, and tend to cut out more than is necessary, therefore it was not used. Instead, a max_df constraint was put in place to filter out words that appear in more than 80% of the strings.
- To remove words that are not likely to create matches between strings, a min_df constraint was put in place to filter out the words that appear in less than 2.5% of the strings.

Additionally, since it is not uncommon for movie titles to use punctuations in unconventional ways (e.g. for acronyms, playful censoring, dash for subtitles), a custom tokenizer was defined to allow for more flexibility in punctuation removal.

In [12]:
class PlotVectorizer(BaseEstimator):
    def __init__(self):
        super().__init__()
        self.tfidf = TfidfVectorizer(stop_words=None, tokenizer=self.tokenizer, max_df=0.8, min_df=0.025)

    # Need to define our own tokenizer to allow for special cases (e.g. "I.Q.")
    def tokenizer(self, text):
        return text.translate(string.punctuation).split(" ")

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.T
        res = []
        for col in X:
            res.append(self.tfidf.fit_transform(col).toarray())
        X = np.hstack(res)
        return X

In [13]:
num_pipeline = Pipeline([
    ('selector', AttributeSelector(num_attribs)),
    ('std_scaler', StandardScaler())
])

In [14]:
cat_pipeline = Pipeline([
    ('selector', AttributeSelector(cat_attribs)),
    ('one_hot', OneHotEncoder(min_frequency=0.01))
])

In [15]:
multi_cat_pipeline = Pipeline([
    ('selector', AttributeSelector(multi_cat_attribs)),
    ('mlb', MultiClassEncoder())
])

In [16]:
paragraph_pipeline = Pipeline([
    ('selector', AttributeSelector(paragraph_attribs)),
    ('plot_vectorize', PlotVectorizer())
])

In [17]:
preparation_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
    ('multi_cat_pipeline', multi_cat_pipeline),
    ('paragraph_pipeline', paragraph_pipeline)
])

In [18]:
all_prepared = preparation_pipeline.fit_transform(all_df)
netflix_prepared = preparation_pipeline.fit_transform(netflix_df)
hulu_prepared = preparation_pipeline.fit_transform(hulu_df)
prime_prepared = preparation_pipeline.fit_transform(prime_df)
disney_prepared = preparation_pipeline.fit_transform(disney_df)

print("All integrated", type(all_prepared), all_prepared.shape)
print("Netflix", type(netflix_prepared), netflix_prepared.shape)
print("Hulu", type(hulu_prepared), hulu_prepared.shape)
print("Prime Video", type(prime_prepared), prime_prepared.shape)
print("Disney+", type(disney_prepared), disney_prepared.shape)



All integrated <class 'scipy.sparse._csr.csr_matrix'> (44408, 267)
Netflix <class 'scipy.sparse._csr.csr_matrix'> (600, 272)
Hulu <class 'scipy.sparse._csr.csr_matrix'> (310, 271)
Prime Video <class 'scipy.sparse._csr.csr_matrix'> (1062, 262)
Disney+ <class 'scipy.sparse._csr.csr_matrix'> (315, 288)




### Predictor

In [19]:
def wrap_df(data, original_df):
    index = original_df.index
    new_df = pd.DataFrame.sparse.from_spmatrix(data)
    return new_df.set_index(index)

In [20]:
all_prepared_df = wrap_df(all_prepared, all_df)
netflix_prepared_df = wrap_df(netflix_prepared, netflix_df)
hulu_prepared_df = wrap_df(hulu_prepared, hulu_df)
prime_prepared_df = wrap_df(prime_prepared, prime_df)
disney_prepared_df = wrap_df(disney_prepared, disney_df)


all_prepared_df.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,257,258,259,260,261,262,263,264,265,266
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0114709,0.146503,-0.374551,0.0,1.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0113497,0.151723,0.238843,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.25156,0.0,0,0.0,0,0.0,0.0
tt0113228,0.152517,0.158835,0.0,1.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0114885,0.152517,0.852238,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0113041,0.116773,0.292182,0.0,1.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.824769,0.565469
tt0113277,0.151723,1.999019,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0114319,0.151723,0.852238,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0112302,0.152517,0.052158,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.184734,0,1.0,0,0.0,0.0
tt0114576,0.152517,0.292182,1.0,0.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0
tt0113189,0.148432,0.932246,0.0,1.0,0,1.0,0,0,0,0,...,0,0,0,0.0,0.0,0,0.0,0,0.0,0.0


In [21]:
class Recommender(BaseEstimator):
    def __init__(self, norm = cosine_similarity, num_recommendations = 10):
        super().__init__()
        self.norm = norm
        self.num_recommendations = num_recommendations

    def fit(self, X, y=None):
        index = X.index
        self.kernel = pd.DataFrame(self.norm(X, X), index=index, columns=index)
        return self
    
    def predict(self, X, y=None):
        #? Need to test multiple values
        sim_scores = self.kernel.loc[X.index.to_list()]
        print(sim_scores)
        sim_scores = sorted(sim_scores, key=lambda e: e[1], reverse=True)
        return sim_scores[1:self.num_recommendations + 1]

### Test Distance Metrics

In [23]:
metrics = {"cosine_similarity": cosine_similarity}


In [None]:
for metric in metrics.items():
    predictions = Recommender(metric[1]).fit(all_prepared_df)
    #? Need to test
    # average_similarity_score = sum(predictions) / len(predictions)
    # print(f"Average {metric[0]}: {average_similarity_score}")


By inspection, we choose {{}} as our distance metric.

In [None]:
metric = cosine_similarity
#! Need change