# Recommendation System with Similarity Function

In [1]:
import pandas as pd
import numpy as np

In [2]:
movie_rating_df = pd.read_csv('datasets/movie_rating_df.csv')
actor = pd.read_csv('datasets/actor_name.csv')
director_writers = pd.read_csv('datasets/directors_writers.csv')

In [3]:
movie_rating_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short",5.6,1608
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short",6.0,197
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance",6.5,1285
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,12.0,"Animation,Short",6.1,121
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short",6.1,2050


In [4]:
movie_rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751614 entries, 0 to 751613
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          751614 non-null  object 
 1   titleType       751614 non-null  object 
 2   primaryTitle    751614 non-null  object 
 3   originalTitle   751614 non-null  object 
 4   isAdult         751614 non-null  int64  
 5   startYear       751614 non-null  float64
 6   endYear         16072 non-null   float64
 7   runtimeMinutes  751614 non-null  float64
 8   genres          486766 non-null  object 
 9   averageRating   751614 non-null  float64
 10  numVotes        751614 non-null  int64  
dtypes: float64(4), int64(2), object(5)
memory usage: 63.1+ MB


In [5]:
actor.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm1774132,Nathan McLaughlin,1973,\N,"special_effects,make_up_department","tt0417686,tt1713976,tt1891860,tt0454839"
1,nm10683464,Bridge Andrew,\N,\N,actor,tt7718088
2,nm1021485,Brandon Fransvaag,\N,\N,miscellaneous,tt0168790
3,nm6940929,Erwin van der Lely,\N,\N,miscellaneous,tt4232168
4,nm5764974,Svetlana Shypitsyna,\N,\N,actress,tt3014168


In [6]:
actor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   nconst             1000 non-null   object
 1   primaryName        1000 non-null   object
 2   birthYear          1000 non-null   object
 3   deathYear          1000 non-null   object
 4   primaryProfession  891 non-null    object
 5   knownForTitles     1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


In [7]:
director_writers.head()

Unnamed: 0,tconst,director_name,writer_name
0,tt0011414,David Kirkland,"John Emerson,Anita Loos"
1,tt0011890,Roy William Neill,"Arthur F. Goodrich,Burns Mantle,Mary Murillo"
2,tt0014341,"Buster Keaton,John G. Blystone","Jean C. Havez,Clyde Bruckman,Joseph A. Mitchell"
3,tt0018054,Cecil B. DeMille,Jeanie Macpherson
4,tt0024151,James Cruze,"Max Miller,Wells Root,Jack Jevne"


In [8]:
director_writers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tconst         986 non-null    object
 1   director_name  986 non-null    object
 2   writer_name    986 non-null    object
dtypes: object(3)
memory usage: 23.2+ KB


## Data Processing

In [9]:
# Transform director_name and writer_name into list
director_writers['director_name'] = director_writers['director_name'].apply(lambda row: row.split(','))
director_writers['writer_name'] = director_writers['writer_name'].apply(lambda row: row.split(','))

In [10]:
# We only need columns nconst, primaryName, and knownForTitles
actor = actor[['nconst','primaryName','knownForTitles']]

In [11]:
# Checking variation
print(actor['knownForTitles'].apply(lambda x: len(x.split(','))).unique())

# Transform knownForTitles to list of list
actor['knownForTitles'] = actor['knownForTitles'].apply(lambda x: x.split(','))

[4 1 2 3]


In [12]:
actor.head()

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm1774132,Nathan McLaughlin,"[tt0417686, tt1713976, tt1891860, tt0454839]"
1,nm10683464,Bridge Andrew,[tt7718088]
2,nm1021485,Brandon Fransvaag,[tt0168790]
3,nm6940929,Erwin van der Lely,[tt4232168]
4,nm5764974,Svetlana Shypitsyna,[tt3014168]


### 1-to-1 Correspondence 

In [13]:
df_uni = []

for x in ['knownForTitles']:
    #repeats the index of each row until each element of knownForTitles
    idx = actor.index.repeat(actor['knownForTitles'].str.len())
   
   #splits the values from the list in each row and combines them with other rows into a dataframe
    df1 = pd.DataFrame({
        x: np.concatenate(actor[x].values)
    })
    
    df1.index = idx
    df_uni.append(df1)
    
df_concat = pd.concat(df_uni, axis=1)
unnested_df = df_concat.join(actor.drop(['knownForTitles'], 1), how='left')
unnested_df = unnested_df[actor.columns.tolist()]

  unnested_df = df_concat.join(actor.drop(['knownForTitles'], 1), how='left')


## Nesting primaryName group by knownForTitles

In [14]:
unnested_drop = unnested_df.drop(['nconst'], axis=1)

df_uni = []

for col in ['primaryName']:
    #PrimaryName column aggregation according to group_col defined above
    dfi = unnested_drop.groupby(['knownForTitles'])[col].apply(list)
    df_uni.append(dfi)
df_grouped = pd.concat(df_uni, axis=1).reset_index()
df_grouped.columns = ['knownForTitles','cast_name']
df_grouped

Unnamed: 0,knownForTitles,cast_name
0,tt0008125,[Charles Harley]
1,tt0009706,[Charles Harley]
2,tt0010304,[Natalie Talmadge]
3,tt0011414,[Natalie Talmadge]
4,tt0011890,[Natalie Talmadge]
...,...,...
1893,tt9610496,[Stefano Baffetti]
1894,tt9714030,[Kevin Kain]
1895,tt9741820,[Caroline Plyler]
1896,tt9759814,[Ethan Francis]


## Joining with Movie Table

In [15]:
#join between movie table and actor table
base_df = pd.merge(df_grouped, movie_rating_df, left_on='knownForTitles', right_on='tconst', how='inner')

#join between base_df and director_writer table
base_df = pd.merge(base_df, director_writers, left_on='tconst', right_on='tconst', how='left')

base_df.head()

Unnamed: 0,knownForTitles,cast_name,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director_name,writer_name
0,tt0011414,[Natalie Talmadge],tt0011414,movie,The Love Expert,The Love Expert,0,1920.0,,60.0,"Comedy,Romance",4.9,136,[David Kirkland],"[John Emerson, Anita Loos]"
1,tt0011890,[Natalie Talmadge],tt0011890,movie,Yes or No,Yes or No,0,1920.0,,72.0,,6.3,7,[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,tt0014341,[Natalie Talmadge],tt0014341,movie,Our Hospitality,Our Hospitality,0,1923.0,,65.0,"Comedy,Romance,Thriller",7.8,9621,"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,tt0018054,[Reeka Roberts],tt0018054,movie,The King of Kings,The King of Kings,0,1927.0,,155.0,"Biography,Drama,History",7.3,1826,[Cecil B. DeMille],[Jeanie Macpherson]
4,tt0024151,[James Hackett],tt0024151,movie,I Cover the Waterfront,I Cover the Waterfront,0,1933.0,,80.0,"Drama,Romance",6.3,455,[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


In [16]:
#drop column knownForTitles
base_drop = base_df.drop(['knownForTitles'], axis=1)

#change NULL value in column genres with 'Unknown'
base_drop['genres'] = base_drop['genres'].fillna('unknown')

base_drop.isna().sum()

cast_name           0
tconst              0
titleType           0
primaryTitle        0
originalTitle       0
isAdult             0
startYear           0
endYear           950
runtimeMinutes      0
genres              0
averageRating       0
numVotes            0
director_name      74
writer_name        74
dtype: int64

In [17]:
#change NULL value in columns director_name and writer_name with 'Unknown'
base_drop[['director_name','writer_name']] = base_drop[['director_name','writer_name']].fillna('unknown')

#transform genres column value to list of list
base_drop['genres'] = base_drop['genres'].apply(lambda x: x.split(','))

In [18]:
base_drop2 = base_drop.drop(['tconst','isAdult','endYear','originalTitle'], axis=1)
base_drop2 = base_drop2[['primaryTitle','titleType','startYear','runtimeMinutes','genres','averageRating','numVotes','cast_name','director_name','writer_name']]
base_drop2.columns = ['title','type','start','duration','genres','rating','votes','cast_name','director_name','writer_name']
base_drop2.head()

Unnamed: 0,title,type,start,duration,genres,rating,votes,cast_name,director_name,writer_name
0,The Love Expert,movie,1920.0,60.0,"[Comedy, Romance]",4.9,136,[Natalie Talmadge],[David Kirkland],"[John Emerson, Anita Loos]"
1,Yes or No,movie,1920.0,72.0,[unknown],6.3,7,[Natalie Talmadge],[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,Our Hospitality,movie,1923.0,65.0,"[Comedy, Romance, Thriller]",7.8,9621,[Natalie Talmadge],"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,The King of Kings,movie,1927.0,155.0,"[Biography, Drama, History]",7.3,1826,[Reeka Roberts],[Cecil B. DeMille],[Jeanie Macpherson]
4,I Cover the Waterfront,movie,1933.0,80.0,"[Drama, Romance]",6.3,455,[James Hackett],[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


## Creating Content-based Recommender System

In [19]:
#Metadata
feature_df = base_drop2[['title','cast_name','genres','director_name','writer_name']]
feature_df.head()

Unnamed: 0,title,cast_name,genres,director_name,writer_name
0,The Love Expert,[Natalie Talmadge],"[Comedy, Romance]",[David Kirkland],"[John Emerson, Anita Loos]"
1,Yes or No,[Natalie Talmadge],[unknown],[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,Our Hospitality,[Natalie Talmadge],"[Comedy, Romance, Thriller]","[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,The King of Kings,[Reeka Roberts],"[Biography, Drama, History]",[Cecil B. DeMille],[Jeanie Macpherson]
4,I Cover the Waterfront,[James Hackett],"[Drama, Romance]",[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


In [20]:
def sanitize(x):
    try:
        #if cell contains list
        if isinstance(x, list):
            return [i.replace(' ','').lower() for i in x]
        #if cell contains string
        else:
            return [x.replace(' ','').lower()]
    except:
        print(x)
        
feature_cols = ['cast_name','genres','writer_name','director_name']

#Apply function sanitize 
for col in feature_cols:
    feature_df[col] = feature_df[col].apply(sanitize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df[col] = feature_df[col].apply(sanitize)


In [21]:
def soup_feature(x):
    return ' '.join(x['cast_name']) + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['director_name']) + ' ' + ' '.join(x['writer_name'])

feature_df['soup'] = feature_df.apply(soup_feature, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df['soup'] = feature_df.apply(soup_feature, axis=1)


In [22]:
feature_df['soup'][0]

'natalietalmadge comedy romance davidkirkland johnemerson anitaloos'

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(feature_df['soup'])

print(count)
print(count_matrix.shape)

CountVectorizer(stop_words='english')
(1060, 10026)


### Build model with cosine similarity

Formula:<br>
<img src='img/math.svg' width=400px>

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

print(cosine_sim)

[[1.         0.15430335 0.35355339 ... 0.         0.         0.13608276]
 [0.15430335 1.         0.10910895 ... 0.         0.         0.        ]
 [0.35355339 0.10910895 1.         ... 0.         0.08703883 0.09622504]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.08703883 ... 0.         1.         0.10050378]
 [0.13608276 0.         0.09622504 ... 0.         0.10050378 1.        ]]


Content Recommender

In [25]:
indices = pd.Series(feature_df.index, index=feature_df['title']).drop_duplicates()

def content_recommender(title):
    idx = indices[title]
    sim_score = list(enumerate(cosine_sim[idx]))
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
    sim_score = sim_score[1:11]

    movie_indices = [i[0] for i in sim_score]
    return base_df.iloc[movie_indices]

print("Other Movie Recommendations besides The Lion King:")
content_recommender('The Lion King').reset_index(drop=True)

Other Movie Recommendations besides The Lion King:


Unnamed: 0,knownForTitles,cast_name,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director_name,writer_name
0,tt3040964,[Cristina Carrión Márquez],tt3040964,movie,The Jungle Book,The Jungle Book,0,2016.0,,106.0,"Adventure,Drama,Family",7.4,250994,[Jon Favreau],"[Justin Marks, Rudyard Kipling]"
1,tt0286336,[Francisco Bretas],tt0286336,tvSeries,The Animals of Farthing Wood,The Animals of Farthing Wood,0,1993.0,1995.0,25.0,"Adventure,Animation,Drama",8.3,3057,"[Elphin Lloyd-Jones, Philippe Leclerc]","[Valerie Georgeson, Colin Dann, Jenny McDade, ..."
2,tt7222086,[Hiroki Matsukawa],tt7222086,tvSeries,Made in Abyss,Made in Abyss,0,2017.0,,325.0,"Adventure,Animation,Drama",8.4,4577,"[Masayuki Kojima, Hitoshi Haga, Shinya Iino, T...","[Akihito Tsukushi, Keigo Koyanagi, Hideyuki Ku..."
3,tt0075147,[Joaquín Parra],tt0075147,movie,Robin and Marian,Robin and Marian,0,1976.0,,106.0,"Adventure,Drama,Romance",6.5,10830,[Richard Lester],[James Goldman]
4,tt0119051,[Chris Kosloski],tt0119051,movie,The Edge,The Edge,0,1997.0,,117.0,"Action,Adventure,Drama",6.9,65673,[Lee Tamahori],[David Mamet]
5,tt10068158,[Hiroki Matsukawa],tt10068158,movie,Made in Abyss: Journey's Dawn,Made in Abyss: Tabidachi no Yoake,0,2019.0,,139.0,"Adventure,Animation,Fantasy",7.4,81,[Masayuki Kojima],[Akihito Tsukushi]
6,tt0028657,[Bernard Loftus],tt0028657,movie,Boss of Lonely Valley,Boss of Lonely Valley,0,1937.0,,60.0,"Action,Adventure,Drama",6.2,41,[Ray Taylor],"[Frances Guihan, Forrest Brown]"
7,tt0107875,[Simon Mayal],tt0107875,movie,The Princess and the Goblin,The Princess and the Goblin,0,1991.0,,82.0,"Adventure,Animation,Comedy",6.8,2350,[József Gémes],"[Robin Lyons, George MacDonald]"
8,tt2356464,[Sina Müller],tt2356464,movie,Ostwind,Ostwind,0,2013.0,,101.0,"Adventure,Drama,Family",6.8,1350,[Katja von Garnier],"[Kristina Magdalena Henn, Lea Schmidbauer]"
9,tt6270328,[Jo Boag],tt6270328,tvSeries,The Skinner Boys: Guardians of the Lost Secrets,The Skinner Boys: Guardians of the Lost Secrets,0,2014.0,,23.0,"Adventure,Animation,Drama",7.8,12,"[Pablo De La Torre, Eugene Linkov, Jo Boag]","[David Witt, John Derevlany, David Evans, Pete..."
