In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
movie_data = pd.read_csv("imdb_movies.csv")
movie_data.shape

(10178, 12)

In [3]:
movie_data.head(5)

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [4]:
movie_data.describe()

Unnamed: 0,score,budget_x,revenue
count,10178.0,10178.0,10178.0
mean,63.497052,64882380.0,253140100.0
std,13.537012,57075650.0,277788000.0
min,0.0,1.0,0.0
25%,59.0,15000000.0,28588980.0
50%,65.0,50000000.0,152934900.0
75%,71.0,105000000.0,417802100.0
max,100.0,460000000.0,2923706000.0


In [5]:
movie_data.describe()

Unnamed: 0,score,budget_x,revenue
count,10178.0,10178.0,10178.0
mean,63.497052,64882380.0,253140100.0
std,13.537012,57075650.0,277788000.0
min,0.0,1.0,0.0
25%,59.0,15000000.0,28588980.0
50%,65.0,50000000.0,152934900.0
75%,71.0,105000000.0,417802100.0
max,100.0,460000000.0,2923706000.0


In [6]:
movie_data['country'][0]

'AU'

In [7]:
movie_data.isnull().sum()

names          0
date_x         0
score          0
genre         85
overview       0
crew          56
orig_title     0
status         0
orig_lang      0
budget_x       0
revenue        0
country        0
dtype: int64

In [8]:
movie_data.fillna("unknown" , inplace=True)

In [9]:
movie_data.isnull().sum()

names         0
date_x        0
score         0
genre         0
overview      0
crew          0
orig_title    0
status        0
orig_lang     0
budget_x      0
revenue       0
country       0
dtype: int64

In [10]:
movie_data['orig_lang'].value_counts().sort_values()

 Slovak                                    1
 Macedonian                                1
 Oriya                                     1
 Bokmål, Norwegian, Norwegian Bokmål       1
 Marathi                                   1
 Basque                                    1
 Dzongkha                                  1
 Irish                                     1
 Serbian                                   1
 Gujarati                                  1
 Serbo-Croatian                            1
 Latin                                     1
 Galician                                  1
 Romanian                                  2
 Kannada                                   2
 Czech                                     2
 Central Khmer                             2
 Malay                                     2
 Bengali                                   2
 Catalan, Valencian                        2
 Latvian                                   2
 Hungarian                                 2
 No Langua

In [11]:
def change_language(x):
    if x.strip() == "English":
        return 0
    else :
        return 1
    

movie_data['orig_lang'] = movie_data['orig_lang'].apply(change_language)
movie_data['orig_lang'].value_counts()


0    7417
1    2761
Name: orig_lang, dtype: int64

In [12]:
actors = {}
actor_index = 1

for i,vec in movie_data.iterrows():
    crew = list(map(lambda x : x.strip() ,vec["crew"].split(',')))
    for c in crew :
        if c not in actors:
            actors[c] = actor_index
            actor_index += 1


In [13]:
len(actors)

96552

In [14]:
genres = {}
genre_index = 1

for i,vec in movie_data.iterrows():
    genre = list(map(lambda x : x.strip() ,vec["genre"].split(',')))
    for g in genre :
        if g not in genres:
            genres[g] = genre_index
            genre_index += 1

In [15]:
genres

{'Drama': 1,
 'Action': 2,
 'Science Fiction': 3,
 'Adventure': 4,
 'Animation': 5,
 'Family': 6,
 'Fantasy': 7,
 'Comedy': 8,
 'Thriller': 9,
 'Crime': 10,
 'Horror': 11,
 'Mystery': 12,
 'History': 13,
 'War': 14,
 'Documentary': 15,
 'Romance': 16,
 'Music': 17,
 'Western': 18,
 'TV Movie': 19,
 'unknown': 20}

In [16]:
movie_data.columns

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')

In [52]:
data = movie_data.drop(["orig_title", "overview" , "names" , "country" , "crew", "status" , "country", "date_x"] , axis=1)

In [53]:
data.head(5)

Unnamed: 0,score,genre,orig_lang,budget_x,revenue
0,73.0,"Drama, Action",0,75000000.0,271616700.0
1,78.0,"Science Fiction, Adventure, Action",0,460000000.0,2316795000.0
2,76.0,"Animation, Adventure, Family, Fantasy, Comedy",0,100000000.0,724459000.0
3,70.0,"Animation, Comedy, Family, Adventure, Fantasy",1,12300000.0,34200000.0
4,61.0,Action,0,77000000.0,340942000.0


In [54]:
for i,vec in data.iterrows():
    genre = list(map(lambda x : x.strip() ,vec["genre"].split(',')))
    ng = 0
    for g in genre:
        ng += 2^genres[g]
    data["genre"][i] = ng

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["genre"][i] = ng


In [55]:
data.head(10)

Unnamed: 0,score,genre,orig_lang,budget_x,revenue
0,73.0,3,0,75000000.0,271616700.0
1,78.0,7,0,460000000.0,2316795000.0
2,76.0,32,0,100000000.0,724459000.0
3,70.0,32,1,12300000.0,34200000.0
4,61.0,0,0,77000000.0,340942000.0
5,66.0,29,0,35000000.0,80000000.0
6,80.0,19,0,100000000.0,351349400.0
7,83.0,32,0,90000000.0,483480600.0
8,59.0,1,0,71000000.0,254946500.0
9,58.0,24,0,119200000.0,488962500.0


In [56]:
# (data['revenue'] - np.mean(data['revenue']))/np.std(data['revenue'])
data['revenue'] = (data['revenue'] - np.min(data['revenue']))/(np.max(data['revenue']) - np.min(data['revenue']))
data['budget_x'] = (data['budget_x'] - np.min(data['budget_x']))/(np.max(data['budget_x']) - np.min(data['budget_x']))

In [57]:
k_means = KMeans(n_clusters=25)
k_means.fit(data)



In [58]:
k_means.labels_[:10]

array([ 0, 20, 22,  4, 10,  4, 12, 22, 24,  1], dtype=int32)

In [59]:
movie_data[:10]

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,0,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,0,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,0,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,1,12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,0,77000000.0,340942000.0,US
5,Cocaine Bear,02/23/2023,66.0,"Thriller, Comedy, Crime","Inspired by a true story, an oddball group of ...","Keri Russell, Sari, Alden Ehrenreich, Eddie, O...",Cocaine Bear,Released,0,35000000.0,80000000.0,AU
6,John Wick: Chapter 4,03/23/2023,80.0,"Action, Thriller, Crime","With the price on his head ever increasing, Jo...","Keanu Reeves, John Wick, Donnie Yen, Caine, Bi...",John Wick: Chapter 4,Released,0,100000000.0,351349400.0,AU
7,Puss in Boots: The Last Wish,12/26/2022,83.0,"Animation, Family, Fantasy, Adventure, Comedy",Puss in Boots discovers that his passion for a...,"Antonio Banderas, Puss in Boots (voice), Salma...",Puss in Boots: The Last Wish,Released,0,90000000.0,483480600.0,AU
8,Attack on Titan,09/30/2022,59.0,"Action, Science Fiction","As viable water is depleted on Earth, a missio...","Paul Bianchi, Computer (voice), Erin Coker, Al...",Attack on Titan,Released,0,71000000.0,254946500.0,US
9,The Park,03/02/2023,58.0,"Action, Drama, Horror, Science Fiction, Thriller",A dystopian coming-of-age movie focused on thr...,"Chloe Guidry, Ines, Nhedrick Jabier, Bui, Carm...",The Park,Released,0,119200000.0,488962500.0,US


In [60]:
aglo_cluster = AgglomerativeClustering(n_clusters=25)
aglo_cluster.fit(data)

In [61]:
aglo_cluster.labels_[:20]

array([ 2, 23, 17,  6, 14,  6,  8, 17, 14, 22,  1,  3, 21, 23,  8, 21,  0,
       21,  3,  0])

In [62]:
movie_data[:20]

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,0,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,0,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,0,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,1,12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,0,77000000.0,340942000.0,US
5,Cocaine Bear,02/23/2023,66.0,"Thriller, Comedy, Crime","Inspired by a true story, an oddball group of ...","Keri Russell, Sari, Alden Ehrenreich, Eddie, O...",Cocaine Bear,Released,0,35000000.0,80000000.0,AU
6,John Wick: Chapter 4,03/23/2023,80.0,"Action, Thriller, Crime","With the price on his head ever increasing, Jo...","Keanu Reeves, John Wick, Donnie Yen, Caine, Bi...",John Wick: Chapter 4,Released,0,100000000.0,351349400.0,AU
7,Puss in Boots: The Last Wish,12/26/2022,83.0,"Animation, Family, Fantasy, Adventure, Comedy",Puss in Boots discovers that his passion for a...,"Antonio Banderas, Puss in Boots (voice), Salma...",Puss in Boots: The Last Wish,Released,0,90000000.0,483480600.0,AU
8,Attack on Titan,09/30/2022,59.0,"Action, Science Fiction","As viable water is depleted on Earth, a missio...","Paul Bianchi, Computer (voice), Erin Coker, Al...",Attack on Titan,Released,0,71000000.0,254946500.0,US
9,The Park,03/02/2023,58.0,"Action, Drama, Horror, Science Fiction, Thriller",A dystopian coming-of-age movie focused on thr...,"Chloe Guidry, Ines, Nhedrick Jabier, Bui, Carm...",The Park,Released,0,119200000.0,488962500.0,US


In [191]:
# 7 = puss in boots
c_7 = k_means.labels_[7]
movies_similar_7 = np.argwhere((k_means.labels_ == c_7))
for i in movies_similar_7[:10]:
    print(movie_data[i[0]:i[0]+1]["names"])

2    The Super Mario Bros. Movie
Name: names, dtype: object
7    Puss in Boots: The Last Wish
Name: names, dtype: object
49    Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...
Name: names, dtype: object
70    Tetris
Name: names, dtype: object
79    Guillermo del Toro's Pinocchio
Name: names, dtype: object
80    A Frozen Rooster
Name: names, dtype: object
93    All Quiet on the Western Front
Name: names, dtype: object
94    Violent Night
Name: names, dtype: object
97    Devotion
Name: names, dtype: object
121    Turning Red
Name: names, dtype: object


In [194]:
# 1 = avatar 2
c_1 = k_means.labels_[1]
movies_similar_1 = np.argwhere((k_means.labels_ == c_1))
for i in movies_similar_1[:10]:
    print(movie_data[i[0]:i[0]+1]["names"])

1    Avatar: The Way of Water
Name: names, dtype: object
44    The Whale
Name: names, dtype: object
62    Top Gun: Maverick
Name: names, dtype: object
76    Spider-Man: No Way Home
Name: names, dtype: object
81    Dragon Ball Super: Super Hero
Name: names, dtype: object
104    Avengers: Infinity War
Name: names, dtype: object
111    Demon Slayer: Kimetsu no Yaiba Sibling's Bond
Name: names, dtype: object
139    Air
Name: names, dtype: object
145    Sing 2
Name: names, dtype: object
156    Everything Everywhere All at Once
Name: names, dtype: object


In [196]:
k_mean_file = open("k_mean_model.pickle", "wb")
pickle.dump(k_means, k_mean_file)
k_mean_file.close()

In [63]:
aglo_file = open("aglomerative_cluster_model.pickle", "wb")
pickle.dump(aglo_cluster, aglo_file)
aglo_file.close()

In [36]:
# Another way to add genre which is better than 2^x

genre_pd_dict = {}
for g in genres:
    genre_pd_dict[g] = np.zeros(movie_data.shape[0])

for i,row in data.iterrows():
    genre = list(map(lambda x : x.strip() ,row["genre"].split(','))) 
    for g in genre:
        genre_pd_dict[g][i] = 1

# genre_pd_dict

In [37]:
genre_df = pd.DataFrame(data=genre_pd_dict)
genre_df.head(5)

Unnamed: 0,Drama,Action,Science Fiction,Adventure,Animation,Family,Fantasy,Comedy,Thriller,Crime,Horror,Mystery,History,War,Documentary,Romance,Music,Western,TV Movie,unknown
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
data.head(5)

Unnamed: 0,score,genre,orig_lang,budget_x,revenue
0,73.0,"Drama, Action",0,75000000.0,271616700.0
1,78.0,"Science Fiction, Adventure, Action",0,460000000.0,2316795000.0
2,76.0,"Animation, Adventure, Family, Fantasy, Comedy",0,100000000.0,724459000.0
3,70.0,"Animation, Comedy, Family, Adventure, Fantasy",1,12300000.0,34200000.0
4,61.0,Action,0,77000000.0,340942000.0


In [39]:
data = data.drop(['genre'], axis=1)

new_data = pd.concat([data, genre_df] , axis=1)

new_data.head(5)

Unnamed: 0,score,orig_lang,budget_x,revenue,Drama,Action,Science Fiction,Adventure,Animation,Family,...,Horror,Mystery,History,War,Documentary,Romance,Music,Western,TV Movie,unknown
0,73.0,0,75000000.0,271616700.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78.0,0,460000000.0,2316795000.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,76.0,0,100000000.0,724459000.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,70.0,1,12300000.0,34200000.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,61.0,0,77000000.0,340942000.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
k_means = KMeans(n_clusters=25)
k_means.fit(new_data)



In [46]:
# 7 = puss in boots
c_7 = k_means.labels_[7]
movies_similar_7 = np.argwhere((k_means.labels_ == c_7))
print("total recommendations : " , len(movies_similar_7))
for i in movies_similar_7[:2]:
    print(movie_data[i[0]:i[0]+1]["names"])

total recommendations :  382
7    Puss in Boots: The Last Wish
Name: names, dtype: object
12    Murder Mystery 2
Name: names, dtype: object


In [45]:
# 1 = avatar 2
c_1 = k_means.labels_[1]
movies_similar_1 = np.argwhere((k_means.labels_ == c_1))
print("total recommendations : " , len(movies_similar_1))
for i in movies_similar_1[:2]:
    print(movie_data[i[0]:i[0]+1]["names"])

total recommendations :  8
1    Avatar: The Way of Water
Name: names, dtype: object
76    Spider-Man: No Way Home
Name: names, dtype: object


In [47]:
aglo_cluster = AgglomerativeClustering(n_clusters=25)
aglo_cluster.fit(data)

In [50]:
# 1 = avatar 2
c_1 = aglo_cluster.labels_[1]
movies_similar_1 = np.argwhere((aglo_cluster.labels_ == c_1))
print("total recommendations : " , len(movies_similar_1))
for i in movies_similar_1[:2]:
    print(movie_data[i[0]:i[0]+1]["names"])

total recommendations :  8
1    Avatar: The Way of Water
Name: names, dtype: object
76    Spider-Man: No Way Home
Name: names, dtype: object


In [51]:
k_mean_file = open("k_mean_model.pickle", "wb")
pickle.dump(k_means, k_mean_file)
k_mean_file.close()

aglo_file = open("aglomerative_cluster_model.pickle", "wb")
pickle.dump(aglo_cluster, aglo_file)
aglo_file.close()