In [1]:
import numpy as np
import pandas as pd
data = pd.read_csv('data/movie_metadata.csv')
data.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [2]:
int_type = ['gross', 'budget', 'aspect_ratio', 'facenumber_in_poster', 'actor_3_facebook_likes']

for i in int_type:
    data = data[~np.isnan(data[i])]

str_type = ['content_rating', 'plot_keywords', 'color']

for s in str_type:
    data = data[~pd.isnull(data[s])]

data['language'].fillna(data['language'].mode()[0], inplace = True)


data.isnull().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [3]:
data.drop_duplicates(subset = None, keep = 'first', inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3724 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      3724 non-null   object 
 1   director_name              3724 non-null   object 
 2   num_critic_for_reviews     3724 non-null   float64
 3   duration                   3724 non-null   float64
 4   director_facebook_likes    3724 non-null   float64
 5   actor_3_facebook_likes     3724 non-null   float64
 6   actor_2_name               3724 non-null   object 
 7   actor_1_facebook_likes     3724 non-null   float64
 8   gross                      3724 non-null   float64
 9   genres                     3724 non-null   object 
 10  actor_1_name               3724 non-null   object 
 11  movie_title                3724 non-null   object 
 12  num_voted_users            3724 non-null   int64  
 13  cast_total_facebook_likes  3724 non-null   int64

In [4]:
data['gross'] = data['gross']/1000000
data['budget'] = data['budget']/1000000
data['profit'] = data['gross'] - data['budget']

# lets also check the name of Top 10 Profitable Movies
data[['profit','movie_title']].sort_values(by = 'profit', ascending  = False).head(10)

Unnamed: 0,profit,movie_title
0,523.505847,Avatar
29,502.177271,Jurassic World
26,458.672302,Titanic
3024,449.935665,Star Wars: Episode IV - A New Hope
3080,424.449459,E.T. the Extra-Terrestrial
17,403.279547,The Avengers
509,377.783777,The Lion King
240,359.544677,Star Wars: Episode I - The Phantom Menace
66,348.316061,The Dark Knight
439,329.999255,The Hunger Games


In [5]:
data['popularity'] = (data['num_user_for_reviews']/data['num_voted_users'])*data['movie_facebook_likes']

data.sort_values(by = 'popularity', ascending = False).head(10).reset_index()

Unnamed: 0,index,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,profit,popularity
0,10,Color,Zack Snyder,673.0,183.0,0.0,2000.0,Lauren Cohan,15000.0,330.249062,...,USA,PG-13,250.0,2016.0,4000.0,6.9,2.35,197000,80.249062,1599.794424
1,150,Color,Paul Feig,464.0,116.0,176.0,322.0,Kate McKinnon,783.0,118.099659,...,USA,PG-13,144.0,2016.0,370.0,5.5,2.35,62000,-25.900341,1076.336425
2,1582,Color,Paul Feig,464.0,116.0,176.0,322.0,Kate McKinnon,783.0,118.099659,...,USA,PG-13,144.0,2016.0,370.0,5.5,2.35,62000,-25.900341,1075.827482
3,96,Color,Christopher Nolan,712.0,169.0,22000.0,6000.0,Anne Hathaway,11000.0,187.991439,...,USA,PG-13,165.0,2014.0,11000.0,8.6,2.35,349000,22.991439,1024.560802
4,945,Color,Rob Marshall,321.0,125.0,252.0,10000.0,Meryl Streep,40000.0,127.997349,...,USA,PG,50.0,2014.0,11000.0,6.0,2.39,90000,77.997349,692.9372
5,73,Color,David Ayer,418.0,123.0,452.0,329.0,Robin Atkin Downes,10000.0,161.087183,...,USA,PG-13,175.0,2016.0,336.0,6.9,2.35,80000,-13.912817,652.816996
6,1190,Color,Sam Taylor-Johnson,362.0,129.0,456.0,716.0,Luke Grimes,1000.0,166.147885,...,USA,R,40.0,2015.0,935.0,4.1,2.35,101000,126.147885,624.306881
7,108,Color,Duncan Jones,275.0,123.0,0.0,648.0,Callum Rennie,3000.0,46.978995,...,USA,PG-13,160.0,2016.0,716.0,7.3,2.35,89000,-113.021005,622.790277
8,92,Color,Roland Emmerich,286.0,120.0,776.0,535.0,Sela Ward,890.0,102.315545,...,USA,PG-13,165.0,2016.0,812.0,5.5,2.35,67000,-62.684455,599.274128
9,179,Color,Alejandro G. Iñárritu,556.0,156.0,0.0,733.0,Tom Hardy,29000.0,183.635922,...,USA,R,135.0,2015.0,27000.0,8.1,2.35,190000,48.635922,555.933205


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3724 entries, 0 to 5042
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      3724 non-null   object 
 1   director_name              3724 non-null   object 
 2   num_critic_for_reviews     3724 non-null   float64
 3   duration                   3724 non-null   float64
 4   director_facebook_likes    3724 non-null   float64
 5   actor_3_facebook_likes     3724 non-null   float64
 6   actor_2_name               3724 non-null   object 
 7   actor_1_facebook_likes     3724 non-null   float64
 8   gross                      3724 non-null   float64
 9   genres                     3724 non-null   object 
 10  actor_1_name               3724 non-null   object 
 11  movie_title                3724 non-null   object 
 12  num_voted_users            3724 non-null   int64  
 13  cast_total_facebook_likes  3724 non-null   int64

In [7]:
data.to_csv('data/movie_metadata_cleaned.csv', index=False)

In [8]:
df = pd.read_csv('data/movie_metadata_cleaned.csv')

In [9]:
df['genres'].str.split('|')[0]

['Action', 'Adventure', 'Fantasy', 'Sci-Fi']

In [10]:
data['Moviegenres'] = data['genres'].str.split('|')
data['Genre1'] = data['Moviegenres'].apply(lambda x: x[0])
data['Genre2'] = data['Moviegenres'].apply(lambda x: x[1] if len(x) > 1 else x[0])
data['Genre3'] = data['Moviegenres'].apply(lambda x: x[2] if len(x) > 2 else x[0])
data['Genre4'] = data['Moviegenres'].apply(lambda x: x[3] if len(x) > 3 else x[0])
data['title_year'] = data['title_year'].astype('int')


In [11]:
def recommend_movies_on_actors(x):
    a = data[['movie_title','imdb_score']][data['actor_1_name'] == x]
    b = data[['movie_title','imdb_score']][data['actor_2_name'] == x]
    c = data[['movie_title','imdb_score']][data['actor_3_name'] == x]
    a = a.append(b)
    a = a.append(c)
    a = a.sort_values(by = 'imdb_score', ascending = False)
    return a.head(15)
    
recommend_movies_on_actors('Tom Cruise')

  a = a.append(b)
  a = a.append(c)


Unnamed: 0,movie_title,imdb_score
1868,Rain Man,8.0
75,Edge of Tomorrow,7.9
284,Minority Report,7.7
158,The Last Samurai,7.7
736,Collateral,7.6
1524,A Few Good Men,7.6
940,Interview with the Vampire: The Vampire Chroni...,7.6
155,Mission: Impossible - Ghost Protocol,7.4
135,Mission: Impossible - Rogue Nation,7.4
671,Eyes Wide Shut,7.3


In [12]:
from mlxtend.preprocessing import TransactionEncoder
x = data['genres'].str.split('|')
print(x)
te = TransactionEncoder()
x = te.fit_transform(x)
x = pd.DataFrame(x, columns = te.columns_)
genres = x.astype('int')
genres.insert(0, 'movie_title', data['movie_title'])
genres = genres.set_index('movie_title')

def recommendation_genres(gen):
    gen = genres[gen]
    similar_genres = genres.corrwith(gen)
    similar_genres = similar_genres.sort_values(ascending=False)
    similar_genres = similar_genres.iloc[1:]
    return similar_genres.head(3)

recommendation_genres('Action')

0            [Action, Adventure, Fantasy, Sci-Fi]
1                    [Action, Adventure, Fantasy]
2                   [Action, Adventure, Thriller]
3                              [Action, Thriller]
5                     [Action, Adventure, Sci-Fi]
                          ...                    
5026                      [Drama, Music, Romance]
5027                                      [Drama]
5033                    [Drama, Sci-Fi, Thriller]
5035    [Action, Crime, Drama, Romance, Thriller]
5042                                [Documentary]
Name: genres, Length: 3724, dtype: object


Adventure    0.318243
Thriller     0.304403
Sci-Fi       0.295256
dtype: float64

In [13]:
# [(pos, i) for pos, id in enumerate(x)]

In [39]:
x = genres.transpose()
def recommendation_movie(movie):    
    movie = x[movie+'\xa0']
    similar_movies = x.corrwith(movie)
    similar_movies = similar_movies.sort_values(ascending=False)
    similar_movies = similar_movies.iloc[1:]
    print(similar_movies)
    return similar_movies.head(30)

d = recommendation_movie('The Expendables')
for i in d.iteritems():
    # print(i[0])
    pass

print(list(d.index))

movie_title
21 Jump Street                                 1.000000
Igby Goes Down                                 1.000000
Lara Croft Tomb Raider: The Cradle of Life     1.000000
Space Chimps                                   1.000000
Rush                                           1.000000
                                                 ...   
Resident Evil: Retribution                    -0.271448
Down to Earth                                 -0.271448
The Legend of Bagger Vance                    -0.271448
Over the Hedge                                -0.271448
Spider-Man 3                                  -0.271448
Length: 3723, dtype: float64
['21 Jump Street\xa0', 'Igby Goes Down\xa0', 'Lara Croft Tomb Raider: The Cradle of Life\xa0', 'Space Chimps\xa0', 'Rush\xa0', "All the King's Men\xa0", 'The Incredible Hulk\xa0', 'This Means War\xa0', 'Kung Fu Panda\xa0', 'Snow Falling on Cedars\xa0']


In [15]:
import pickle
pickle.dump(x, open('data/x.pkl', 'wb'))