In [5]:
import pandas as pd
import warnings, datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
warnings.filterwarnings("ignore")

In [6]:
import random

movies = pd.read_csv('movie_metadata.csv')
movies['genres'] = movies.apply(lambda row: random.choice(row.genres.replace('|', ' ').split()), axis=1)


In [7]:
missing_df = movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(movies.shape[0]-missing_df['missing values'])/movies.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,gross,884,82.470752
1,budget,492,90.243902
2,aspect_ratio,329,93.476105
3,content_rating,303,93.991672
4,plot_keywords,153,96.966092
5,title_year,108,97.858418
6,director_name,104,97.937735
7,director_facebook_likes,104,97.937735
8,num_critic_for_reviews,50,99.008527
9,actor_3_facebook_likes,23,99.543922


In [8]:
count = movies['imdb_score'].value_counts()
count

6.7    223
6.6    201
7.2    195
6.5    186
6.4    185
7.3    184
7.0    184
7.1    181
6.8    181
6.1    179
6.3    176
6.2    175
6.9    174
7.4    147
5.9    144
7.5    140
6.0    131
7.6    124
5.8    124
5.7    117
7.7    114
5.6    112
7.8    106
5.4    104
5.5     98
5.3     92
8.0     75
7.9     75
5.1     71
8.1     69
      ... 
3.5     15
3.9     14
3.8     14
3.6     14
8.7     13
3.4     12
2.8      9
3.7      9
3.1      8
8.8      7
3.2      7
2.7      6
3.0      5
8.9      5
2.9      4
2.2      3
2.1      3
1.9      3
9.0      3
9.1      3
2.3      3
2.5      2
2.0      2
2.4      2
2.6      2
9.3      1
1.7      1
9.5      1
9.2      1
1.6      1
Name: imdb_score, Length: 78, dtype: int64

In [9]:
all_actors = set(movies['actor_1_name'].tolist() + movies['actor_2_name'].tolist() + movies['actor_3_name'].tolist())
all_actors
len(all_actors)

6256

In [10]:
all_genres = set(movies['genres'].tolist())
all_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [11]:
all_directors = set(movies['director_name'].tolist())
all_directors
len(all_directors)

2399

In [12]:
movies = movies[['imdb_score', 'budget', 'content_rating', 'director_name', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_3_name', 'actor_2_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'actor_1_name', 'cast_total_facebook_likes', 'genres']]
movies.head()

Unnamed: 0,imdb_score,budget,content_rating,director_name,director_facebook_likes,actor_3_facebook_likes,actor_3_name,actor_2_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,cast_total_facebook_likes,genres
0,7.9,237000000.0,PG-13,James Cameron,0.0,855.0,Wes Studi,936.0,Joel David Moore,1000.0,CCH Pounder,4834,Action
1,7.1,300000000.0,PG-13,Gore Verbinski,563.0,1000.0,Jack Davenport,5000.0,Orlando Bloom,40000.0,Johnny Depp,48350,Adventure
2,6.8,245000000.0,PG-13,Sam Mendes,0.0,161.0,Stephanie Sigman,393.0,Rory Kinnear,11000.0,Christoph Waltz,11700,Action
3,8.5,250000000.0,PG-13,Christopher Nolan,22000.0,23000.0,Joseph Gordon-Levitt,23000.0,Christian Bale,27000.0,Tom Hardy,106759,Action
4,7.1,,,Doug Walker,131.0,,,12.0,Rob Walker,131.0,Doug Walker,143,Documentary


In [13]:
actor_id = 0
actor_name_dict = {}
for actor in all_actors:
    actor_name_dict[actor] = actor_id
    actor_id += 1

len(actor_name_dict)

6256

In [14]:
genre_id = 0
genre_dict = {}
for genre in all_genres:
    genre_dict[genre] = genre_id
    genre_id += 1

len(genre_dict)

23

In [15]:
def strip_ratings(rating):
    if rating == 'Unrated' or rating == 'NC-17' or rating == 'X' or rating == 'TV-MA':
        return 'R'
    elif rating == 'TV-14':
        return 'PG-13'
    elif rating == 'Approved' or rating == 'TV-PG' or rating == 'Not Rated' or rating == 'Passed' or rating == 'GP' or rating == 'M':
        return 'PG'
    elif rating == 'TV-G' or rating == 'TV-Y' or rating == 'TV-Y7':
        return 'G'
    return rating

movies['content_rating'] = movies['content_rating'].apply(strip_ratings)

In [16]:
all_content_ratings = set(movies['content_rating'].tolist())

content_id = 0
content_rating_dict = {}
for rating in all_content_ratings:
    content_rating_dict[rating] = content_id
    content_id += 1

len(content_rating_dict)

5

In [17]:
director_id = 0
director_name_dict = {}
for director in all_directors:
    director_name_dict[director] = director_id
    director_id += 1

len(director_name_dict)

2399

In [18]:
# discretize our data
movies['actor_1_name'] = movies.apply(lambda row: actor_name_dict[row.actor_1_name], axis=1)

In [19]:
movies['actor_2_name'] = movies.apply(lambda row: actor_name_dict[row.actor_2_name], axis=1)

In [20]:
movies['actor_3_name'] = movies.apply(lambda row: actor_name_dict[row.actor_3_name], axis=1)

In [21]:
movies['director_name'] = movies.apply(lambda row: director_name_dict[row.director_name], axis=1)

In [22]:
movies['genres'] = movies.apply(lambda row: genre_dict[row.genres], axis=1)

In [23]:
movies['content_rating'] = movies.apply(lambda row: content_rating_dict[row.content_rating], axis=1)

In [24]:
movies.to_csv('final_data.csv', index=False)

In [25]:
movies["budget"].fillna(movies["budget"].mean(), inplace=True)
movies.head()

Unnamed: 0,imdb_score,budget,content_rating,director_name,director_facebook_likes,actor_3_facebook_likes,actor_3_name,actor_2_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,cast_total_facebook_likes,genres
0,7.9,237000000.0,2,319,0.0,855.0,5607,936.0,1631,1000.0,405,4834,15
1,7.1,300000000.0,2,1364,563.0,1000.0,4908,5000.0,855,40000.0,1619,48350,11
2,6.8,245000000.0,2,2155,0.0,161.0,3416,393.0,4732,11000.0,2329,11700,15
3,8.5,250000000.0,2,436,22000.0,23000.0,1510,23000.0,4913,27000.0,1933,106759,15
4,7.1,39752620.0,0,2047,131.0,,0,12.0,1442,131.0,6016,143,16


In [26]:
movies["budget"].mean()

39752620.436387606

In [27]:
movies.to_csv('final_data.csv', index=False)

In [28]:
missing_df = movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(movies.shape[0]-missing_df['missing values'])/movies.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,director_facebook_likes,104,97.937735
1,actor_3_facebook_likes,23,99.543922
2,actor_2_facebook_likes,13,99.742217
3,actor_1_facebook_likes,7,99.861194
4,imdb_score,0,100.0
5,budget,0,100.0
6,content_rating,0,100.0
7,director_name,0,100.0
8,actor_3_name,0,100.0
9,actor_2_name,0,100.0


In [29]:
movies["director_facebook_likes"].fillna(movies["director_facebook_likes"].mean(), inplace=True)
movies["actor_3_facebook_likes"].fillna(movies["actor_3_facebook_likes"].mean(), inplace=True)
movies["actor_2_facebook_likes"].fillna(movies["actor_2_facebook_likes"].mean(), inplace=True)
movies["actor_1_facebook_likes"].fillna(movies["actor_1_facebook_likes"].mean(), inplace=True)


In [30]:
missing_df = movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(movies.shape[0]-missing_df['missing values'])/movies.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,imdb_score,0,100.0
1,budget,0,100.0
2,content_rating,0,100.0
3,director_name,0,100.0
4,director_facebook_likes,0,100.0
5,actor_3_facebook_likes,0,100.0
6,actor_3_name,0,100.0
7,actor_2_facebook_likes,0,100.0
8,actor_2_name,0,100.0
9,actor_1_facebook_likes,0,100.0


In [31]:
movies.to_csv('final_data.csv', index=False)

In [32]:
count = movies['imdb_score'].value_counts()
count

6.7    223
6.6    201
7.2    195
6.5    186
6.4    185
7.3    184
7.0    184
7.1    181
6.8    181
6.1    179
6.3    176
6.2    175
6.9    174
7.4    147
5.9    144
7.5    140
6.0    131
7.6    124
5.8    124
5.7    117
7.7    114
5.6    112
7.8    106
5.4    104
5.5     98
5.3     92
8.0     75
7.9     75
5.1     71
8.1     69
      ... 
3.5     15
3.9     14
3.8     14
3.6     14
8.7     13
3.4     12
2.8      9
3.7      9
3.1      8
8.8      7
3.2      7
2.7      6
3.0      5
8.9      5
2.9      4
2.2      3
2.1      3
1.9      3
9.0      3
9.1      3
2.3      3
2.5      2
2.0      2
2.4      2
2.6      2
9.3      1
1.7      1
9.5      1
9.2      1
1.6      1
Name: imdb_score, Length: 78, dtype: int64

In [33]:
import pickle

with open('actors.pickle', 'wb') as handle:
    pickle.dump(actor_name_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('directors.pickle', 'wb') as handle:
    pickle.dump(director_name_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('genre.pickle', 'wb') as handle:
    pickle.dump(genre_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('content_ratings.pickle', 'wb') as handle:
    pickle.dump(content_rating_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
movies

Unnamed: 0,imdb_score,budget,content_rating,director_name,director_facebook_likes,actor_3_facebook_likes,actor_3_name,actor_2_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,cast_total_facebook_likes,genres
0,7.9,2.370000e+08,2,319,0.000000,855.000000,5607,936.0,1631,1000.0,405,4834,15
1,7.1,3.000000e+08,2,1364,563.000000,1000.000000,4908,5000.0,855,40000.0,1619,48350,11
2,6.8,2.450000e+08,2,2155,0.000000,161.000000,3416,393.0,4732,11000.0,2329,11700,15
3,8.5,2.500000e+08,2,436,22000.000000,23000.000000,1510,23000.0,4913,27000.0,1933,106759,15
4,7.1,3.975262e+07,0,2047,131.000000,645.009761,0,12.0,1442,131.0,6016,143,16
5,6.6,2.637000e+08,2,1985,475.000000,530.000000,1251,632.0,45,640.0,1410,1873,15
6,6.2,2.580000e+08,2,75,0.000000,4000.000000,2989,11000.0,5542,24000.0,466,46055,15
7,7.8,2.600000e+08,4,464,15.000000,284.000000,48,553.0,652,799.0,2709,2036,17
8,7.5,2.500000e+08,2,1823,0.000000,19000.000000,744,21000.0,2413,26000.0,2964,92000,11
9,7.5,2.500000e+08,4,959,282.000000,10000.000000,3821,11000.0,5391,25000.0,1499,58753,11


In [35]:
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.externals import joblib

forest = joblib.load('model.pkl')
forest.predict([[185000000.0,2,436,0,0,0,12]])


# [['budget', 'content_rating', 'director_name', 'actor_3_name', 'actor_2_name', 'actor_1_name', 'genres']]

# budget 185000000.0
# content rating 2
# director name 436
# genre 12
# actor_1_name 4913
# actor_2_name 438
# actor 3 name 3865


array([ 6.98])

In [36]:
actor_name_dict

{nan: 0,
 'Jacki Weaver': 1,
 'Lymari Nadal': 2,
 'Paul McGill': 3,
 "Ta'Rhonda Jones": 4,
 'James Le Gros': 5,
 'Jemima West': 1051,
 'Rick Fox': 3167,
 'Tyra Banks': 8,
 'Reda Kateb': 9,
 'Sean Maher': 10,
 'Dorothy Lyman': 11,
 'Denis Lavant': 12,
 'Jack Elam': 13,
 'Julie Kavner': 14,
 'Brian Austin Green': 15,
 'Carol Block': 16,
 'Omar Benson Miller': 17,
 'Dell Yount': 18,
 'Ryan Kruger': 19,
 'Ed Quinn': 20,
 'Maximilian Dirr': 21,
 'Brian Bosworth': 22,
 'Dequina Moore': 23,
 'Frank Dillane': 25,
 'Adrienne Frantz': 26,
 'Paula Abdul': 27,
 'Neil Flynn': 28,
 'John Howard': 29,
 'Jesse L. Martin': 30,
 'David Eigenberg': 31,
 'Bobby Campo': 32,
 'Elena Anaya': 33,
 'Victor Webster': 34,
 'Tempestt Bledsoe': 35,
 'Benjamin A. Onyango': 36,
 'Marcia Wallace': 37,
 'David Barnes': 38,
 'Anne Bancroft': 39,
 'Frank McHugh': 40,
 'Jamie Ren\xc3\xa9e Smith': 41,
 'Breckin Meyer': 42,
 'James Patrick Stuart': 43,
 'Adam LeFevre': 44,
 'Samantha Morton': 45,
 'Hiroyuki Ikeuchi': 273,


In [37]:
count = movies['director_name'].value_counts()
count

0       104
1292     26
1506     22
1918     20
1492     20
1224     17
1953     16
1771     16
2018     16
672      15
1742     14
1757     13
1297     13
1124     13
1406     13
2144     13
75       13
1723     13
2097     13
2035     12
416      12
1281     12
2316     12
34       12
615      12
890      12
1168     11
1139     11
2256     11
1158     11
       ... 
690       1
550       1
542       1
534       1
450       1
386       1
390       1
394       1
406       1
410       1
418       1
422       1
430       1
438       1
442       1
446       1
458       1
530       1
462       1
466       1
470       1
474       1
490       1
494       1
510       1
514       1
518       1
522       1
526       1
2045      1
Name: director_name, Length: 2399, dtype: int64

In [38]:
all_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [39]:
content_rating_dict

{nan: 0, 'G': 3, 'PG': 4, 'PG-13': 2, 'R': 1}

In [40]:
count = movies['director_name'].value_counts()
count

0       104
1292     26
1506     22
1918     20
1492     20
1224     17
1953     16
1771     16
2018     16
672      15
1742     14
1757     13
1297     13
1124     13
1406     13
2144     13
75       13
1723     13
2097     13
2035     12
416      12
1281     12
2316     12
34       12
615      12
890      12
1168     11
1139     11
2256     11
1158     11
       ... 
690       1
550       1
542       1
534       1
450       1
386       1
390       1
394       1
406       1
410       1
418       1
422       1
430       1
438       1
442       1
446       1
458       1
530       1
462       1
466       1
470       1
474       1
490       1
494       1
510       1
514       1
518       1
522       1
526       1
2045      1
Name: director_name, Length: 2399, dtype: int64

In [47]:
genre_dict

{'Action': 15,
 'Adventure': 11,
 'Animation': 3,
 'Biography': 22,
 'Comedy': 5,
 'Crime': 1,
 'Documentary': 16,
 'Drama': 14,
 'Family': 19,
 'Fantasy': 20,
 'Film-Noir': 8,
 'History': 18,
 'Horror': 7,
 'Music': 4,
 'Musical': 17,
 'Mystery': 12,
 'Romance': 2,
 'Sci-Fi': 0,
 'Short': 13,
 'Sport': 21,
 'Thriller': 10,
 'War': 6,
 'Western': 9}