# Cleaning the data

In [1]:
import pandas as pd
import numpy as np

Read in ratings and movies data

In [2]:
# Load the two raw CSV files; the relative paths mean they must sit in the
# notebook's working directory.
ratings_messy = pd.read_csv("IMDb ratings.csv")
films_messy = pd.read_csv("IMDb movies.csv")
# Preview the first rows of the films table.
films_messy.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


Set title IDs as index to smoothly join both tables, then drop unnecessary columns.

In [3]:
# Re-key both tables by IMDb title id so they share the same index.
films_messy, ratings_messy = (
    frame.set_index('imdb_title_id') for frame in (films_messy, ratings_messy)
)

In [4]:
# Number of films whose metascore is not missing.
int(films_messy['metascore'].count())

13305

In [5]:
# Put the two id-indexed tables side by side (column-wise concatenation).
frames_to_combine = [films_messy, ratings_messy]
combined_messy = pd.concat(frames_to_combine, axis="columns")
# Show the resulting column names.
combined_messy.columns

Index(['title', 'original_title', 'year', 'date_published', 'genre',
       'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'weighted_average_vote',
       'total_votes', 'mean_vote', 'median_vote', 'votes_10', 'votes_9',
       'votes_8', 'votes_7', 'votes_6', 'votes_5', 'votes_4', 'votes_3',
       'votes_2', 'votes_1', 'allgenders_0age_avg_vote',
       'allgenders_0age_votes', 'allgenders_18age_avg_vote',
       'allgenders_18age_votes', 'allgenders_30age_avg_vote',
       'allgenders_30age_votes', 'allgenders_45age_avg_vote',
       'allgenders_45age_votes', 'males_allages_avg_vote',
       'males_allages_votes', 'males_0age_avg_vote', 'males_0age_votes',
       'males_18age_avg_vote', 'males_18age_votes', 'males_30age_avg_vote',
       'males_30age_votes', 'males_45age_av

In [6]:
# Remove the columns that are not used further on.
# The labels are passed through the `columns=` keyword: giving `axis` as a bare
# positional argument to `DataFrame.drop` triggers a FutureWarning on pandas 1.x
# and is rejected by pandas 2.x.
ratings = combined_messy.drop(
    columns=[
        'original_title',
        'date_published',
        'duration',
        'country',
        'language',
        'director',
        'writer',
        'production_company',
        'actors',
        'description',
        'avg_vote',
        'votes',
        'budget',
        'usa_gross_income',
        'worlwide_gross_income',    # (sic) spelling matches the column name in the data
        'reviews_from_users',
        'reviews_from_critics',
        'allgenders_0age_avg_vote', # Dropping this because so many NAs
        'allgenders_0age_votes',
        'allgenders_18age_votes',
        'allgenders_30age_votes',
        'allgenders_45age_votes',
        'males_allages_votes',
        'males_0age_avg_vote',      # Dropping this because so many NAs
        'males_0age_votes',
        'males_18age_votes',
        'males_30age_votes',
        'males_45age_votes',
        'females_allages_votes',
        'females_0age_avg_vote',    # Dropping this because so many NAs
        'females_0age_votes',
        'females_18age_votes',
        'females_30age_votes',
        'females_45age_votes',
        'top1000_voters_rating',
        'top1000_voters_votes',
        'us_voters_rating',
        'us_voters_votes',
        'non_us_voters_rating',
        'non_us_voters_votes',
    ]
)

  ratings = combined_messy.drop(['original_title',


As alluded to in the code comments, we planned to keep all of the average vote columns, but including the ones from critics under the age of 18 left a lot of missing data. This is presumably because it's not very common for people younger than 18 to be professional movie critics. The only column for which we are willing to lose rows to missing values is the metascore column, as that is our response variable.

In [10]:
# Discard every row that has a missing value in any remaining column.
ratings = ratings.dropna(axis="index", how="any")

In [11]:
# Preview the first rows after removing rows with missing values.
ratings.head()

Unnamed: 0_level_0,title,year,genre,metascore,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,...,allgenders_30age_avg_vote,allgenders_45age_avg_vote,males_allages_avg_vote,males_18age_avg_vote,males_30age_avg_vote,males_45age_avg_vote,females_allages_avg_vote,females_18age_avg_vote,females_30age_avg_vote,females_45age_avg_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0006864,Intolerance,1916,"Drama, History",99.0,7.8,13875,7.8,8.0,3477,2230,...,7.7,7.8,7.8,7.9,7.7,7.8,7.5,7.2,7.4,8.0
tt0017136,Metropolis,1927,"Drama, Sci-Fi",98.0,8.3,156076,8.1,8.0,37520,36112,...,8.2,8.3,8.3,8.3,8.2,8.3,8.3,8.2,8.3,8.3
tt0018037,Il cantante di jazz,1927,"Drama, Music, Musical",66.0,6.5,8866,6.8,7.0,999,589,...,6.4,6.6,6.4,6.5,6.4,6.6,6.6,6.7,6.4,6.8
tt0018773,Il circo,1928,"Comedy, Romance",90.0,8.1,27414,7.9,8.0,4542,5699,...,8.1,8.1,8.1,8.2,8.1,8.1,8.2,8.3,8.1,8.2
tt0019777,The Cocoanuts,1929,"Comedy, Musical",69.0,7.0,6900,7.2,7.0,813,496,...,6.9,7.1,6.9,6.8,6.8,7.1,7.2,7.1,7.1,7.3


# Integrate genres into the clean dataframe

There are 85855 films in our dataset, each of which has one or more genres associated with it in the following format:  
  
0                          Romance  
1          Biography, Crime, Drama  
2                            Drama  
3                   Drama, History  
4        Adventure, Drama, Fantasy  
                   ...              
85850                       Comedy  
85851                Comedy, Drama  
85852                        Drama  
85853                Drama, Family  
85854                        Drama  
  
We see that some films have multiple genres in the same string, so we have some cleaning to do. We need to figure out a way to find all the unique genres and create columns for each one, indicating which genre is associated with a given film using a 1 (True) or 0 (False).

Start by creating a list of the genres for each movie.

In [None]:
# Genre strings for every film in the raw table, in file order.
# Taken from the frame that is already in memory instead of parsing the CSV a
# second time; the index is reset to 0..n-1 so the series can still be
# addressed by position.
genres = films_messy['genre'].reset_index(drop=True)

# NOTE: the former `ratings['genres'] = genres` assignment was removed.
# `ratings` is indexed by `imdb_title_id` while `genres` has an integer index,
# so pandas aligned the two on non-matching labels and produced a column of
# NaN only. `ratings` already carries each film's genre string in `genre`.

The following code iterates through each movie's genre string and, if it contains multiple genres, splits it into a list. If we come across a genre that we haven't seen yet, we add it to the list `genres_unique`.

In [None]:
# Collect every distinct genre, in order of first appearance.
# - Iterating over the values (not `genres[i]`) avoids depending on the series
#   having integer labels 0..n-1.
# - `dropna()` skips films without a genre, which would otherwise fail on
#   `.split`.
# - `dict.fromkeys` de-duplicates while preserving insertion order.
genres_unique = list(
    dict.fromkeys(
        single_genre
        for genre_string in genres.dropna()
        for single_genre in genre_string.split(", ")
    )
)

Let's check if the list is indeed unique:

In [None]:
# Run the uniqueness check instead of leaving it commented out: every genre
# must appear exactly once in the list.
assert len(set(genres_unique)) == len(genres_unique), "genres_unique contains duplicates"

# Number of distinct genres found.
len(genres_unique)

There are 25 unique genres associated with the films in the dataset. Now we will cycle through each genre, creating a column for each one and assigning 1 if the film contains that genre, and 0 if it doesn't.

In [None]:
# Split each film's genre string into a list once, then test exact membership.
# A substring test (`str.contains`) is wrong here: it treats the genre as a
# pattern and matches partial names, so every "Musical" film would also be
# flagged as "Music".
film_genre_lists = ratings['genre'].str.split(', ')

for genre in genres_unique:
    # `wanted=genre` binds the current genre at definition time.
    ratings[genre] = film_genre_lists.apply(
        lambda film_genres, wanted=genre: wanted in film_genres
    ).astype(int)

# Drop the messy genres column
ratings = ratings.drop(columns='genre')

The dataset now includes genre data in one-hot encoded form.

In [None]:
# Preview the first rows, now including the per-genre indicator columns.
ratings.head()

Export data:

In [None]:
# Write the cleaned table to the working directory; the index is written too
# (pandas default), so the title ids are kept in the file.
ratings.to_csv('ratings_clean.csv')

In [None]:
# List the column names of the exported table.
ratings.columns

In [None]:
# Display only the title column of the raw films table (as a one-column frame).
films_messy[['title']]