In [11]:
import pandas as pd
import numpy as np
import pickle

# FILTERED ACTION MOVIE DATASET

- We are removing the following columns: ['movie_name', 'genre', 'description', 'director_id', 'star_id', 'votes', 'gross(in $)']
- We are also keeping only movies released from 2000 to 2022 (rows with a missing year are kept for now) and removing non-numeric values from the year column
- We will also remove certificates that are not used in the USA

In [98]:
# Load the raw action-movie CSV and discard the columns that are not used further on
unused_columns = ['movie_name', 'genre', 'description', 'director_id', 'star_id', 'votes', 'gross(in $)']
action_movie_data = pd.read_csv(r'../datasets/action_movie_data.csv').drop(columns=unused_columns)

# Put 'rating' last while leaving the other columns in their existing order
action_movie_data_columns = [name for name in action_movie_data.columns if name != 'rating'] + ['rating']
action_movie_data = action_movie_data[action_movie_data_columns]

In [99]:
# Drop rows whose year starts with an uppercase letter; rows with a missing year
# are kept (na=False leaves them out of the mask)
non_numeric_mask = action_movie_data['year'].str.match(r'[A-Z]+', na=False)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger SettingWithCopyWarning
action_movie_data = action_movie_data.loc[~non_numeric_mask].copy()

# Convert the year column to a numeric dtype; any value that still cannot be
# parsed becomes NaN (errors='coerce')
action_movie_data['year'] = pd.to_numeric(action_movie_data['year'], errors='coerce')

# Keep years from 2000 through 2022 (both ends included) plus rows with a missing year
year_in_range = action_movie_data['year'].between(2000, 2022, inclusive='both')
action_movie_data = action_movie_data[year_in_range | action_movie_data['year'].isna()]

In [100]:
# Show every certificate value present before filtering
print(action_movie_data['certificate'].unique())

# Certificate values to retain (non-USA certificates are removed);
# np.nan is listed so that rows with a missing certificate are kept as well
values_to_keep = ['G', 'PG', 'PG-13', 'R', 'NC-17', np.nan, 'Not Rated', 'Unrated']
has_kept_certificate = action_movie_data['certificate'].isin(values_to_keep)
action_movie_data = action_movie_data[has_kept_certificate]

# Show the certificate values that remain after filtering
action_movie_data['certificate'].unique()

['PG-13' 'R' 'Not Rated' nan 'PG' 'TV-14' 'TV-MA' '18+' 'Unrated' 'G' 'M'
 'TV-PG' 'TV-Y7' 'TV-G' 'NC-17' 'Approved' 'TV-Y7-FV' 'TV-Y' '12' 'MA-13'
 'E' 'X' 'T' 'E10+' 'Open']


array(['PG-13', 'R', 'Not Rated', nan, 'PG', 'Unrated', 'G', 'NC-17'],
      dtype=object)

In [101]:
# Number of rows currently in the filtered frame
print(action_movie_data.shape[0])
# Count of missing values in each column
action_movie_data.isna().sum()

27965


movie_id           0
year            8238
certificate    21908
runtime        11751
director        3719
star            7016
rating         13608
dtype: int64

In [102]:
# Write the filtered frame to CSV without the row index
action_movie_data.to_csv(
    path_or_buf='../datasets/filtered_action_movie_data.csv',
    index=False,
)

# FILLED ACTION MOVIE DATASET

After filling NaN values with movie data we got from a movie API, we will remove any rows that still contain NaN values, since we weren't able to find any information for those movies.

We will also convert the columns to their respective data types.

One thing we noticed was that there were a lot of unique directors and actors, which becomes a problem when encoding due to high cardinality, so we opt to keep only the first-listed director and actor for each movie to reduce that cardinality.

In [12]:
# Load the dataset whose missing values were filled in from the movie API
filled_action_movie_data = pd.read_csv(r'../datasets/csv/filled_action_movie_data.csv')
# Row count, followed by the number of missing values left in each column
print(filled_action_movie_data.shape[0])
filled_action_movie_data.isna().sum()

10702


movie_id          0
year              0
certificate    6549
runtime        2416
director        295
star            673
rating         3540
dtype: int64

In [13]:
# movie_id is dropped together with certificate; certificate still has a lot of
# missing values even after the filtering. Rows with any remaining NaN are then
# removed and the ' min' text is stripped out of runtime.
processed_action_movie_data = (
    filled_action_movie_data
    .drop(columns=['movie_id', 'certificate'])
    .dropna(axis=0)
    .assign(runtime=lambda frame: frame['runtime'].str.replace(' min', ''))
)
# Confirm that no missing values remain
processed_action_movie_data.isna().sum()

year        0
runtime     0
director    0
star        0
rating      0
dtype: int64

In [14]:
# Number of rows left after dropping rows with missing values
processed_action_movie_data.shape[0]

6648

In [15]:
# Numeric type conversions: each column is parsed with pd.to_numeric and downcast
# to the smallest dtype of the requested kind; unparseable values become NaN
downcast_by_column = {'year': 'integer', 'runtime': 'integer', 'rating': 'float'}
for column_name, downcast_kind in downcast_by_column.items():
    processed_action_movie_data[column_name] = pd.to_numeric(
        processed_action_movie_data[column_name],
        errors='coerce',
        downcast=downcast_kind,
    )

In [16]:
# Print the column dtypes and non-null counts to check the numeric conversions
processed_action_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6648 entries, 0 to 10701
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      6648 non-null   int16  
 1   runtime   6648 non-null   int16  
 2   director  6648 non-null   object 
 3   star      6648 non-null   object 
 4   rating    6648 non-null   float32
dtypes: float32(1), int16(2), object(2)
memory usage: 207.8+ KB


In [17]:
# Keep a single name per row: the first entry of each comma-separated
# director / star value, with surrounding whitespace removed
for credit_column in ('director', 'star'):
    first_listed_name = processed_action_movie_data[credit_column].str.split(',').str[0].str.strip()
    processed_action_movie_data.loc[:, credit_column] = first_listed_name

In [18]:
# Preview the first five processed rows
processed_action_movie_data.head(n=5)

Unnamed: 0,year,runtime,director,star,rating
0,2022,161,Ryan Coogler,Letitia Wright,6.9
1,2022,192,James Cameron,Sam Worthington,7.8
2,2022,139,Dan Kwan,Michelle Yeoh,8.0
3,2022,100,Jason Moore,Jennifer Lopez,5.4
4,2022,127,David Leitch,Brad Pitt,7.3


In [19]:
# Count of missing values in each column of the processed frame
processed_action_movie_data.isna().sum()

year        0
runtime     0
director    0
star        0
rating      0
dtype: int64

In [20]:
# Renumber the rows 0..n-1 after the earlier row drops. drop=True discards the old
# row labels; without it they are inserted as an extra 'index' column, which would
# then be written to the saved CSV and pickle (and re-running the cell would add
# yet another column).
processed_action_movie_data = processed_action_movie_data.reset_index(drop=True)

In [306]:
# Write the processed frame to CSV without the row index
processed_action_movie_data.to_csv(
    path_or_buf='../datasets/csv/processed_action_movie_data.csv',
    index=False,
)

In [21]:
# Serialize the processed frame to a pickle file (default pickle protocol)
with open('../datasets/pickle/processed_action_movie_data.pkl', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(processed_action_movie_data)