In [1]:
#import dependencies
import datetime
import pandas as pd
import numpy as np

## Data Cleaning 

In [2]:
#import data and create df
df = pd.read_csv('movies.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
id,505642,315162,646389,956101,536554
title,Black Panther: Wakanda Forever,Puss in Boots: The Last Wish,Plane,The Eighth Clause,M3GAN
genres,Action-Adventure-Science Fiction,Animation-Adventure-Comedy,Action-Adventure-Thriller,Thriller,Science Fiction-Horror-Comedy
original_language,en,en,en,la,en
overview,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,Puss in Boots discovers that his passion for a...,After a heroic job of successfully landing his...,Kat and Borja appear to be a perfect couple bu...,A brilliant toy company roboticist uses artifi...
popularity,3952.862,3351.139,3337.265,2259.303,1836.162
production_companies,Marvel Studios,Universal Pictures-DreamWorks Animation,Di Bonaventura Pictures-MadRiver Pictures-Rive...,SDB Films-El Hombre Orquesta,Universal Pictures-Blumhouse Productions-Atomi...
release_date,2022-11-09,2022-12-07,2023-01-12,2022-04-29,2022-12-28
budget,250000000.0,90000000.0,25000000.0,0.0,12000000.0
revenue,855099029.0,442000000.0,46000000.0,0.0,167643991.0


In [3]:
#show columns
df.columns

Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations'],
      dtype='object')

In [4]:
#show shape 
df.shape

(724194, 20)

In [5]:
#drop unnecessary columns
df = df.drop(['id','recommendations','backdrop_path','status','tagline','keywords', 'credits'], axis=1)

In [6]:
#show new shape
df.shape

(724194, 13)

In [7]:
#show the shape when revenue is 0
df[df['revenue'] == 0].shape

(707268, 13)

In [8]:
#replace 0 with NAN values 
df['revenue'] = df['revenue'].replace(0, np.nan)

In [9]:
#replace 0 with NAN values
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['budget'] = df['budget'].replace(0, np.nan)

In [10]:
#show shape where budget is nan
df[df['budget'].isnull()].shape

(687588, 13)

In [11]:
#add return column that calculates revenue/budget
df['percent_return'] = df['revenue'] / df['budget']

In [12]:
#show new shape
df[df['percent_return'].isnull()].shape

(713115, 14)

In [13]:
df['profit'] = df['revenue'] - df['budget']

In [14]:
#replace 0 with NAN values
df['runtime'] = df['runtime'].replace(0, np.nan)

In [15]:
#drop all remaining null values 
df = df.dropna()

In [16]:
#show cleaned shape
df.shape

(9472, 15)

In [17]:
# Look at production_companies value counts for binning
production_counts = df["production_companies"].value_counts()
production_counts.head(40)

Metro-Goldwyn-Mayer                                             128
Paramount                                                       104
Warner Bros. Pictures                                           101
20th Century Fox                                                 75
Columbia Pictures                                                60
Universal Pictures                                               58
New Line Cinema                                                  40
RKO Radio Pictures                                               32
Queens Of Combat                                                 31
Walt Disney Productions                                          25
TriStar Pictures                                                 24
Marvel Studios                                                   24
Touchstone Pictures                                              24
Orion Pictures                                                   23
Yash Raj Films                                  

In [18]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
production_types_to_replace = list(production_counts[production_counts<10].index)

# Replace in dataframe
for prod in production_types_to_replace:
    df['production_companies'] = df['production_companies'].replace(prod,"Other")

# Check to make sure binning was successful
df['production_companies'].value_counts()

Other                                                           8402
Metro-Goldwyn-Mayer                                              128
Paramount                                                        104
Warner Bros. Pictures                                            101
20th Century Fox                                                  75
Columbia Pictures                                                 60
Universal Pictures                                                58
New Line Cinema                                                   40
RKO Radio Pictures                                                32
Queens Of Combat                                                  31
Walt Disney Productions                                           25
Touchstone Pictures                                               24
Marvel Studios                                                    24
TriStar Pictures                                                  24
Orion Pictures                    

In [19]:
df

Unnamed: 0,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,vote_average,vote_count,poster_path,percent_return,profit
0,Black Panther: Wakanda Forever,Action-Adventure-Science Fiction,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,3952.862,Marvel Studios,2022-11-09,250000000.0,855099029.0,162.0,7.403,3586.0,/sv1xJUazXeYqALzczSZ3O6nkH75.jpg,3.420396,605099029.0
1,Puss in Boots: The Last Wish,Animation-Adventure-Comedy,en,Puss in Boots discovers that his passion for a...,3351.139,Other,2022-12-07,90000000.0,442000000.0,103.0,8.461,4047.0,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg,4.911111,352000000.0
2,Plane,Action-Adventure-Thriller,en,After a heroic job of successfully landing his...,3337.265,Other,2023-01-12,25000000.0,46000000.0,107.0,6.851,652.0,/qi9r5xBgcc9KTxlOLjssEbDgO0J.jpg,1.840000,21000000.0
4,M3GAN,Science Fiction-Horror-Comedy,en,A brilliant toy company roboticist uses artifi...,1836.162,Other,2022-12-28,12000000.0,167643991.0,102.0,7.510,1794.0,/d9nBoowhjiiYc4FBNtQkPY7c11H.jpg,13.970333,155643991.0
5,Knock at the Cabin,Horror-Mystery-Thriller,en,While vacationing at a remote cabin a young gi...,1677.354,Other,2023-02-01,20000000.0,52000000.0,100.0,6.525,546.0,/dm06L9pxDOL9jNSK4Cb6y139rrG.jpg,2.600000,32000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717709,Requiem,Comedy,en,Some neighbors come to the old woman's house a...,0.600,Other,2018-11-24,500.0,500.0,10.0,0.000,0.0,/mvhcwiDeqc9gRvVsm00eWGC9Bqj.jpg,1.000000,0.0
719598,Rejects,Action-Drama-Science Fiction,en,Guns cash and four friends who have the wrong ...,0.600,Other,2021-07-30,500.0,500.0,13.0,8.000,1.0,/d6fRRziWTkgD1xSKWtQkVIozJY6.jpg,1.000000,0.0
720251,A Brave New World,Science Fiction-Horror,ko,A geek is left home alone and his negligence c...,0.600,Other,2012-04-05,1600000.0,636990.0,41.0,0.000,0.0,/i8qvcAmJfmcycbEQTVsZP9q2qi6.jpg,0.398119,-963010.0
720530,Gal Dil Di,Music,pa,Gal Dil Di Directed By - Parry Sandhu & Harja...,0.600,Other,2020-11-03,500.0,700.0,4.0,0.000,0.0,/yeBgOwDlsGrgIMf8j8fsuJjPiLJ.jpg,1.400000,200.0


In [20]:
genres_df = df["genres"].str.split("-", expand=True)
genres_df.columns = [f"genre_{i+1}" for i in range(genres_df.shape[1])]

# join the new columns with the original DataFrame
df = pd.concat([df, genres_df], axis=1)

# display the resulting DataFrame
df

Unnamed: 0,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,...,poster_path,percent_return,profit,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,Black Panther: Wakanda Forever,Action-Adventure-Science Fiction,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,3952.862,Marvel Studios,2022-11-09,250000000.0,855099029.0,162.0,...,/sv1xJUazXeYqALzczSZ3O6nkH75.jpg,3.420396,605099029.0,Action,Adventure,Science Fiction,,,,
1,Puss in Boots: The Last Wish,Animation-Adventure-Comedy,en,Puss in Boots discovers that his passion for a...,3351.139,Other,2022-12-07,90000000.0,442000000.0,103.0,...,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg,4.911111,352000000.0,Animation,Adventure,Comedy,,,,
2,Plane,Action-Adventure-Thriller,en,After a heroic job of successfully landing his...,3337.265,Other,2023-01-12,25000000.0,46000000.0,107.0,...,/qi9r5xBgcc9KTxlOLjssEbDgO0J.jpg,1.840000,21000000.0,Action,Adventure,Thriller,,,,
4,M3GAN,Science Fiction-Horror-Comedy,en,A brilliant toy company roboticist uses artifi...,1836.162,Other,2022-12-28,12000000.0,167643991.0,102.0,...,/d9nBoowhjiiYc4FBNtQkPY7c11H.jpg,13.970333,155643991.0,Science Fiction,Horror,Comedy,,,,
5,Knock at the Cabin,Horror-Mystery-Thriller,en,While vacationing at a remote cabin a young gi...,1677.354,Other,2023-02-01,20000000.0,52000000.0,100.0,...,/dm06L9pxDOL9jNSK4Cb6y139rrG.jpg,2.600000,32000000.0,Horror,Mystery,Thriller,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717709,Requiem,Comedy,en,Some neighbors come to the old woman's house a...,0.600,Other,2018-11-24,500.0,500.0,10.0,...,/mvhcwiDeqc9gRvVsm00eWGC9Bqj.jpg,1.000000,0.0,Comedy,,,,,,
719598,Rejects,Action-Drama-Science Fiction,en,Guns cash and four friends who have the wrong ...,0.600,Other,2021-07-30,500.0,500.0,13.0,...,/d6fRRziWTkgD1xSKWtQkVIozJY6.jpg,1.000000,0.0,Action,Drama,Science Fiction,,,,
720251,A Brave New World,Science Fiction-Horror,ko,A geek is left home alone and his negligence c...,0.600,Other,2012-04-05,1600000.0,636990.0,41.0,...,/i8qvcAmJfmcycbEQTVsZP9q2qi6.jpg,0.398119,-963010.0,Science Fiction,Horror,,,,,
720530,Gal Dil Di,Music,pa,Gal Dil Di Directed By - Parry Sandhu & Harja...,0.600,Other,2020-11-03,500.0,700.0,4.0,...,/yeBgOwDlsGrgIMf8j8fsuJjPiLJ.jpg,1.400000,200.0,Music,,,,,,


In [21]:
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [22]:
df['month'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[1] if x != np.nan else np.nan)

In [23]:
df['day'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[2] if x != np.nan else np.nan)

In [24]:
df

Unnamed: 0,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,...,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,year,month,day
0,Black Panther: Wakanda Forever,Action-Adventure-Science Fiction,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,3952.862,Marvel Studios,2022-11-09,250000000.0,855099029.0,162.0,...,Action,Adventure,Science Fiction,,,,,2022,11,09 00:00:00
1,Puss in Boots: The Last Wish,Animation-Adventure-Comedy,en,Puss in Boots discovers that his passion for a...,3351.139,Other,2022-12-07,90000000.0,442000000.0,103.0,...,Animation,Adventure,Comedy,,,,,2022,12,07 00:00:00
2,Plane,Action-Adventure-Thriller,en,After a heroic job of successfully landing his...,3337.265,Other,2023-01-12,25000000.0,46000000.0,107.0,...,Action,Adventure,Thriller,,,,,2023,01,12 00:00:00
4,M3GAN,Science Fiction-Horror-Comedy,en,A brilliant toy company roboticist uses artifi...,1836.162,Other,2022-12-28,12000000.0,167643991.0,102.0,...,Science Fiction,Horror,Comedy,,,,,2022,12,28 00:00:00
5,Knock at the Cabin,Horror-Mystery-Thriller,en,While vacationing at a remote cabin a young gi...,1677.354,Other,2023-02-01,20000000.0,52000000.0,100.0,...,Horror,Mystery,Thriller,,,,,2023,02,01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717709,Requiem,Comedy,en,Some neighbors come to the old woman's house a...,0.600,Other,2018-11-24,500.0,500.0,10.0,...,Comedy,,,,,,,2018,11,24 00:00:00
719598,Rejects,Action-Drama-Science Fiction,en,Guns cash and four friends who have the wrong ...,0.600,Other,2021-07-30,500.0,500.0,13.0,...,Action,Drama,Science Fiction,,,,,2021,07,30 00:00:00
720251,A Brave New World,Science Fiction-Horror,ko,A geek is left home alone and his negligence c...,0.600,Other,2012-04-05,1600000.0,636990.0,41.0,...,Science Fiction,Horror,,,,,,2012,04,05 00:00:00
720530,Gal Dil Di,Music,pa,Gal Dil Di Directed By - Parry Sandhu & Harja...,0.600,Other,2020-11-03,500.0,700.0,4.0,...,Music,,,,,,,2020,11,03 00:00:00


In [25]:
clean_df = df.drop(['genre_3','genre_4','genre_5','genre_6','genre_7'], axis=1)

In [26]:
clean_df = clean_df.drop(['genres'], axis=1)

In [27]:
clean_df

Unnamed: 0,title,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,vote_average,vote_count,poster_path,percent_return,profit,genre_1,genre_2,year,month,day
0,Black Panther: Wakanda Forever,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,3952.862,Marvel Studios,2022-11-09,250000000.0,855099029.0,162.0,7.403,3586.0,/sv1xJUazXeYqALzczSZ3O6nkH75.jpg,3.420396,605099029.0,Action,Adventure,2022,11,09 00:00:00
1,Puss in Boots: The Last Wish,en,Puss in Boots discovers that his passion for a...,3351.139,Other,2022-12-07,90000000.0,442000000.0,103.0,8.461,4047.0,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg,4.911111,352000000.0,Animation,Adventure,2022,12,07 00:00:00
2,Plane,en,After a heroic job of successfully landing his...,3337.265,Other,2023-01-12,25000000.0,46000000.0,107.0,6.851,652.0,/qi9r5xBgcc9KTxlOLjssEbDgO0J.jpg,1.840000,21000000.0,Action,Adventure,2023,01,12 00:00:00
4,M3GAN,en,A brilliant toy company roboticist uses artifi...,1836.162,Other,2022-12-28,12000000.0,167643991.0,102.0,7.510,1794.0,/d9nBoowhjiiYc4FBNtQkPY7c11H.jpg,13.970333,155643991.0,Science Fiction,Horror,2022,12,28 00:00:00
5,Knock at the Cabin,en,While vacationing at a remote cabin a young gi...,1677.354,Other,2023-02-01,20000000.0,52000000.0,100.0,6.525,546.0,/dm06L9pxDOL9jNSK4Cb6y139rrG.jpg,2.600000,32000000.0,Horror,Mystery,2023,02,01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717709,Requiem,en,Some neighbors come to the old woman's house a...,0.600,Other,2018-11-24,500.0,500.0,10.0,0.000,0.0,/mvhcwiDeqc9gRvVsm00eWGC9Bqj.jpg,1.000000,0.0,Comedy,,2018,11,24 00:00:00
719598,Rejects,en,Guns cash and four friends who have the wrong ...,0.600,Other,2021-07-30,500.0,500.0,13.0,8.000,1.0,/d6fRRziWTkgD1xSKWtQkVIozJY6.jpg,1.000000,0.0,Action,Drama,2021,07,30 00:00:00
720251,A Brave New World,ko,A geek is left home alone and his negligence c...,0.600,Other,2012-04-05,1600000.0,636990.0,41.0,0.000,0.0,/i8qvcAmJfmcycbEQTVsZP9q2qi6.jpg,0.398119,-963010.0,Science Fiction,Horror,2012,04,05 00:00:00
720530,Gal Dil Di,pa,Gal Dil Di Directed By - Parry Sandhu & Harja...,0.600,Other,2020-11-03,500.0,700.0,4.0,0.000,0.0,/yeBgOwDlsGrgIMf8j8fsuJjPiLJ.jpg,1.400000,200.0,Music,,2020,11,03 00:00:00


In [28]:
clean_df.to_csv('cleaned_movie.csv')