In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Data Cleaning (KaggleMovies.csv)

In [24]:
# Display floats in fixed-point notation (six decimals, the '{:f}' default).
pd.set_option('display.float_format', '{:f}'.format)
# Load the Kaggle movies dataset from the local Datasets folder.
df = pd.read_csv('Datasets/KaggleMovies.csv')

In [25]:
# Rename the raw lower-case CSV headers to capitalised display names.
# Target names must not carry leading/trailing whitespace: a name such as
# ' Gross Revenue' makes df['Gross Revenue'] raise KeyError.
column_mapping = {
    'name': 'Name',
    'rating': 'Rating',
    'genre': 'Genre',
    'year': 'Year',
    'released': 'Released',
    'score': 'Score',
    'votes': 'Votes',
    'director': 'Director',
    'writer': 'Writer',
    'star': 'Star',
    'country': 'Country',
    'budget': 'Budget',
    'gross': 'Gross Revenue',
    'company': 'Company',
    'runtime': 'Runtime',
}

df = df.rename(columns=column_mapping)
# Summary statistics of the numeric columns as a quick sanity check.
df.describe()

Unnamed: 0,Year,Score,Votes,Budget,Gross Revenue,Runtime
count,7668.0,7665.0,7665.0,5497.0,7479.0,7664.0
mean,2000.405451,6.390411,88108.504762,35589876.192651,78500541.017783,107.261613
std,11.153508,0.968842,163323.76391,41457296.601931,165725124.318757,18.581247
min,1980.0,1.9,7.0,3000.0,309.0,55.0
25%,1991.0,5.8,9100.0,10000000.0,4532055.5,95.0
50%,2000.0,6.5,33000.0,20500000.0,20205757.0,104.0
75%,2010.0,7.1,93000.0,45000000.0,76016691.5,116.0
max,2020.0,9.3,2400000.0,356000000.0,2847246203.0,366.0


In [26]:
# Build a boolean mask of missing values: True marks a cell that is NaN/None.
isnas = pd.isna(df)
print(isnas)

       Name  Rating  Genre   Year  Released  Score  Votes  Director  Writer  \
0     False   False  False  False     False  False  False     False   False   
1     False   False  False  False     False  False  False     False   False   
2     False   False  False  False     False  False  False     False   False   
3     False   False  False  False     False  False  False     False   False   
4     False   False  False  False     False  False  False     False   False   
...     ...     ...    ...    ...       ...    ...    ...       ...     ...   
7663  False    True  False  False     False  False  False     False   False   
7664  False    True  False  False     False  False  False     False   False   
7665  False    True  False  False     False  False  False     False   False   
7666  False    True  False  False     False   True   True     False   False   
7667  False    True  False  False     False  False  False     False   False   

       Star  Country  Budget   Gross Revenue  Compa

In [27]:
# Drop every row that has at least one missing value, then rebuild the
# missing-value mask on the cleaned frame to inspect the result.
df = df.dropna(axis=0, how='any')
isnas2 = pd.isna(df)
print(isnas2)

       Name  Rating  Genre   Year  Released  Score  Votes  Director  Writer  \
0     False   False  False  False     False  False  False     False   False   
1     False   False  False  False     False  False  False     False   False   
2     False   False  False  False     False  False  False     False   False   
3     False   False  False  False     False  False  False     False   False   
4     False   False  False  False     False  False  False     False   False   
...     ...     ...    ...    ...       ...    ...    ...       ...     ...   
7648  False   False  False  False     False  False  False     False   False   
7649  False   False  False  False     False  False  False     False   False   
7650  False   False  False  False     False  False  False     False   False   
7651  False   False  False  False     False  False  False     False   False   
7652  False   False  False  False     False  False  False     False   False   

       Star  Country  Budget   Gross Revenue  Compa

In [28]:
# Votes, Budget and Runtime are cast to integers in one astype call.
df = df.astype({'Votes': 'int', 'Budget': 'int', 'Runtime': 'int'})


def _trim_trailing_zeros(x):
    """Return a float as text without trailing zeros (7.0 -> '7'); pass other values through."""
    if isinstance(x, float):
        return str(x).rstrip('0').rstrip('.')
    return x


# NOTE(review): after this step Score holds strings, not numbers; convert back
# (e.g. with pd.to_numeric) before using it in numeric analysis.
df['Score'] = df['Score'].apply(_trim_trailing_zeros)
# NOTE: the same clean-up is not applied to the gross-revenue column because
# looking it up as 'Gross Revenue' was reported to fail. If it still fails,
# inspect df.columns for stray whitespace in the column name.

# Data Cleaning (BoxOfficeCollections.csv)

In [29]:
# Load the box-office collections dataset from the local Datasets folder
# and display it as the cell output.
BoxOff_df = pd.read_csv(filepath_or_buffer='Datasets/BoxOfficeCollections.csv')
BoxOff_df

Unnamed: 0,Movie,Year,Score,Adjusted Score,Director,Cast,Consensus,Box Office Collection,Imdb_genre,IMDB Rating,metascore,time_minute,Votes
0,Hot Rod,2007,39,42.918000,Akiva Schaffer,"Andy Samberg, Jorma Taccone, Bill Hader, Danny...","For Rod Kimball (Andy Samberg), performing stu...",14371564.000000,Comedy,6.700000,43.000000,88.000000,84956.000000
1,Game Night,2018,85,99.838000,John Francis Daley,"Jason Bateman, Rachel McAdams, Kyle Chandler, ...",Max and Annie's weekly game night gets kicked ...,117378084.000000,Comedy,6.900000,66.000000,100.000000,229292.000000
2,The First Wives Club,1996,49,53.174000,Hugh Wilson,"Goldie Hawn, Bette Midler, Diane Keaton, Maggi...",Despondent over the marriage of her ex-husband...,181489203.000000,Comedy,6.400000,58.000000,103.000000,48413.000000
3,Scary Movie,2000,52,54.973000,Keenen Ivory Wayans,"Shawn Wayans, Marlon Wayans, Cheri Oteri, Shan...","Defying the very notion of good taste, Scary M...",277200000.000000,Comedy,6.200000,48.000000,88.000000,254927.000000
4,Blockers,2018,84,96.883000,Kay Cannon,"Leslie Mann, Ike Barinholtz, John Cena, Kathry...","Julie, Kayla and Sam are three high school sen...",94523781.000000,Comedy,6.200000,69.000000,102.000000,78498.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373,Welcome to Chechnya,2020,100,104.537000,David France,,Activists risk their lives to confront Russian...,,,,,,
1374,White Riot,2019,100,102.793000,Rubika Shah,"Pauline Black, Topper Headon, Mick Jones, Denn...",An exploration of how punk influenced politics...,185528.000000,,,,,
1375,The Woman Who Ran,2020,100,101.553000,Hong Sang-soo,"Kim Min-hee, Seo Young-hwa, Sae-Byuk Kim, Kwon...","The 24th feature from Hong Sangsoo, THE WOMAN ...",,,,,,
1376,Woodlands Dark and Days Bewitched: A History o...,2021,100,101.978000,Kier-La Janisse,"Piers Haggard, Lawrence Gordon Clark, Jeremy D...",WOODLANDS DARK AND DAYS BEWITCHED is the first...,,,,,,


In [30]:
# Show the raw column names before renaming.
print(BoxOff_df.columns.tolist())

# Only three headers deviate from the naming used by the other columns;
# map them to their display names.
column_mapping2 = dict(
    Imdb_genre='IMDB Genre',
    metascore='Metascore',
    time_minute='Time(Min)',
)

BoxOff_df = BoxOff_df.rename(columns=column_mapping2)
BoxOff_df.head()

['Movie', 'Year', 'Score', 'Adjusted Score', 'Director', 'Cast', 'Consensus', 'Box Office Collection', 'Imdb_genre', 'IMDB Rating', 'metascore', 'time_minute', 'Votes']


Unnamed: 0,Movie,Year,Score,Adjusted Score,Director,Cast,Consensus,Box Office Collection,IMDB Genre,IMDB Rating,Metascore,Time(Min),Votes
0,Hot Rod,2007,39,42.918,Akiva Schaffer,"Andy Samberg, Jorma Taccone, Bill Hader, Danny...","For Rod Kimball (Andy Samberg), performing stu...",14371564.0,Comedy,6.7,43.0,88.0,84956.0
1,Game Night,2018,85,99.838,John Francis Daley,"Jason Bateman, Rachel McAdams, Kyle Chandler, ...",Max and Annie's weekly game night gets kicked ...,117378084.0,Comedy,6.9,66.0,100.0,229292.0
2,The First Wives Club,1996,49,53.174,Hugh Wilson,"Goldie Hawn, Bette Midler, Diane Keaton, Maggi...",Despondent over the marriage of her ex-husband...,181489203.0,Comedy,6.4,58.0,103.0,48413.0
3,Scary Movie,2000,52,54.973,Keenen Ivory Wayans,"Shawn Wayans, Marlon Wayans, Cheri Oteri, Shan...","Defying the very notion of good taste, Scary M...",277200000.0,Comedy,6.2,48.0,88.0,254927.0
4,Blockers,2018,84,96.883,Kay Cannon,"Leslie Mann, Ike Barinholtz, John Cena, Kathry...","Julie, Kayla and Sam are three high school sen...",94523781.0,Comedy,6.2,69.0,102.0,78498.0


In [31]:

def _strip_trailing_zeros(value):
    """Return a float formatted as text without trailing zeros.

    Examples: 88.0 -> '88', 42.918 -> '42.918'.
    Non-float values and missing values (NaN) are returned unchanged. Leaving
    NaN untouched matters: str(nan) would produce the text 'nan', which
    dropna()/isna() no longer treat as missing.
    """
    if not isinstance(value, float) or pd.isna(value):
        return value
    return str(value).rstrip('0').rstrip('.')


# Remove trailing zeros from the float columns of the box-office frame.
# NOTE(review): the non-missing values in these columns become strings;
# convert back (e.g. pd.to_numeric) before numeric analysis.
for _column in (
    'Adjusted Score',
    'Box Office Collection',
    'IMDB Rating',
    'Metascore',
    'Time(Min)',
    'Votes',
):
    BoxOff_df[_column] = BoxOff_df[_column].apply(_strip_trailing_zeros)

In [32]:
# Drop every row that has at least one missing value, then rebuild the
# missing-value mask on the cleaned frame to inspect the result.
# Only real nulls (NaN/None) count as missing here; text such as the
# string 'nan' is treated as a regular value and is not dropped.
BoxOff_df = BoxOff_df.dropna(axis=0, how='any')
isnas3 = pd.isna(BoxOff_df)
print(isnas3)

      Movie   Year  Score  Adjusted Score  Director   Cast  Consensus  \
0     False  False  False           False     False  False      False   
1     False  False  False           False     False  False      False   
2     False  False  False           False     False  False      False   
3     False  False  False           False     False  False      False   
4     False  False  False           False     False  False      False   
...     ...    ...    ...             ...       ...    ...        ...   
1358  False  False  False           False     False  False      False   
1363  False  False  False           False     False  False      False   
1367  False  False  False           False     False  False      False   
1369  False  False  False           False     False  False      False   
1370  False  False  False           False     False  False      False   

      Box Office Collection  IMDB Genre  IMDB Rating  Metascore  Time(Min)  \
0                     False       False      