In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Data Cleaning (KaggleMovies.csv)

In [16]:
# Load the Kaggle movies dataset from the local Datasets folder.
df = pd.read_csv('Datasets/KaggleMovies.csv')
# Render floats in fixed-point notation (six decimals) rather than scientific
# notation. The fully qualified option name is used because pandas resolves
# abbreviated option names by pattern matching, which can stop working if a
# later release adds another option whose name contains "float_format".
pd.set_option('display.float_format', '{:f}'.format)

In [18]:
# Map the dataset's lowercase column names to capitalised display names.
# The label for 'gross' is written without any leading whitespace so that
# df['Gross Revenue'] resolves; a stray leading space in the label makes the
# column impossible to look up by its intended name.
column_mapping = {
    'name': 'Name',
    'rating': 'Rating',
    'genre': 'Genre',
    'year': 'Year',
    'released': 'Released',
    'score': 'Score',
    'votes': 'Votes',
    'director': 'Director',
    'writer': 'Writer',
    'star': 'Star',
    'country': 'Country',
    'budget': 'Budget',
    'gross': 'Gross Revenue',
    'company': 'Company',
    'runtime': 'Runtime',
}

# rename() returns a new frame; keys absent from the frame are ignored.
df = df.rename(columns=column_mapping)

In [19]:
# Build a boolean mask (same shape as df) that is True wherever a cell is missing,
# then print it.
isnas = df.isnull()
print(isnas)

       Name  Rating  Genre   Year  Released  Score  Votes  Director  Writer  \
0     False   False  False  False     False  False  False     False   False   
1     False   False  False  False     False  False  False     False   False   
2     False   False  False  False     False  False  False     False   False   
3     False   False  False  False     False  False  False     False   False   
4     False   False  False  False     False  False  False     False   False   
...     ...     ...    ...    ...       ...    ...    ...       ...     ...   
7663  False    True  False  False     False  False  False     False   False   
7664  False    True  False  False     False  False  False     False   False   
7665  False    True  False  False     False  False  False     False   False   
7666  False    True  False  False     False   True   True     False   False   
7667  False    True  False  False     False  False  False     False   False   

       Star  Country  Budget   Gross Revenue  Compa

In [20]:
# Discard every row that has a missing value in any column, then rebuild the
# missing-value mask on the result and print it.
df = df.dropna(axis=0, how='any')
isnas2 = df.isnull()
print(isnas2)

       Name  Rating  Genre   Year  Released  Score  Votes  Director  Writer  \
0     False   False  False  False     False  False  False     False   False   
1     False   False  False  False     False  False  False     False   False   
2     False   False  False  False     False  False  False     False   False   
3     False   False  False  False     False  False  False     False   False   
4     False   False  False  False     False  False  False     False   False   
...     ...     ...    ...    ...       ...    ...    ...       ...     ...   
7648  False   False  False  False     False  False  False     False   False   
7649  False   False  False  False     False  False  False     False   False   
7650  False   False  False  False     False  False  False     False   False   
7651  False   False  False  False     False  False  False     False   False   
7652  False   False  False  False     False  False  False     False   False   

       Star  Country  Budget   Gross Revenue  Compa

In [21]:
# Cast the count-like columns to integers.
for int_column in ('Votes', 'Budget', 'Runtime'):
    df[int_column] = df[int_column].astype('int')


def _trim_float_text(value):
    """Return a float as text without trailing zeros; pass other values through."""
    if isinstance(value, float):
        return str(value).rstrip('0').rstrip('.')
    return value


# Float scores become strings such as '8.4'; non-float values are left as-is.
df['Score'] = df['Score'].apply(_trim_float_text)
# NOTE: the gross-revenue column is left untouched here. The original author
# reported that looking it up as 'Gross Revenue' failed; verify the exact column
# label (including any surrounding whitespace) before enabling the line below.
#df['Gross Revenue'] = df['Gross Revenue'].apply(lambda x: str(x).rstrip('0').rstrip('.') if isinstance(x, float) else x)
df

Unnamed: 0,Name,Rating,Genre,Year,Released,Score,Votes,Director,Writer,Star,Country,Budget,Gross Revenue,Company,Runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772.000000,Warner Bros.,146
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106.000000,Columbia Pictures,104
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067.000000,Lucasfilm,124
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539.000000,Paramount Pictures,88
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344.000000,Orion Pictures,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7648,Bad Boys for Life,R,Action,2020,"January 17, 2020 (United States)",6.6,140000,Adil El Arbi,Peter Craig,Will Smith,United States,90000000,426505244.000000,Columbia Pictures,124
7649,Sonic the Hedgehog,PG,Action,2020,"February 14, 2020 (United States)",6.5,102000,Jeff Fowler,Pat Casey,Ben Schwartz,United States,85000000,319715683.000000,Paramount Pictures,99
7650,Dolittle,PG,Adventure,2020,"January 17, 2020 (United States)",5.6,53000,Stephen Gaghan,Stephen Gaghan,Robert Downey Jr.,United States,175000000,245487753.000000,Universal Pictures,101
7651,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000,Chris Sanders,Michael Green,Harrison Ford,Canada,135000000,111105497.000000,20th Century Studios,100


# Data Cleaning (UCI Dataset)

In [None]:

# NOTE(review): every line in this cell is commented out, so nothing here runs.
# It looks like a draft for loading 'Datasets/actors.html' into a single frame;
# confirm whether it is still needed.
# # Specify the URL or file path of the HTML document containing the dataset
# url_or_path = 'Datasets/actors.html'

# # Read HTML tables into a list of DataFrame objects
# dfs = pd.read_html(url_or_path)

# total_dataframe = []

# for i, df in enumerate(dfs):
#     total_dataframe.append(dfs[i]) 

# final_df = pd.concat(total_dataframe, ignore_index=True)
# final_df = final_df[['stage', 'dow', 'birth', 'giv', 'gen', 'dob', 'dod', 'type', 'orig', 'pict', 'notes']]
# final_df.dropna(inplace=True)
# final_df.reset_index(drop=True, inplace=True)

In [29]:
# NOTE: UCI dataset not complete. The HTML file is formatted in a way that makes
# creating a dataframe from it difficult to accomplish.

url = 'Datasets/main.html'

# read_html returns a list of DataFrames parsed from the document's tables.
# Each table has every row containing a NaN removed; dropna() returns a new
# frame, so no in-place mutation is needed.
# No loop variable here is named `df`: that name holds the Kaggle movies frame
# loaded earlier in the notebook and must not be overwritten by this cell.
MAIN_df = [table.dropna() for table in pd.read_html(url)]

# List of the cleaned tables that is handed to concat.
total_dataframe = list(MAIN_df)

# Stack all tables row-wise with a fresh 0..n-1 index; columns that a table
# does not have are filled with NaN by concat.
combined = pd.concat(total_dataframe, ignore_index=True)

combined


  combined = pd.concat(total_dataframe, ignore_index=True)


Unnamed: 0,H,title,@1922,D:Hitchcock,prds,st,prc,prc.1,cat,aw,...,RGo,D:Gosnell,RWd,D:Weide,SWm,D:Weisman,VFu,D:Funari,Z99,D:UnYear99
0,H33,T:Rebecca,,D:Hitchcock,P:Selznick,,,,,"AA, AAN dir, H****",...,,,,,,,,,,
1,H34,T:Foreign Correspondent,,D:Hitchcock,P:Wanger,,,,,"H****, AAN",...,,,,,,,,,,
2,H37,T:Saboteur,,D:Hitchcock,"P:F.Lloyd, Skirball",,,,,H***,...,,,,,,,,,,
3,H39,T:Lifeboat,,D:Hitchcock,P:MacGowan,,,,,"H**, AAN dir",...,,,,,,,,,,
4,H42,T:Spellbound,,D:Hitchcock,P:Selznick,,,,,"H**, AAN, AAN dir",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3661,,,,,,,,,,,...,,,,,,,,,,
3662,,,,,,,,,,,...,,,,,,,,,,
3663,,,,,,,,,,,...,,,,,,,,,,
3664,,,,,,,,,,,...,,,,,,,,,,


# Data Cleaning (BoxOfficeCollections.csv)

In [None]:
# Load the box-office collections dataset from the local Datasets folder.
BoxOff_df = pd.read_csv(filepath_or_buffer='Datasets/BoxOfficeCollections.csv')

In [None]:
# Show the column labels as they appear in the CSV before renaming.
print(BoxOff_df.columns.tolist())

# Old label -> new label for the columns whose names need tidying.
column_mapping2 = {
    'Imdb_genre': 'IMDB Genre',
    'metascore': 'Metascore',
    'time_minute': 'Time(Min)',
}

# Apply the mapping to the column axis and preview the first five rows.
BoxOff_df = BoxOff_df.rename(column_mapping2, axis='columns')
BoxOff_df.head(5)

In [None]:

def _strip_trailing_zeros(value):
    """Return a float as text without trailing zeros; pass other values through.

    Non-float values are returned unchanged. Missing values (NaN) are also
    returned unchanged: NaN is a float, and turning it into the string 'nan'
    would hide it from a later dropna(). Text in scientific notation (e.g.
    '1e+20') is not trimmed, because stripping zeros would alter the exponent.
    """
    if not isinstance(value, float) or pd.isna(value):
        return value
    text = str(value)
    if '.' in text and 'e' not in text.lower():
        text = text.rstrip('0').rstrip('.')
    return text


# Columns whose float values are rewritten as text without trailing zeros.
TRAILING_ZERO_COLUMNS = [
    'Adjusted Score',
    'Box Office Collection',
    'IMDB Rating',
    'Metascore',
    'Time(Min)',
    'Votes',
]

for column_name in TRAILING_ZERO_COLUMNS:
    BoxOff_df[column_name] = BoxOff_df[column_name].apply(_strip_trailing_zeros)

In [None]:
# Discard every row that has a missing value in any column, then build the
# missing-value mask on the result and print it.
BoxOff_df = BoxOff_df.dropna(axis=0, how='any')
isnas3 = BoxOff_df.isnull()
print(isnas3)