In [1]:
import pandas as pd
import gc

Load dataframes for inspection

In [2]:
# Read the four platform catalogues (Amazon Prime, Disney+, Hulu, Netflix).
# The order of the paths below decides which file lands in which variable.
amaz_df, disn_df, hulu_df, netf_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "MLOpsReviews/amazon_prime_titles.csv",
        "MLOpsReviews/disney_plus_titles.csv",
        "MLOpsReviews/hulu_titles.csv",
        "MLOpsReviews/netflix_titles.csv",
    )
)

# Preview one frame; substitute disn_df, hulu_df or netf_df to look at the others.
amaz_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


##### 1. Create id column using first letter of platform and contents of show_id column

In [3]:
# Rename show_id -> id and prepend the platform's first letter
# (a = Amazon, d = Disney, h = Hulu, n = Netflix), e.g. "s1" -> "as1".
for prefix, frame in (("a", amaz_df), ("d", disn_df), ("h", hulu_df), ("n", netf_df)):
    if "show_id" in frame.columns:
        # Rename in place so the column keeps its position and the notebook-level
        # names keep pointing at the same DataFrame objects.
        frame.rename(columns={"show_id": "id"}, inplace=True)
        frame["id"] = prefix + frame["id"]
    elif "id" not in frame.columns:
        raise KeyError(f"platform {prefix!r}: neither 'show_id' nor 'id' column found")
    # Otherwise show_id is already gone, i.e. this cell has run before; prefixing
    # again would produce ids such as "aas1", so the frame is left untouched.

# Keep the loop temporaries out of the notebook namespace.
del prefix, frame

##### 2. Fill NaN values in rating columns with "G" rating, for "General for All Audiences"

In [4]:
# Replace missing ratings with "G" in every platform frame.
for frame in (amaz_df, disn_df, hulu_df, netf_df):
    # Assign the filled column back to its frame. Calling fillna(inplace=True) on a
    # Series pulled out of the frame only changes that Series once pandas
    # Copy-on-Write is active, leaving the NaN values in the frame itself.
    frame["rating"] = frame["rating"].fillna("G")

# Keep the loop temporary out of the notebook namespace.
del frame

##### 3. Convert any dates to "YYYY-mm-dd" format

In [5]:
# Parse every platform's date_added column with pd.to_datetime, overwriting the
# original text column in each frame.
for frame in (amaz_df, disn_df, hulu_df, netf_df):
    frame["date_added"] = pd.to_datetime(frame["date_added"])

# Remove the loop temporary from the namespace and run a garbage-collection pass.
# The trailing semicolon keeps the collector's return value out of the cell output.
del frame
gc.collect();

##### 4. Turn all text fields to lowercase

In [6]:
def _lowercase_strings(frame):
    """Return a new DataFrame in which every ``str`` cell of ``frame`` is lowercased.

    Cells that are not ``str`` instances are passed through unchanged.
    """
    # DataFrame.map is the element-wise API from pandas 2.1 on, where applymap is
    # deprecated; older releases only provide applymap.
    elementwise = frame.map if hasattr(frame, "map") else frame.applymap
    return elementwise(lambda cell: cell.lower() if isinstance(cell, str) else cell)


amaz_df = _lowercase_strings(amaz_df)
disn_df = _lowercase_strings(disn_df)
hulu_df = _lowercase_strings(hulu_df)
netf_df = _lowercase_strings(netf_df)

##### 5. Split each duration column into two separate columns: duration_int and duration_type. The first must be of type int and hold the numeric part of the original duration value; the second must be a string holding the non-numeric part, i.e. the unit (min or seasons).

In [7]:
# Split "duration" (e.g. "113 min") into a nullable-integer amount and a text unit,
# then drop the original column.
for frame in (amaz_df, disn_df, hulu_df, netf_df):
    # n=1 splits on the first space only, so the result never has more than two
    # parts even if a unit itself contains a space.
    parts = frame["duration"].str.split(" ", n=1, expand=True)
    # Int16Dtype is pandas' nullable integer type, so rows without a duration
    # stay missing instead of forcing the column to float.
    frame["duration_int"] = parts[0].astype(pd.Int16Dtype())
    frame["duration_type"] = parts[1]
    frame.drop(columns=["duration"], inplace=True)

# Keep the loop temporaries out of the notebook namespace.
del frame, parts

In [8]:
# Show the first five rows of the Amazon frame after the steps above;
# substitute disn_df, hulu_df or netf_df to check the other platforms.
amaz_df.head(n=5)

Unnamed: 0,id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,duration_int,duration_type
0,as1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,113,min
1,as2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,110,min
2,as3,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,74,min
3,as4,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncé, britney...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",69,min
4,as5,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,45,min
