In [1]:
import matplotlib as mpl
from matplotlib.colors import ListedColormap
# Notebook-wide matplotlib defaults, applied in a single update.
mpl.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 150,  # 200 e.g. is really fine, but slower
    'axes.edgecolor': '#FA6E4F',
    'font.family': 'monospace',
})
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation warnings from pandas/numpy — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output
import ott_cleaning

In [3]:
# Load the catalogue; the first CSV column is used as the row index.
ott = pd.read_csv(
    'data/ott_fillcountries.csv',
    index_col=0,
)

In [4]:
# Preview the first two rows to sanity-check the load.
ott.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Netflix
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Netflix


In [5]:
# These columns are not of interest for this analysis, so remove them up front.
drop_cols = ['director', 'cast', 'date_added']
ott = ott.drop(columns=drop_cols)

In [6]:
# Per-column null counts, computed once and reused for both the filter and the
# displayed values (the original evaluated `ott.isnull().sum()` twice).
null_counts = ott.isnull().sum()
pd.DataFrame(null_counts[null_counts > 0])  # only the columns that contain nulls

Unnamed: 0,0
country,1904
rating,344
duration,3


In [7]:
# Replace missing countries with the placeholder string 'NA'.
ott['country'] = ott['country'].fillna('NA')

In [8]:
# Replace missing ratings with the placeholder string 'NA'.
ott['rating'] = ott['rating'].fillna('NA')

In [9]:
# Apply ott_cleaning.split_duration element-wise to (duration, type) pairs.
# Judging by the preview rows, movies get a numeric runtime and TV shows NaN — confirm in ott_cleaning.
ott['duration_min'] = np.vectorize(ott_cleaning.split_duration)(
    ott['duration'],
    ott['type'],
)

In [10]:
# Apply ott_cleaning.split_seasons element-wise to (duration, type) pairs.
# Judging by the preview rows, TV shows get a season count and movies NaN — confirm in ott_cleaning.
ott['seasons'] = np.vectorize(ott_cleaning.split_seasons)(
    ott['duration'],
    ott['type'],
)

In [11]:
# Apply ott_cleaning.num_vals to every country string
# (presumably a count of the listed values — confirm in ott_cleaning).
ott['nCountry'] = np.vectorize(ott_cleaning.num_vals)(ott['country'])

In [12]:
# Apply ott_cleaning.num_vals to every genre string
# (presumably a count of the listed values — confirm in ott_cleaning).
ott['nGenres'] = np.vectorize(ott_cleaning.num_vals)(ott['listed_in'])

In [13]:
# Apply ott_cleaning.convert_rating to every rating code; the preview rows show
# audience labels such as 'Teens' and 'Adults' as results.
ott['New Rating'] = np.vectorize(ott_cleaning.convert_rating)(ott['rating'])

In [14]:
# Run ott_cleaning.str2list on each country string (the preview shows list values).
ott['countries'] = ott['country'].map(ott_cleaning.str2list)

In [15]:
# Run ott_cleaning.str2list on each genre string (the preview shows list values).
ott['genres'] = ott['listed_in'].map(ott_cleaning.str2list)

In [16]:
# Preview the first four rows, including the newly derived columns.
ott.head(4)

Unnamed: 0,show_id,type,title,country,release_year,rating,duration,listed_in,description,platform,duration_min,seasons,nCountry,nGenres,New Rating,countries,genres
0,s1,Movie,Dick Johnson Is Dead,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Netflix,90.0,,1,1,Teens,[United States],[Documentaries]
1,s2,TV Show,Blood & Water,South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Netflix,,2.0,1,3,Adults,[South Africa],"[International TV Shows, TV Dramas, TV Mysteries]"
2,s3,TV Show,Ganglands,France,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Netflix,,1.0,1,3,Adults,[France],"[Crime TV Shows, International TV Shows, TV Ac..."
3,s4,TV Show,Jailbirds New Orleans,United States,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Netflix,,1.0,1,2,Adults,[United States],"[Docuseries, Reality TV]"


In [17]:
# List the current column names before pruning.
ott.columns

Index(['show_id', 'type', 'title', 'country', 'release_year', 'rating',
       'duration', 'listed_in', 'description', 'platform', 'duration_min',
       'seasons', 'nCountry', 'nGenres', 'New Rating', 'countries', 'genres'],
      dtype='object')

In [18]:
# Remove the raw 'rating' and 'duration' columns together with the list-valued
# 'countries' and 'genres' columns.
drop_cols = ['rating', 'duration', 'countries', 'genres']
ott = ott.drop(columns=drop_cols)

In [19]:
# Confirm which columns remain after the drop.
ott.columns

Index(['show_id', 'type', 'title', 'country', 'release_year', 'listed_in',
       'description', 'platform', 'duration_min', 'seasons', 'nCountry',
       'nGenres', 'New Rating'],
      dtype='object')

In [20]:
# Give the remaining columns their final names.
ott = ott.rename(
    columns={
        'listed_in': 'genres',
        'duration_min': 'duration',
        'nCountry': 'num_country',
        'nGenres': 'num_genres',
        'New Rating': 'rating',
    }
)

In [21]:
# Show the final column names.
ott.columns

Index(['show_id', 'type', 'title', 'country', 'release_year', 'genres',
       'description', 'platform', 'duration', 'seasons', 'num_country',
       'num_genres', 'rating'],
      dtype='object')

In [23]:
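# Export of the cleaned frame is currently disabled; uncomment the line below to write it.
# NOTE: the 'NA' placeholder stored in `country` is parsed back as NaN by pd.read_csv's
# default NA handling — reload with keep_default_na=False (or na_filter=False) to keep it.
# ott.to_csv('ott_tv_movie.csv')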