## Preprocessing

### Read the Dataset and Reorder the Columns

In [156]:
import pandas as pd
data = pd.read_csv("./streamingsites/streamingsites2.csv")

In [157]:
col_order = ["web_name","movie_or_tv", "title", "year_start", "year_end",\
             "seasons", "imdb_score", "rg_score", "rated", "genres"]
data = data[col_order]

In [181]:
print(data.shape)
data.head()

(27305, 10)


Unnamed: 0,web_name,movie_or_tv,title,year_start,year_end,seasons,imdb_score,rg_score,rated,genres
0,Netflix,Show,Trollhunters: Tales of Arcadia,2016,2018.0,3.0,8.4,68.0,7+ (TV-PG),"[Animation, Action & adventure, Comedy, Drama,..."
1,Netflix,Movie,The Other Side of the Wind,2018,,,6.8,68.0,18+ (R),"[Comedy, Independent, Drama]"
2,Netflix,Show,Patriot Act with Hasan Minhaj,2018,2020.0,6.0,8.2,69.0,18+ (TV-MA),[Comedy]
3,Netflix,Show,Formula 1: Drive to Survive,2019,2020.0,2.0,8.6,69.0,14+ (TV-14),"[Documentary, Sport]"
4,Netflix,Movie,Tamasha,2015,,,7.3,70.0,7+ (PG),"[Comedy, Drama, Romance]"


### Changing the data types

In [159]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27305 entries, 0 to 27304
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   web_name     27305 non-null  object 
 1   movie_or_tv  27305 non-null  object 
 2   title        27305 non-null  object 
 3   year_start   27305 non-null  int64  
 4   year_end     6147 non-null   float64
 5   seasons      6147 non-null   float64
 6   imdb_score   25242 non-null  float64
 7   rg_score     26708 non-null  float64
 8   rated        13156 non-null  object 
 9   genres       25495 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.1+ MB


In [160]:
data.loc[data['year_end'].notnull(), 'year_end'] = data.loc[data['year_end'].notnull(), 'year_end'].apply('int64')
data.loc[data['seasons'].notnull(), 'seasons'] = data.loc[data['seasons'].notnull(), 'seasons'].apply('int64')
data.loc[data['rg_score'].notnull(), 'rg_score'] = data.loc[data['rg_score'].notnull(), 'rg_score'].apply('int64')

### Rename the unique genres and change from string to list

In [161]:
data['genres'] = data['genres'].astype(str)
genres = list(map(lambda x: x.split(','), data['genres'].tolist()))
unique_genres = list(set([genre for row in genres for genre in row]))
sorted(unique_genres)

['action & adventure',
 'action-and-adventure',
 'animation',
 'anime',
 'biography',
 'children',
 'comedy',
 'crime',
 'cult',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'food',
 'game show',
 'game-show',
 'history',
 'home & garden',
 'home-and-garden',
 'horror',
 'independent',
 'lgbtq',
 'musical',
 'mystery',
 'nan',
 'reality',
 'romance',
 'science-fiction',
 'sport',
 'stand-up & talk',
 'stand-up-and-talk',
 'thriller',
 'travel']

In [164]:
data['genres'] = data['genres'].str.replace('action-and-adventure','action & adventure')
data['genres'] = data['genres'].str.replace('game-show','game show')
data['genres'] = data['genres'].str.replace('home-and-garden','home & garden')
data['genres'] = data['genres'].str.replace('stand-up-and-talk','stand-up & talk')
data['genres'] = data['genres'].str.replace('nan','NaN')

In [166]:
# Changing the column of genres from string to lists
data['genres'] = data['genres'].apply(lambda x: x.split(','))

### Capitalizing the texts

In [169]:
data['web_name'] = list(map(lambda x: x.capitalize(),data['web_name']))
data['movie_or_tv'] = list(map(lambda x: x.capitalize(),data['movie_or_tv']))
data['genres'] = list(map(lambda rows: list(map(lambda x: x.capitalize(), rows)), data['genres']))

In [172]:
data.head()

Unnamed: 0,web_name,movie_or_tv,title,year_start,year_end,seasons,imdb_score,rg_score,rated,genres
0,Netflix,Show,Trollhunters: Tales of Arcadia,2016,2018.0,3.0,8.4,68.0,7+ (TV-PG),"[Animation, Action & adventure, Comedy, Drama,..."
1,Netflix,Movie,The Other Side of the Wind,2018,,,6.8,68.0,18+ (R),"[Comedy, Independent, Drama]"
2,Netflix,Show,Patriot Act with Hasan Minhaj,2018,2020.0,6.0,8.2,69.0,18+ (TV-MA),[Comedy]
3,Netflix,Show,Formula 1: Drive to Survive,2019,2020.0,2.0,8.6,69.0,14+ (TV-14),"[Documentary, Sport]"
4,Netflix,Movie,Tamasha,2015,,,7.3,70.0,7+ (PG),"[Comedy, Drama, Romance]"


In [173]:
# data.to_csv("preprocesseddata.csv")

## EDA

In [182]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)

In [178]:
df = pd.read_csv("preprocesseddata.csv", index_col = 0)

In [179]:
df.head()

Unnamed: 0,web_name,movie_or_tv,title,year_start,year_end,seasons,imdb_score,rg_score,rated,genres
0,Netflix,Show,Trollhunters: Tales of Arcadia,2016,2018.0,3.0,8.4,68.0,7+ (TV-PG),"['Animation', 'Action & adventure', 'Comedy', ..."
1,Netflix,Movie,The Other Side of the Wind,2018,,,6.8,68.0,18+ (R),"['Comedy', 'Independent', 'Drama']"
2,Netflix,Show,Patriot Act with Hasan Minhaj,2018,2020.0,6.0,8.2,69.0,18+ (TV-MA),['Comedy']
3,Netflix,Show,Formula 1: Drive to Survive,2019,2020.0,2.0,8.6,69.0,14+ (TV-14),"['Documentary', 'Sport']"
4,Netflix,Movie,Tamasha,2015,,,7.3,70.0,7+ (PG),"['Comedy', 'Drama', 'Romance']"


In [186]:
df.describe()

Unnamed: 0,year_start,year_end,seasons,imdb_score,rg_score
count,27305.0,6147.0,6147.0,25242.0,26708.0
mean,2004.969529,2015.633968,3.257199,6.213596,47.600607
std,19.21107,9.274482,5.609403,1.386696,17.197285
min,1901.0,1939.0,1.0,1.0,10.0
25%,2003.0,2015.0,1.0,5.4,35.0
50%,2013.0,2020.0,2.0,6.4,45.0
75%,2017.0,2020.0,4.0,7.2,60.0
max,2020.0,2020.0,184.0,9.7,100.0
