In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from collections import Counter
from itertools import compress

In [None]:
# Input locations of the original movie/series CSVs and of their newer "OTT" counterparts.
_INPUT_ROOT = '../input'
_MOVIES_CSV = 'MoviesOnStreamingPlatforms_updated.csv'

path_movies = f'{_INPUT_ROOT}/movies-on-netflix-prime-video-hulu-and-disney/{_MOVIES_CSV}'
path_series = f'{_INPUT_ROOT}/tv-shows-on-netflix-prime-video-hulu-and-disney/tv_shows.csv'
path_movies_ott = f'{_INPUT_ROOT}/movies-on-ott-platforms/{_MOVIES_CSV}'
path_series_ott = f'{_INPUT_ROOT}/tv-shows-on-ott-platforms/TV_Shows.csv'

# Comparing with the New Datasets

In [None]:
# Read the original movie and series tables keyed by Title, discard the
# leftover identifier columns, and give the series 'type' column the same
# capitalised name the movie table uses.
df_movies = (
    pd.read_csv(path_movies, index_col='Title')
    .drop(columns=['ID', 'Unnamed: 0'])
)
df_series = (
    pd.read_csv(path_series, index_col='Title')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'type': 'Type'})
)

In [None]:
print(df_movies.columns)
print(df_series.columns)

In [None]:
# Read the newer movie table keyed by Title, without its 'ID' column, and show
# the columns that remain.
df_movies_ott = pd.read_csv(path_movies_ott, index_col='Title').drop(columns='ID')
df_movies_ott.columns

In [None]:
df_movies_ott.isnull().sum() - df_movies.isnull().sum()

In [None]:
df_movies.index.symmetric_difference(df_movies_ott.index).to_list()

In [None]:
df_movies_ott.loc['01:54',:]

In [None]:
# Read the newer series table keyed by Title, drop the leftover 'Unnamed: 0'
# column, rename 'type' to 'Type', and show the resulting columns.
df_series_ott = (
    pd.read_csv(path_series_ott, index_col='Title')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'type': 'Type'})
)
df_series_ott.columns

In [None]:
df_series_ott.isnull().sum() - df_series.isnull().sum()

In [None]:
df_series.index.symmetric_difference(df_series_ott.index).to_list()

## Conclusion:
* Both Series datasets are missing the 'Directors', 'Genres', 'Country', 'Language', and 'Runtime' columns.
* Both Movie datasets have the same missing values.
* The original Series dataset has one fewer missing value than the new Series OTT dataset.
* The new Movie OTT dataset has some extra movies.
* Both Series datasets have the same series.

**Therefore we combine the original Series dataset with the new Movies OTT dataset.**

# Combining the Movies OTT Dataset with the Original Series Dataset

In [None]:
df_combined = pd.concat([df_movies_ott, df_series])

The "Type" variable indicates whether it's a movie or a series.
* 0: movie
* 1: series

In [None]:
df_combined.shape

In [None]:
# Keep one row per Title: rows whose index label has already appeared are dropped.
# NOTE(review): the movie rows were concatenated ahead of the series rows, so a
# series that shares its title with a movie is discarded here - confirm this is intended.
df_combined = df_combined[~df_combined.index.duplicated()]

In [None]:
df_combined.shape

In [None]:
df_combined.isnull().sum()

# Cleaning the Combined Dataset

In [None]:
# Strip the trailing '%' from the Rotten Tomatoes scores and store them as floats.
df_combined["Rotten Tomatoes"] = (
    df_combined["Rotten Tomatoes"].str.rstrip("%").astype("float")
)

# Normalise Age: treat "all" as "1+", remove the "+" sign and store as float.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection; that chained in-place
# form acts on a temporary object under pandas Copy-on-Write and would leave
# df_combined unchanged.
df_combined["Age"] = (
    df_combined["Age"]
    .replace({"all": "1+"})
    .str.replace("+", "", regex=False)
    .astype("float")
)

# Type is 0 for movies; every other value is labelled 'series'.
# Vectorised equivalent of applying a per-row function with axis=1.
df_combined["Type"] = np.where(df_combined["Type"] == 0, "movie", "series")

In [None]:
#df_combined.to_csv('./movies_and_series_clean.csv')

# Filling the Nulls

In [None]:
%%capture
!pip install git+https://github.com/alberanid/imdbpy

In [None]:
from imdb import IMDb
from tqdm.notebook import tqdm

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [None]:
# Reload the cleaned table (minus the 'Rotten Tomatoes' column) and collect
# every row that has a missing value in at least one column.
path_combined = '../input/streaming-services-nulls-filled/movies_and_series_clean.csv'
df_combined = pd.read_csv(path_combined, index_col='Title').drop(columns='Rotten Tomatoes')
df_nulls = df_combined.loc[df_combined.isna().any(axis='columns')]

In [None]:
# df_nulls was obtained by boolean-indexing df_combined. Work on an explicit
# copy so the column assignment below targets an independent frame rather than
# a slice (which triggers SettingWithCopyWarning).
df_nulls = df_nulls.copy()

# Known ages become strings, missing ages stay NaN. The whole column is
# replaced (df['Age'] = ...) rather than written through .loc[:, 'Age'], so
# the float column is swapped for an object column instead of having strings
# forced into a float block.
df_nulls['Age'] = np.where(pd.isna(df_nulls['Age']), df_nulls['Age'], df_nulls['Age'].astype(str))

# Split the rows into five consecutive batches. The last batch runs to the end
# of the frame so no row is silently left out if the row count changes.
batch_bounds = [0, 3800, 7600, 11400, 15000, len(df_nulls)]
df_nulls1, df_nulls2, df_nulls3, df_nulls4, df_nulls5 = (
    df_nulls.iloc[start:stop, :].copy()
    for start, stop in zip(batch_bounds, batch_bounds[1:])
)

# Sanity check: the batches must account for every row exactly once.
batched_rows = df_nulls1.shape[0] + df_nulls2.shape[0] + df_nulls3.shape[0] + df_nulls4.shape[0] + df_nulls5.shape[0]
print(batched_rows)
assert batched_rows == len(df_nulls), "batches do not cover every row of df_nulls"

In [None]:
def _fill_if_missing(df, index, row, column, extract):
    """Set ``df.at[index, column]`` to ``extract()`` when ``row[column]`` is null.

    Lookup failures (absent key, ``None`` payload, empty list, unparsable
    number) are ignored on purpose, leaving the cell null.
    """
    if not pd.isna(row[column]):
        return
    try:
        df.at[index, column] = extract()
    except (KeyError, TypeError, IndexError, ValueError):
        pass


def fill_na(df):
    """Fill null cells of ``df`` in place using IMDb lookups.

    For every row, IMDb is searched for ``"<index label> (<Year>)"`` and, if
    that returns nothing, for the index label alone. The first search hit is
    fetched and used to fill whichever of the columns 'Age', 'IMDb',
    'Directors', 'Genres', 'Country', 'Language' and 'Runtime' are null in
    that row. Rows with no search hit are left untouched.

    Multi-valued fields (directors, genres, countries, languages) are stored
    as comma-joined strings. 'Age' receives the text after the last ':' of the
    first certificate entry, and 'Runtime' the first runtime as a float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by title that contains a 'Year' column and the columns
        listed above. Modified in place.

    Returns
    -------
    None
    """
    ia = IMDb()
    # total= lets tqdm show a real progress bar; iterrows() has no length.
    for index, row in tqdm(df.iterrows(), total=len(df)):
        results = ia.search_movie(f"{index} ({row['Year']})")
        if not results:
            results = ia.search_movie(index)
        if not results:
            continue

        movie = ia.get_movie(results[0].getID())

        # Column -> zero-argument extractor, evaluated only if the cell is null.
        extractors = {
            'Age': lambda: movie.get('certificates')[0].rsplit(':')[-1],
            'IMDb': lambda: movie['rating'],
            'Directors': lambda: ','.join(director['name'] for director in movie['director']),
            # Joined like the other multi-valued columns; storing the raw list
            # would put a Python list in the cell.
            'Genres': lambda: ','.join(movie['genres']),
            'Country': lambda: ','.join(movie['countries']),
            'Language': lambda: ','.join(movie['languages']),
            'Runtime': lambda: float(movie['runtimes'][0]),
        }
        for column, extract in extractors.items():
            _fill_if_missing(df, index, row, column, extract)

#fill_na(df_nulls5)

# Cleaning after Filling the Nulls

In [None]:
old_shape = df_combined.shape
old_nulls = df_combined.isnull().sum()

In [None]:
# Stitch the five filled batch files into a single merged.csv: the first line of
# nulls_filled1.csv is written as the header, then lines 2 onward of each file are appended.
# NOTE(review): assumes all five files share the same header and each ends with a newline - confirm.
!head -n 1 "../input/streaming-services-nulls-filled/nulls_filled1.csv" > merged.csv
!tail -q -n +2 "../input/streaming-services-nulls-filled/nulls_filled1.csv" >> merged.csv
!tail -q -n +2 "../input/streaming-services-nulls-filled/nulls_filled2.csv" >> merged.csv
!tail -q -n +2 "../input/streaming-services-nulls-filled/nulls_filled3.csv" >> merged.csv
!tail -q -n +2 "../input/streaming-services-nulls-filled/nulls_filled4.csv" >> merged.csv
!tail -q -n +2 "../input/streaming-services-nulls-filled/nulls_filled5.csv" >> merged.csv

In [None]:
# Load the stitched batches keyed by Title, delete the temporary file, then
# overwrite df_combined in place with the values from df_nulls (matched on index
# and column labels by DataFrame.update).
df_nulls = pd.read_csv('./merged.csv', index_col='Title')
!rm ./merged.csv
df_combined.update(df_nulls)

In [None]:
import re

def has_number(x):
    """Return True when the string form of ``x`` contains at least one digit character."""
    for char in str(x):
        if char.isdigit():
            return True
    return False

def safe_float_convert(x):
    """Report whether ``float(x)`` succeeds.

    Returns True if the conversion works and False if it raises ValueError
    or TypeError.
    """
    try:
        float(x)
    except (ValueError, TypeError):
        return False
    return True

def remove_parenthesis(x):
    """Return NaN when the string form of ``x`` is wrapped in parentheses, else ``x``.

    A value counts as wrapped when its string form starts with '(' and ends
    with ')'. Uses startswith/endswith instead of indexing so an empty string
    is returned unchanged rather than raising IndexError.
    """
    text = str(x)
    if text.startswith('(') and text.endswith(')'):
        return np.nan
    return x

def extract_num(x):
    """Return the single run of digits in ``str(x)`` as a float, else ``x`` unchanged.

    Values whose string form contains no digit run, or more than one, are
    returned as they were passed in.
    """
    digit_runs = re.findall(r'\d+', str(x))
    if len(digit_runs) != 1:
        return x
    return float(digit_runs[0])

# Lookup table from non-numeric certificate labels to the numeric value stored
# in the Age column; labels with no usable age map to NaN. It is applied with
# replace() further down in this cell.
age_map = {'PG': 1.0,
           'TV-PG': 1.0,
           'M': 15.0,
           'U': 1.0,
           'G': 1.0,
           'A': 18.0,
           'Atp': 1.0,
           'TV-MA':17.0,
           # NOTE(review): 'UA' maps to 12.0 here but 'U/A' maps to 1.0 below - confirm whether they should agree.
           'UA': 12.0,
           'R': 17.0,
           'S':18.0,
           'Tous publics': 1.0,
           'Tous Public': 1.0,
           'All': 1.0,
           'AL': 1.0,
           'TV-G': 1.0,
           'T': 1.0,
           'TV-Y': 2.0,
           'L': 1.0,
           'Livre': 1.0,
           # NOTE(review): 'IIA'/'IIB' map to 1.0 while 'II' maps to 18.0 - confirm.
           'IIB': 1.0,
           'IIA': 1.0,
           'Btl': 1.0,
           'Tous publics avec avertissement': 1.0,
           'BPjM Restricted': 18.0,
           'Banned': 18.0,
           'Passed': 1.0,
           'Approved': 1.0,
           'C': 18.0,
           'MA': 17.0,
           'B': 1.0,
           'X': 18.0,
           'E': 1.0,
           'KT/EA': 1.0,
           'SU': 1.0,
           'GP': 1.0,
           'I': 1.0,
           'NRC': 12.0,
           'TE': 1.0,
           'KNT/ENA': 16.0,
           'U/A': 1.0,
           'K': 1.0,
           'II': 18.0,
           'TP': 1.0,
           'K-16/13': 16.0,
           # Labels that carry no age information become missing values.
           'SOA': np.nan,
           'nan': np.nan,
           'Not Rated': np.nan,
           'Unrated': np.nan,
           '(January 10, 2011)': np.nan
          }


# Both masks are computed on the raw Age values, before any replacement below.
mask_safefloat = df_combined['Age'].map(safe_float_convert)
mask_hasnumber = df_combined['Age'].map(has_number)

# Digit-free values wrapped in parentheses become NaN.
df_combined.loc[~mask_hasnumber, 'Age'] = df_combined.loc[~mask_hasnumber, 'Age'].apply(remove_parenthesis)

# Translate certificate labels to numbers in the Age column only. Calling
# df_combined.replace(age_map) would also rewrite any cell in the other columns
# that happens to equal one of the labels (e.g. 'A', 'M', 'All').
df_combined['Age'] = df_combined['Age'].replace(age_map)

# Values that were not float-convertible to begin with: pull out their single digit run, if any.
df_combined.loc[~mask_safefloat, 'Age'] = df_combined.loc[~mask_safefloat, 'Age'].apply(extract_num)

# Convert to float, then blank out the values 1996 and 2145 and raise 0 to 1.
# The result is assigned back rather than using replace(..., inplace=True) on the
# column selection, which does not modify df_combined under pandas Copy-on-Write.
df_combined['Age'] = (
    df_combined['Age']
    .astype(float)
    .replace({1996.0: np.nan, 2145.0: np.nan, 0.0: 1.0})
)

df_combined['Age'].value_counts()

In [None]:
old_nulls

In [None]:
df_combined.isnull().sum()

In [None]:
#df_combined.to_csv('streaming_final.csv')

# Exploring the Combined Dataset

In [None]:
path_final = '../input/streaming-services-nulls-filled/streaming_final.csv'
df = pd.read_csv(path_final, index_col='Title')
df.head()

Earliest and Latest Release Year (Movies and Series)

In [None]:
print(df['Year'].min())
print(df['Year'].max())

Number of Series vs Movies

In [None]:
def label_function(val):
    """Build a pie-wedge label from a percentage.

    The first line is ``val`` percent of ``len(df)`` rounded to a whole
    number, the second line is ``val`` rounded to a whole number followed by
    '%'. Reads the notebook-level ``df``.
    """
    absolute = val / 100 * len(df)
    return '{:.0f}\n{:.0f}%'.format(absolute, val)

# Pie chart of the number of rows per Type; wedge labels come from label_function.
fig, ax = plt.subplots(figsize=(12, 5))
type_counts = df.groupby('Type').size()
type_counts.plot(kind='pie', autopct=label_function, ax=ax,
                 textprops={'fontsize': 16, 'fontweight': 'bold'})
ax.set_ylabel('Movies Vs. Series')
plt.show()

In [None]:
def col_2_str(data: pd.DataFrame, col_name: str, sep=',', collocations=False) -> str:
    """Flatten one text column into a single space-delimited string.

    Null entries of ``data[col_name]`` are skipped. In every remaining entry
    ``sep`` is replaced by a space, and a space is appended after the entry.
    With ``collocations=True`` the spaces inside an entry are first turned into
    underscores, so multi-word items stay one whitespace-delimited token.

    Returns the concatenation of all processed entries ('' if there are none).
    """
    pieces = []
    for entry in data[col_name].dropna():
        if collocations:
            entry = entry.replace(' ', '_')
        pieces.append(entry.replace(sep, ' ') + ' ')
    return ''.join(pieces)

def wc_color(*args, **kwargs):
    """Colour callback that ignores its arguments and always returns 'black'."""
    return 'black'

In [None]:
# Word cloud of the Genres column; hyphens are removed from the text first
# (presumably so hyphenated genre names stay a single token).
genres_text = col_2_str(df, 'Genres').replace('-', '')
wc = WordCloud(collocations=False, background_color='white', color_func=wc_color).generate(genres_text)
fig, ax = plt.subplots(figsize=(14, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off");

Most Common Spoken Languages

In [None]:
# Word cloud of the Language column, drawn in black on white.
lang_text = col_2_str(df, 'Language')
wc = WordCloud(collocations=False, background_color='white', color_func=wc_color).generate(lang_text)
fig, ax = plt.subplots(figsize=(14, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off");

Most Common Directors

In [None]:
# Count how often each director appears. Names are underscore-joined by
# col_2_str so split() keeps each full name together, then the underscores are
# turned back into spaces.
dir_text = [
    token.replace('_', ' ')
    for token in col_2_str(df, 'Directors', collocations=True).split()
]
dir_freq = Counter(dir_text)

# Word cloud sized by those counts.
wc = WordCloud(collocations=False, background_color='white').generate_from_frequencies(dir_freq)
fig, ax = plt.subplots(figsize=(14, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off");

Number of Productions per Country

In [None]:
# Tokenise the Country column into individual country names. col_2_str joins
# the words of a name with underscores so split() keeps it whole; the
# underscores are then turned back into spaces.
country_text = [
    token.replace('_', ' ')
    for token in col_2_str(df, 'Country', collocations=True).split()
]

# Count occurrences under a separate name, so country_freq is only ever the
# DataFrame that the plot consumes.
country_counts = Counter(country_text)
country_freq = pd.DataFrame(list(country_counts.items()), columns=['country', 'count'])

# The former mid-cell `country_freq.head(20)` was removed: only a cell's last
# expression is displayed, so it had no effect.
fig = px.choropleth(data_frame=country_freq,
                    locations='country', locationmode='country names',
                    color='count', color_continuous_scale='algae')

fig.show()