In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Netflix Originals CSV from the Kaggle input directory and preview the first rows.
path = '/kaggle/input/netflix-original-films-imdb-scores/NetflixOriginals.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Column dtypes / non-null counts, followed by the number of missing values per column.
# df.info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.info()
print('-'*40)
print(df.isnull().sum())

In [None]:
# Distribution of IMDB scores: density histogram with KDE (left) and box plot (right).
f, axes = plt.subplots(1, 2, figsize=(18, 7))
ax = axes.ravel()

# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# supported way to draw the same histogram-plus-density-curve figure.
sns.histplot(df['IMDB Score'], kde=True, stat='density', ax=ax[0])
ax[0].set_title('IMDB Score', fontsize=20)
# Pass the data by keyword so the box plot stays horizontal regardless of how a
# positional argument is interpreted by the installed seaborn version.
sns.boxplot(x=df['IMDB Score'], ax=ax[1])

print(df['IMDB Score'].describe())

In [None]:
# Distribution of runtimes: density histogram with KDE (left) and box plot (right).
f, axes = plt.subplots(1, 2, figsize=(18, 7))
ax = axes.ravel()

# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# supported way to draw the same histogram-plus-density-curve figure.
sns.histplot(df['Runtime'], kde=True, stat='density', ax=ax[0])
ax[0].set_title('Runtime', fontsize=20)
# Pass the data by keyword so the box plot stays horizontal regardless of how a
# positional argument is interpreted by the installed seaborn version.
sns.boxplot(x=df['Runtime'], ax=ax[1])

# Categorical value_counts Top 10

In [None]:
# For every categorical / object / bool column: pie chart of its 10 most frequent
# values, followed by the number of distinct and missing values.
object_col = df.select_dtypes(['category', 'object', 'bool']).columns.to_list()

for col in object_col:
    # Count once and reuse the result for both the slice sizes and the slice names.
    top10 = df[col].value_counts().head(10)
    # `labels` in plotly express is a dict that renames columns for display, not a
    # list of slice labels; `names` already supplies the slice labels.
    fig = px.pie(values=top10.values, names=top10.index, title=col)
    fig.show()
    print(f'Unique Values: {len(df[col].unique())}')
    print(f'Missing Values: {df[col].isna().sum()}')

In [None]:
# Summary statistics (count / unique / top / freq) of the object-typed columns.
# The score column is deliberately left named 'IMDB Score': the rest of the notebook
# indexes it by that name. (A bare `df.rename({...})` without `columns=` maps row
# labels rather than column names, so it would not rename the column anyway.)
df.describe(include='O')

1. `Documentary`が多い傾向
2. `English`が多数

# Genre

In [None]:
# Per-genre mean IMDB score and number of scored titles.
x = df.pivot_table(
    index=['Genre'],
    values=['IMDB Score'],
    aggfunc=['mean', 'count'],
)
# Replace the generated column labels with flat, readable names.
x.columns = ['IMDB Score mean', 'IMDB Score count']

# Ten genres with the highest mean score, shaded by value.
highest_mean_genres = x.sort_values(by='IMDB Score mean', ascending=False).head(10)
highest_mean_genres.style.background_gradient(cmap='Blues')

In [None]:
# Ten genres with the lowest mean score, shaded by value.
lowest_mean_genres = x.sort_values(by='IMDB Score mean', ascending=True).head(10)
lowest_mean_genres.style.background_gradient(cmap='Blues')

In [None]:
# Genres that appear exactly once in the dataset, and how many genres there are overall.
genre_counts = df.Genre.value_counts()
only_genre = genre_counts.loc[genre_counts.eq(1)].index

print(f'Only 1 Gerne: {len(only_genre)}')
print(f'Toatl Gerne: {len(genre_counts.index)}')

In [None]:
# Among titles scoring above the median, compare the mean score of genres that
# occur exactly once with the mean score of genres that occur more than once.
imdb = df['IMDB Score']
above_median = imdb > imdb.median()
in_single_genre = df.Genre.isin(only_genre)

df_only_gerne = df[in_single_genre & above_median]
df_gerne = df[~in_single_genre & above_median]

only_genre_score = df_only_gerne['IMDB Score'].mean()
not_only_gerne_score = df_gerne['IMDB Score'].mean()

labels = ['only_genre_score', 'not_only_gerne_score']
values = [only_genre_score, not_only_gerne_score]

fig = px.bar(x=labels, y=values, title='IMDB Score > medain and Gerne value_counts ==1 or >1')
fig.show()

In [None]:
# Among titles scoring below the median, compare the mean score of genres that
# occur exactly once with the mean score of genres that occur more than once.
below_median = df['IMDB Score'] < df['IMDB Score'].median()

df_only_gerne = df[(df.Genre.isin(only_genre)) & below_median]
only_genre_score = df_only_gerne['IMDB Score'].mean()

# Both groups must use the below-median filter so the bars match the chart title;
# the repeated-genre group was previously filtered with `>` (copy-paste slip).
df_gerne = df[~df.Genre.isin(only_genre) & below_median]
not_only_gerne_score = df_gerne['IMDB Score'].mean()

values = [only_genre_score, not_only_gerne_score]
labels = ['only_genre_score', 'not_only_gerne_score']

fig = px.bar(x=labels, y=values, title='IMDB Score < medain and Gerne value_counts ==1 or >1')
fig.show()

Genreのユニークなカウント数が`1`ならばScoreが減少傾向。上下比較してみても評価数が多いほどその平均であるScoreは上昇傾向にある。これはScoreが低ければより顕著に表れている。  
ここで出現率の高い上位の評価点を見てみることにする。

In [None]:
# Mean IMDB score of the five most frequent genres, drawn against the overall
# median / max / min score for reference.
top5_counts_genre = list(df.Genre.value_counts().index[:5])
x = df[df.Genre.isin(top5_counts_genre)]
# Select the score column before aggregating: a frame-wide .mean() raises on the
# text columns in pandas >= 2.0.
x = x.groupby('Genre')[['IMDB Score']].mean()
values = list(x['IMDB Score'])

labels = list(x.index)

dd = pd.DataFrame({'score': values, 'Genre': labels})

bar_ax = sns.barplot(data=dd, x='Genre', y='score')
# Horizontal reference lines for the overall score statistics. axhline takes the
# scalar directly instead of relying on a scalar `y` being broadcast by lineplot.
reference_levels = [
    ('median', df['IMDB Score'].median(), 'tab:orange'),
    ('max', df['IMDB Score'].max(), 'tab:green'),
    ('min', df['IMDB Score'].min(), 'tab:red'),
]
for stat_name, level, color in reference_levels:
    bar_ax.axhline(level, color=color, linestyle='--', label=f'overall {stat_name}')
bar_ax.set_title('Mean IMDB Score of the 5 most frequent genres')
bar_ax.legend();

ただし、評価数が多いジャンルといっても評価点に差異は見られない

# Language

In [None]:
# Share of each Language value among titles whose Language mentions English.
english = df[df.Language.str.contains('English')]
x = english.Language.value_counts()

# `labels` in plotly express is a dict that renames columns for display, not a list
# of slice labels; `names` already supplies the slice labels.
px.pie(values=x.values, names=x.index, title='Language values containing English')

`English`が圧倒的な割合を占めている

In [None]:
# Split titles into exactly-'English' and everything else, then compare the mean
# score and the fraction of all genres that each group covers.
is_english = df.Language == 'English'
english = df[is_english]
not_english = df[~is_english]

score = english['IMDB Score'].mean()
not_score = not_english['IMDB Score'].mean()
print(f'English Score: {score}')
print(f'Not English Score: {not_score}')

all_ge = df.Genre.unique()
eng_ge = english.Genre.unique()
not_eng_ge = not_english.Genre.unique()

# Fraction of the distinct genres that occur in each group.
total_genres = len(all_ge)
print('English Genre counts: ', len(eng_ge) / total_genres)
print('Not English Genre counts: ', len(not_eng_ge) / total_genres)

1. 評価点については`English`とそれ以外との間に差は見られない
2. ジャンルの多数の種類は`English`に属している

In [None]:
# Collapse every Language value that mentions English into the single label 'English'.
mentions_english = df.Language.str.contains('English')
df.loc[mentions_english, 'Language'] = 'English'

In [None]:
# Mean IMDB score per language, highest first.
# Select the score column before aggregating: a frame-wide .mean() raises on the
# text columns in pandas >= 2.0.
x = df.groupby('Language')[['IMDB Score']].mean().sort_values(by='IMDB Score', ascending=False)

px.bar(x=x.index, y=x['IMDB Score'], title='Language vs Score')

In [None]:
# Eight genres with the highest mean IMDB score among English titles.
# Select the score column before aggregating: a frame-wide .mean() raises on the
# text columns in pandas >= 2.0.
x = (
    df[df.Language == 'English']
    .groupby('Genre')[['IMDB Score']]
    .mean()
    .sort_values(by='IMDB Score', ascending=False)[:8]
)

# `labels` in plotly express is a dict that renames columns for display, not a list
# of slice labels; `names` already supplies the slice labels.
px.pie(values=x['IMDB Score'], names=x.index, title='English popular Gerne Top 8 Rate:')

In [None]:
# Eight genres with the highest mean IMDB score among non-English titles.
# Select the score column before aggregating: a frame-wide .mean() raises on the
# text columns in pandas >= 2.0.
x = (
    df[df.Language != 'English']
    .groupby('Genre')[['IMDB Score']]
    .mean()
    .sort_values(by='IMDB Score', ascending=False)[:8]
)

# `labels` in plotly express is a dict that renames columns for display, not a list
# of slice labels; `names` already supplies the slice labels.
px.pie(values=x['IMDB Score'], names=x.index, title='Not English popular Gerne Top 8:')

In [None]:
# Above-median titles in genres that occur more than once: score by genre, coloured by language.
# The boolean mask has to filter the counts; taking `.index` of the mask itself
# returned every genre, so nothing was filtered.
genre_counts = df.Genre.value_counts()
g = genre_counts[genre_counts > 1].index
px.bar(df[(df.Genre.isin(g)) & (df['IMDB Score'] > df['IMDB Score'].median())], x='Genre', y='IMDB Score', color='Language')

`Documentary`は`English`の影響下にある

In [None]:
# Above-median titles in genres that occur more than once: score by language, coloured by genre.
# The boolean mask has to filter the counts; taking `.index` of the mask itself
# returned every genre, so nothing was filtered.
genre_counts = df.Genre.value_counts()
g = genre_counts[genre_counts > 1].index
px.bar(df[(df.Genre.isin(g)) & (df['IMDB Score'] > df['IMDB Score'].median())], x='Language', y='IMDB Score', color='Genre')

In [None]:
# Below-median titles in genres that occur more than once: score by genre, coloured by language.
# The boolean mask has to filter the counts; taking `.index` of the mask itself
# returned every genre, so nothing was filtered.
genre_counts = df.Genre.value_counts()
g = genre_counts[genre_counts > 1].index
px.bar(df[(df.Genre.isin(g)) & (df['IMDB Score'] < df['IMDB Score'].median())], x='Genre', y='IMDB Score', color='Language')

`Comedy`の不評は`English`の影響下にある

In [None]:
# Below-median titles in genres that occur more than once: score by language, coloured by genre.
# The boolean mask has to filter the counts; taking `.index` of the mask itself
# returned every genre, so nothing was filtered.
genre_counts = df.Genre.value_counts()
g = genre_counts[genre_counts > 1].index
px.bar(df[(df.Genre.isin(g)) & (df['IMDB Score'] < df['IMDB Score'].median())], x='Language', y='IMDB Score', color='Genre')

In [None]:
# Bucket the IMDB score into the intervals defined by `bins` and store it as a new column.
bins = [1, 5, 8, 10]
df['score'] = pd.cut(df['IMDB Score'], bins=bins)
language = df.Language.value_counts().index

f, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
ax = axes.ravel()

# Left: number of titles per score bucket.
sns.countplot(data=df, x='score', ax=ax[0])

# Right: score-bucket counts per language, for rows whose Language is in `language`.
rows_with_language = df[df.Language.isin(language)]
language_by_score = pd.crosstab(rows_with_language.Language, rows_with_language.score)
language_by_score.plot(kind='bar', ax=ax[1])

# Runtime

In [None]:
# Runtime against IMDB score, one colour per language.
# Passing the frame plus column names (instead of bare Series) lets plotly title the
# axes and the legend with the column names rather than the generic "x" / "y" / "color".
px.scatter(df,
           x='Runtime',
           y='IMDB Score',
           color='Language',
           title='Runtime and Score corr by Language')

In [None]:
# Correlation coefficient between runtime and IMDB score (off-diagonal entry of the 2x2 matrix).
corr_matrix = np.corrcoef(df.Runtime, df['IMDB Score'])
runtime_score_corr = corr_matrix[0, 1]
print('CORR: ', runtime_score_corr)

In [None]:
# Mean runtime per score bucket.
# Select Runtime before aggregating: a frame-wide .mean() raises on the text columns
# in pandas >= 2.0. observed=False keeps empty score buckets on the axis and pins
# the behaviour independently of the pandas default for categorical groupers.
df.groupby('score', observed=False)[['Runtime']].mean().plot(kind='bar')

In [None]:
# Rank genres by mean runtime, longest first.
# Select Runtime before aggregating: a frame-wide .mean() raises on the text columns
# in pandas >= 2.0.
x = df.groupby('Genre')[['Runtime']].mean().sort_values(by='Runtime', ascending=False)
# First 50 / last 50 genres of the ranking (the two lists overlap if there are
# fewer than 100 genres).
long = list(x[:50].index)
short = list(x[-50:].index)

df_long = df[df.Genre.isin(long)]
px.bar(df_long, x='Genre', y='Runtime', color='score', title='Long Runtime score:')

In [None]:
# Titles whose genre is in the `short` list, stacked by genre and coloured by score bucket.
in_short_genre = df.Genre.isin(short)
df_short = df[in_short_genre]
px.bar(data_frame=df_short, x='Genre', y='Runtime', color='score', title='Short Runtime score:')