In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics as sta

To analyze the Netflix data together with the IMDb ratings, I use the latest IMDb datasets available at https://www.imdb.com/interfaces/,
specifically two files: title_basics.tsv and title_rating.tsv

I merge those two datasets with our Netflix data to analyze the ratings of movies and TV shows.

# **set dataframe display**

In [None]:
# Limit how much of a DataFrame gets rendered: at most 50 rows and 10 columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 10

In [None]:
# Netflix catalogue (comma-separated).
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

# The two IMDb dumps are tab-separated; `delimiter` is read_csv's alias for `sep`.
t_basic, t_rating = (
    pd.read_csv(tsv_path, delimiter='\t')
    for tsv_path in (
        '../input/imdb-data/title_basics.tsv',
        '../input/imdb-data/title_rating.tsv',
    )
)

# **'''Movies'''**
'''Clean the data from the IMDb website and keep only the movies'''

In [None]:
# Clean the IMDb title table in one pass:
#   1. drop fully duplicated rows,
#   2. keep only the columns used later on,
#   3. keep only rows whose startYear reads as a plain number.
t_basic = (
    t_basic
    .drop_duplicates()
    .loc[:, ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear']]
    .pipe(lambda frame: frame[frame['startYear'].map(lambda year: str(year).isnumeric())])
)

'''Merge the two dataframes from the IMDb website, then merge the result with the Netflix data'''

In [None]:
# IMDb rows that are cinema or TV movies.
t_movies = t_basic[t_basic['titleType'].isin(['movie', 'tvMovie'])]

# Attach the ratings: both tables are indexed by tconst and joined on that index.
t = t_movies.set_index('tconst').merge(
    t_rating.set_index('tconst'),
    how='inner',
    left_index=True,
    right_index=True,
)

# Match Netflix titles to IMDb primary titles (inner join on the title text).
movies = data.merge(t, left_on='title', right_on='primaryTitle', how='inner')

# Top 10 movies on netflix

sort all movies based on their average rating score

In [None]:
# Title and IMDb score only, best-rated first.
# The selection and the sort are chained so that movies_rating is a fresh,
# independent frame: sorting the column subset with inplace=True raises
# SettingWithCopyWarning.
movies_rating = (
    movies[['title', 'averageRating']]
    .sort_values('averageRating', ascending=False)
)

In [None]:
# Line chart of the first ten rows of movies_rating (title against average rating).
fig, ax = plt.subplots()
ax.plot(
    movies_rating['title'].iloc[:10],
    movies_rating['averageRating'].iloc[:10],
    color='darkred',
)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('movie name')
ax.set_ylabel('average rating')
ax.set_ylim(8.5, 10)
ax.set_title('Top 10 movies on netflix')

# Top 10 directors with the largest number of movies

Make a series that contains every director's name and the number of movies they directed

In [None]:
# Each director cell is a ', '-separated list of names; split it into a list per row.
movies_director = movies.director.str.split(', ')

# One entry per (row, director) pair. explode() replaces the former loop of
# Series.append calls, which was removed in pandas 2.0 and copied the whole
# series on every iteration. Rows without a director stay NaN and are ignored
# by value_counts below.
temp = movies_director.explode()

# Count once and reuse: the ten most frequent directors, and every director name
# ordered by frequency.
director_counts = temp.value_counts()
dire_10 = director_counts.iloc[:10]

dire_list = director_counts.index

In [None]:
# Bar chart: number of matched titles for each of the ten most frequent directors.
plt.figure()
plt.bar(dire_10.index, dire_10, color='salmon')

plt.title('Top 10 director with largest number of movie')
plt.ylabel('Number of movies')  # label was misspelled 'Numer'
plt.xlabel('Name')
plt.xticks(rotation=90)
# Keep the original 0-30 window, but grow it when the tallest bar would be cut off.
# With an empty dire_10 the max is NaN and the window stays at 0-30.
plt.ylim(0, max(30, dire_10.max() + 1))

# Top 10 director who has largest number of high rating movies

Find all movies with rating score above 8

In [None]:
# Director and score of every title that has both values and is rated 8 or higher.
movies_dire_rating = (
    movies
    .loc[:, ['director', 'averageRating']]
    .dropna()
    .loc[lambda rated: rated['averageRating'] >= 8]
)

A function that can calculate the number of movies for a given director name 

In [None]:
def dire_num(name):
    """Count the rows of ``movies_dire_rating`` that credit ``name`` as a director.

    Reads the module-level ``movies_dire_rating`` frame. Each ``director`` cell
    is a ', '-separated list of names; it is split on ', ' (the same separator
    used to build the director list) and ``name`` has to equal one of the
    resulting entries. A plain substring test would also count a director whose
    name merely contains ``name``.

    Parameters
    ----------
    name : str
        Director name to look for.

    Returns
    -------
    int
        Number of matching rows; 0 when there is none.
    """
    credited = movies_dire_rating.director.str.split(', ')
    return sum(name in names for names in credited)

In [None]:
# Number of high-rated titles per director, keeping only directors with at least one.
# The counts are collected in a dict and turned into a Series once: Series.append
# was removed in pandas 2.0, and dire_num is now evaluated a single time per director.
high_rated_counts = {}
for director_name in dire_list:
    count = dire_num(director_name)
    if count != 0:
        high_rated_counts[director_name] = count

# float64 matches the dtype the series had before and keeps the empty case well-defined.
s = pd.Series(high_rated_counts, dtype='float64')

# Ten largest counts, then ascending so that the biggest bar ends up last in a barh plot.
s = s.sort_values(ascending=False).iloc[:10].sort_values()

In [None]:
def color():
    """Return one bar colour per value of the module-level series ``s``.

    'darkred' for values of 5 or more, 'salmon' for values above 3 (and below 5),
    'teal' for everything else. The list follows the order of ``s``.
    """
    def pick(count):
        if count >= 5:
            return 'darkred'
        if count > 3:
            return 'salmon'
        return 'teal'

    return [pick(count) for count in s]


# Horizontal bar chart of the directors in s, coloured by their count.
plt.figure()
plt.barh(s.index, s, color=color())

plt.title('Top 10 director who has largest number of high rating movies')
plt.xlabel('Number of movies')  # label was misspelled 'Numer'
plt.ylabel('Name')
plt.show()

# Top 10 Country

Get the list of all countries

In [None]:
# Each country cell is a ', '-separated list; split it into a list per row.
movies_country = movies.country.str.split(', ')

# One entry per (row, country) pair. explode() replaces the former loop of
# Series.append calls (removed in pandas 2.0, and quadratic in the number of rows).
# Rows without a country stay NaN and are dropped by value_counts.
temp = movies_country.explode()

# Every country name, most frequent first; shown as the cell output.
country_list = temp.value_counts().index
country_list

In [None]:
# Country and score of every title for which both values are present.
movies_country_rating = movies.loc[:, ['country', 'averageRating']].dropna()

Function to calculate a given country's average rating score

In [None]:
def country_avg(name):
    """Average rating, rounded to 2 decimals, of the titles produced in ``name``.

    Reads the module-level ``movies_country_rating`` frame. Each ``country``
    cell is a ', '-separated list; it is split on ', ' (the same separator used
    to build the country list) and ``name`` has to equal one of the entries.
    A substring test would mix up countries whose names contain one another
    (e.g. 'Niger' / 'Nigeria'), and an empty name would match every row.

    Parameters
    ----------
    name : str
        Country name to look for.

    Returns
    -------
    float
        Mean of ``averageRating`` over the matching rows, rounded to two
        decimals; NaN when no row matches (``statistics.mean`` would raise
        on an empty list).
    """
    produced_in = movies_country_rating.country.str.split(', ')
    ratings = [
        rating
        for names, rating in zip(produced_in, movies_country_rating.averageRating)
        if name in names
    ]
    if not ratings:
        return float('nan')
    return round(sta.mean(ratings), 2)

Get each country movies average rating list

In [None]:
# Average rating per country, in the same order as country_list.
s = [country_avg(country_name) for country_name in country_list]

In [None]:
# Label each average with its country and order them from highest to lowest.
tem = pd.Series(s, index=country_list).sort_values(ascending=False)

plot bar chart

In [None]:
# Take the first ten entries of tem and order them ascending for the horizontal bars.
tem = tem.iloc[:10].sort_values(ascending=True)

plt.figure()
plt.barh(tem.index, tem, height=0.6, color='salmon')
plt.xlim(6, 9)
plt.ylabel('Country')
plt.xlabel('Average Rating Score')
plt.title('Top ten country whose movies have highest rating')  # was misspelled 'whoes'
plt.show()

# TV shows

Take out the TV show data from the IMDb data set, then merge it with the Netflix data

In [None]:
# IMDb rows whose type is one of the TV formats.
t_tvshow = t_basic[
    t_basic['titleType'].isin(['tvShort', 'tvSeries', 'tvEpisode', 'tvMiniSeries', 'tvSpecial'])
]

# Attach the ratings: both tables are indexed by tconst and joined on that index.
t_show = t_tvshow.set_index('tconst').merge(
    t_rating.set_index('tconst'),
    how='inner',
    left_index=True,
    right_index=True,
)

# Match Netflix titles to IMDb primary titles, drop exact duplicate rows and keep
# only the Netflix entries typed as 'TV Show'.
tvshow = (
    data
    .merge(t_show, left_on='title', right_on='primaryTitle', how='inner')
    .drop_duplicates()
    .pipe(lambda shows: shows[shows['type'] == 'TV Show'])
)

In [None]:
# Append the IMDb start year to every title ("<title> <startYear>") so that
# entries sharing a title can be told apart.
# The column is rebuilt with assign() rather than written cell by cell through
# tvshow.title[x] = ...: that chained assignment on a filtered frame raises
# SettingWithCopyWarning and does not modify tvshow under pandas copy-on-write.
# NOTE: re-running this cell appends the year a second time.
tvshow = tvshow.assign(
    title=[
        '{} {}'.format(show_title, start_year)
        for show_title, start_year in zip(tvshow['title'], tvshow['startYear'])
    ]
)

# Top 10 Tvshow 

find the top 10 tvshow list

In [None]:
# Title and score of every TV show.
tv_rating = tvshow.loc[:, ['title', 'averageRating']]

# The ten rows with the highest average rating.
tv_10 = tv_rating.sort_values(by='averageRating', ascending=False).head(10)

In [None]:
# Re-order the selection from lowest to highest rating (ascending is the default).
tv_10 = tv_10.sort_values('averageRating')

In [None]:
def color():
    """Return one bar colour per row of the module-level frame ``tv_10``.

    'darkred' for an average rating of exactly 10, 'palevioletred' otherwise.
    """
    return [
        'darkred' if rating == 10 else 'palevioletred'
        for rating in tv_10.averageRating
    ]


# One bar position per row of tv_10. The count is taken from the data instead of
# a hard-coded 10, so barh/yticks still line up when fewer than ten shows are present.
y_pos = np.arange(len(tv_10))

plt.figure()
plt.barh(y_pos, tv_10.averageRating, align='center', color=color(), height=0.4)
plt.yticks(y_pos, tv_10.title)
plt.xlim(8, 11)
plt.title('Top 10 Tvshows on Netflix')
plt.xlabel('Rating score')
plt.show()

# Average Rating for Friends

sort Friends out of the tvshow data

In [None]:
# Ratings of every TV show whose title contains 'Friends', labelled by title.
# A boolean mask replaces the row-by-row Series.append loop (Series.append was
# removed in pandas 2.0); building from arrays keeps repeated titles as
# separate entries.
is_friends = tvshow.title.str.contains('Friends', regex=False, na=False)
a = pd.Series(
    tvshow.averageRating[is_friends].to_numpy(),
    index=tvshow.title[is_friends].to_numpy(),
    dtype='float64',
)

# Skip the first eight matches (positional), then order the rest by label.
# NOTE(review): the offset 8 is data-dependent — confirm which rows it is meant to drop.
a = a.iloc[8:].sort_index()

In [None]:
# Hand-written x tick labels: a year at a few positions, empty strings elsewhere.
# NOTE(review): plt.xticks below pairs these labels one-to-one with a.index, so this
# list presumably has to contain exactly len(a) entries — confirm whenever the input
# data changes.
year = [1977, '', '', '', '', '', '', '', '', '', '', 1999, '', '', '', '', '', '', '', '', '', '', '', '', 2011,'', '', '', '', '', '', '', '', 2020, '']

# Line chart of the series a (values against its index labels).
plt.figure()
plt.plot(a, color='indianred')
# Put a tick at every index label of a and show the sparse year labels instead.
plt.xticks(a.index, year)
plt.ylim(4, 10)
plt.xlabel('year')
plt.ylabel('average score')
plt.title('Rating score for Friends every season')

In [None]:
# Re-order the Friends ratings from highest to lowest and draw them as horizontal bars.
a = a.sort_values(ascending=False)

fig, ax = plt.subplots()
ax.barh(a.index, a, color='indianred', height=0.4)
ax.set_title('Rating score for Friends every season')

# Friends 2018 has the highest score