In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Netflix Originals dataset into a DataFrame and preview the first rows.
movies = pd.read_csv(
    filepath_or_buffer='../input/netflix-original-films-imdb-scores/NetflixOriginals.csv'
)
movies.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
movies.info()

In [None]:
# Summary statistics with pandas' default column selection.
movies.describe()

In [None]:
# Summary restricted to the object-dtype (text) columns.
movies.describe(include='object')

In [None]:
# Convert the textual premiere dates to datetimes (ambiguous dates are read day-first).
movies = movies.assign(
    Premiere=pd.to_datetime(movies['Premiere'], dayfirst=True)
)
movies.head()

In [None]:
# Distinct values found in the Language column.
movies['Language'].unique()

In [None]:
# Distinct values found in the Genre column.
movies['Genre'].unique()

### We have examined what the data contains; now it needs to be transformed so that we can analyze it in more detail.

Converting the date

In [None]:
# Split the parsed premiere date into separate Year / Month / Day columns.
# The vectorised `.dt` accessor replaces the row-by-row `apply(lambda ...)` calls,
# and a missing date now yields NaN instead of a KeyError in the month lookup.
premiere = movies['Premiere']

movies['Year'] = premiere.dt.year

# Month number -> English month name (insertion order is calendar order).
months = {1: 'January', 2: 'February', 3: 'March', 4: 'April',
          5: 'May', 6: 'June', 7: 'July', 8: 'August',
          9: 'September', 10: 'October', 11: 'November', 12: 'December'}
movies['Month'] = premiere.dt.month.map(months)

movies['Day'] = premiere.dt.day

In [None]:
# Remove the raw Premiere column and preview the result.
# `errors='ignore'` keeps this cell re-runnable: executing it a second time no longer
# raises KeyError because the column has already been removed.
movies = movies.drop(columns='Premiere', errors='ignore')
movies.head()

In [None]:
# Number of movies per premiere year.
movies['Year'].value_counts()

In [None]:
# Minimum, mean and maximum IMDB score per year, computed with a single groupby and
# shown as one table (rich display) instead of three separately printed Series.
movies.groupby('Year')['IMDB Score'].agg(['min', 'mean', 'max'])

In [None]:
# Ten most frequent genres (value_counts is already sorted in descending order).
movies['Genre'].value_counts().iloc[:10]

In [None]:
# Ten most frequent languages (value_counts is already sorted in descending order).
movies['Language'].value_counts().iloc[:10]

In [None]:
# Movies rated 8.0 or higher, best first.
# DataFrame.value_counts() on the whole frame only counted identical rows (a column of 1s)
# and dropped rows containing NaN; listing the filtered rows shows the actual movies.
movies[movies['IMDB Score'] >= 8].sort_values('IMDB Score', ascending=False)

In [None]:
# Movies running 120 minutes or longer, longest first.
# DataFrame.value_counts() on the whole frame only counted identical rows (a column of 1s)
# and dropped rows containing NaN; listing the filtered rows shows the actual movies.
movies[movies['Runtime'] >= 120].sort_values('Runtime', ascending=False)

In [None]:
# Movies running 60 minutes or less, shortest first.
# DataFrame.value_counts() on the whole frame only counted identical rows (a column of 1s)
# and dropped rows containing NaN; listing the filtered rows shows the actual movies.
movies[movies['Runtime'] <= 60].sort_values('Runtime')

In [None]:
# How many movies share each distinct IMDB score value.
movies['IMDB Score'].value_counts()

In [None]:
# Correlation heatmap of the numeric features.
# Non-numeric columns are filtered out explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops them silently and raises on string columns instead.
numeric_corr = movies.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.1f', ax=ax)

ax.set_title('Correlation', fontsize=30)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Features', fontsize=15)

plt.show()

In [None]:
# Pairwise relationships between the numeric columns.
# `size` was renamed to `height` in seaborn 0.9; the old keyword is no longer accepted.
sns.pairplot(movies, height=3)
plt.show()

In [None]:
# Number of movies released per year.
sns.set(rc={'figure.figsize': (14, 8)})

# Pass the column by keyword: newer seaborn treats the first positional argument as `data`,
# so a positionally passed Series is no longer interpreted as the x variable.
ax = sns.countplot(x='Year', data=movies, palette='rainbow')
ax.set_title('Number of new movies per year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of movies')
plt.show()

In [None]:
# Distribution of runtimes for every premiere year, one point per movie.
sns.set(rc={'figure.figsize': (16, 8)})

ax = sns.swarmplot(data=movies, x='Year', y='Runtime', palette='rainbow')
ax.set(title='Runtime by years', xlabel='Year', ylabel='Runtime')
plt.show()

In [None]:
# Distribution of IMDB scores for every premiere year, one point per movie.
sns.set(rc={'figure.figsize': (16, 8)})

ax = sns.swarmplot(data=movies, x='Year', y='IMDB Score', palette='rainbow')
ax.set(title='IMDB Score by years', xlabel='Year', ylabel='IMDB Score')
plt.show()

In [None]:
# Number of movies released in each calendar month.
sns.set(rc={'figure.figsize': (14, 8)})

# Keyword arguments instead of a positional Series (newer seaborn reads the first positional
# argument as `data`). `order` puts the bars in calendar order rather than in order of
# first appearance in the frame; `months` is the number -> name dict built with the date columns.
ax = sns.countplot(x='Month', data=movies, order=list(months.values()), palette='rainbow')
ax.set_title('Number of new movies by month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# Number of movies released on each day of the month.
sns.set(rc={'figure.figsize': (14, 8)})

# Pass the column by keyword: newer seaborn treats the first positional argument as `data`,
# so a positionally passed Series is no longer interpreted as the x variable.
ax = sns.countplot(x='Day', data=movies, palette='rainbow')
ax.set_title('Number of new movies by day')
ax.set_xlabel('Day')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# Number of movies per language on a logarithmic count axis.
sns.set(rc={'figure.figsize': (20, 10)})

# Most frequent language first, so the long tail is easy to read.
language_order = movies['Language'].value_counts().index
ax = sns.countplot(x='Language', data=movies, order=language_order, log=True, palette='rainbow')
ax.tick_params(axis='x', rotation=90)

# The bars count movies, not languages; the labels now say so.
ax.set_title('Number of movies per language')
ax.set_xlabel('Language')
ax.set_ylabel('Number of movies (log scale)')
plt.show()

In [None]:
# Runtime of every movie, grouped into one swarm column per genre.
sns.set(rc={'figure.figsize': (18, 10)})

ax = sns.swarmplot(data=movies, x='Genre', y='Runtime', palette='rainbow')
ax.set_title('Runtime by Genres')

# Genre names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
ax.set(xlabel='Genres', ylabel='Runtime')
plt.show()

In [None]:
# IMDB score of every movie, grouped into one swarm column per genre.
sns.set(rc={'figure.figsize': (18, 10)})

ax = sns.swarmplot(data=movies, x='Genre', y='IMDB Score', palette='rainbow')
ax.set_title('IMDB Score by Genres')

# Genre names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
ax.set(xlabel='Genres', ylabel='IMDB Score')
plt.show()

In [None]:
# Number of movies per genre on a logarithmic count axis.
sns.set(rc={'figure.figsize': (20, 10)})

# Most frequent genre first, so the long tail is easy to read.
genre_order = movies['Genre'].value_counts().index
ax = sns.countplot(x='Genre', data=movies, order=genre_order, log=True, palette='rainbow')
ax.tick_params(axis='x', rotation=90)

# The bars count movies, not genres; the labels now say so.
ax.set_title('Number of movies per genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of movies (log scale)')
plt.show()

In [None]:
# Runtime against IMDB score, one point per movie.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='IMDB Score', y='Runtime', data=movies, ax=ax)
ax.set_title('Runtime vs. IMDB Score')
# Explicit True: a bare grid() call toggles the grid, which switches it off when the
# active style (here the seaborn theme applied by sns.set) already draws one.
ax.grid(True)
plt.show()

In [None]:
# Premiere year against IMDB score, one point per movie.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='IMDB Score', y='Year', data=movies, ax=ax)
ax.set_title('Year vs. IMDB Score')
# Explicit True: a bare grid() call toggles the grid, which switches it off when the
# active style (here the seaborn theme applied by sns.set) already draws one.
ax.grid(True)
plt.show()

In [None]:
# Genre against IMDB score, one point per movie (tall figure: one row per genre).
fig, ax = plt.subplots(figsize=(10, 20))
sns.scatterplot(x='IMDB Score', y='Genre', data=movies, ax=ax)
ax.set_title('Genre vs. IMDB Score')
# Explicit True: a bare grid() call toggles the grid, which switches it off when the
# active style (here the seaborn theme applied by sns.set) already draws one.
ax.grid(True)
plt.show()

In [None]:
# Language against IMDB score, one point per movie (tall figure: one row per language).
fig, ax = plt.subplots(figsize=(10, 20))
sns.scatterplot(x='IMDB Score', y='Language', data=movies, ax=ax)
ax.set_title('Language vs. IMDB Score')
# Explicit True: a bare grid() call toggles the grid, which switches it off when the
# active style (here the seaborn theme applied by sns.set) already draws one.
ax.grid(True)
plt.show()

In [None]:
# Runtime per premiere year as a swarm plot (no title or axis labels are set here).
sns.swarmplot(data=movies, x='Year', y='Runtime', palette='rainbow')

### Consider the most common genres:

 - Documentary        159
 - Drama               77
 - Comedy              49
 - Romantic comedy     39
 - Thriller            33
 - Comedy-drama        14
 - Crime drama         11
 - Biopic               9
 - Horror               9
 - Action               7

In [None]:
# Keep only the movies that belong to the three most frequent genres.
top_genre = movies[movies['Genre'].isin(['Documentary', 'Drama', 'Comedy'])]
top_genre.info()

In [None]:
# Mean IMDB score per year for each of the three selected genres.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed.
sns.catplot(x='Year', y='IMDB Score', hue='Genre', data=top_genre, kind='bar', palette='rainbow')

# The frame holds Documentary, Drama and Comedy, so the title names all of them.
plt.title('IMDB Score of Documentary, Drama and Comedy movies by year')
plt.xlabel('Year')
plt.ylabel('IMDB Score')
plt.show()

In [None]:
# Mean runtime per year for each of the three selected genres.
# The rc figure size does not resize the figure-level plot below (it creates its own figure);
# it is kept because it still applies to axes-level plots drawn afterwards.
sns.set(rc={'figure.figsize': (16, 16)})

# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed.
sns.catplot(x='Year', y='Runtime', hue='Genre', data=top_genre, kind='bar', palette='rainbow')

# The frame holds Documentary, Drama and Comedy, so the title names all of them.
plt.title('Runtime of Documentary, Drama and Comedy movies by year')
plt.xlabel('Year')
plt.ylabel('Runtime')
plt.show()

In [None]:
# Spread of IMDB scores per year, split by the three selected genres.
ax = sns.boxplot(x='Year', y='IMDB Score', hue='Genre', palette='rainbow', data=top_genre)
# The frame holds Documentary, Drama and Comedy, so the title names all of them.
ax.set_title('IMDB Score of Documentary, Drama and Comedy movies by year')
ax.set_xlabel('Year')
ax.set_ylabel('IMDB Score')
plt.show()