In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix Originals dataset from the Kaggle input directory.
data = pd.read_csv('../input/netflix-original-films-imdb-scores/NetflixOriginals.csv')
# End the cell with the bare expression so the notebook renders the preview as
# a formatted table; print() would fall back to the truncated plain-text repr.
data.head()

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple.
data.shape

In [None]:
# Count the missing entries in every column to check whether any nulls exist.
data.isna().sum()

In [None]:
# Preview the first ten rows of the dataset.
data.head(n=10)

In [None]:
# Preview the last ten rows of the dataset.
data.tail(n=10)

In [None]:
# Print a summary of the frame: index range, each column's dtype and
# non-null count, and the memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# List the column labels of the dataset.
data.columns

In [None]:
# Number of films in each genre, most frequent genre first.
genre_column = data['Genre']
genre = genre_column.value_counts()
genre

In [None]:
# Select a single column (all rows) from the dataset; the result is a Series.
data.loc[:, 'Title']

In [None]:
# Select several columns at once by indexing with a list of column names.
selected_columns = ['Title', 'IMDB Score', 'Genre', 'Runtime']
data[selected_columns]

In [None]:
# Copy of the dataset ordered by title (ascending); `data` itself is left unchanged.
sorted_by_title = data.sort_values('Title', ascending=True)

In [None]:
# Label-based selection: .loc[0] fetches the row whose index label is 0,
# wherever that row sits after sorting.
sorted_by_title.loc[0, :]

In [None]:
# Position-based selection: .iloc[0] fetches whichever row comes first in the
# sorted frame, regardless of its index label.
sorted_by_title.iloc[0, :]

In [None]:
# Fetch the first row of the dataset (all columns) by position.
data.iloc[0]

In [None]:
# Fetch one cell by position: second row, first column.
data.iat[1, 0]

# Filtering, Sorting and Grouping Operations

In [None]:
# Titles of the films whose runtime exceeds 100.
data.loc[data['Runtime'] > 100, 'Title']

In [None]:
# Titles of the films with an IMDB score of 6 or lower.
data.loc[data['IMDB Score'] <= 6, 'Title']

In [None]:
# Titles that start with the character 'A'.
starts_with_a = data['Title'].str.startswith('A')
data.loc[starts_with_a, 'Title']

In [None]:
# Titles that contain 'Love' anywhere in the name (case-sensitive).
contains_love = data['Title'].str.contains('Love')
data.loc[contains_love, 'Title']

In [None]:
# Title and IMDB score of the films whose name ends with 'Love'.
ends_with_love = data['Title'].str.endswith('Love')
data.loc[ends_with_love, ['Title', 'IMDB Score']]

In [None]:
# Combine two conditions: runtime above 100 AND IMDB score of at least 7.
is_long = data['Runtime'] > 100
is_well_rated = data['IMDB Score'] >= 7
data[is_long & is_well_rated]

# Sorting Operations

In [None]:
# Rows ordered by runtime, shortest first. This returns a new frame and leaves
# `data` untouched; assign the result to a name if the ordering must be kept.
data.sort_values(by='Runtime')

In [None]:
# Sort on two keys: title ascending, then IMDB score descending within equal titles.
sort_keys = ['Title', 'IMDB Score']
data.sort_values(by=sort_keys, ascending=[True, False])

# Grouping Operations

In [None]:
# Group the rows by each (Language, Genre) pair; `.groups` maps every pair to
# the index labels of the rows that belong to it.
data.groupby(by=['Language', 'Genre']).groups

# Datetime Operations on the Premiere Column

In [None]:
# Convert the 'Premiere' column to datetime values, overwriting the column in
# place so the later cells can pull day/month/year out of it.
# NOTE(review): no explicit `format=` is given, so pandas infers the date
# format — confirm every row parses as intended.
data['Premiere'] = pd.to_datetime(data['Premiere'])

In [None]:
# Month number of each premiere date.
# Uses the vectorised `.dt` accessor instead of a per-row Python lambda; it
# relies on 'Premiere' already holding datetime values (converted earlier).
months = data['Premiere'].dt.month
months

In [None]:
# Day of the month of each premiere date.
# Uses the vectorised `.dt` accessor instead of a per-row Python lambda; it
# relies on 'Premiere' already holding datetime values (converted earlier).
days = data['Premiere'].dt.day
days

In [None]:
# Calendar year of each premiere date.
# Uses the vectorised `.dt` accessor instead of a per-row Python lambda; it
# relies on 'Premiere' already holding datetime values (converted earlier).
years = data['Premiere'].dt.year
years

In [None]:
# Attach the extracted date parts to the dataframe as new columns, in the
# order Day, Month, Year.
for column_name, column_values in (('Day', days), ('Month', months), ('Year', years)):
    data[column_name] = column_values

In [None]:
# Confirm that the Day, Month and Year columns were added to the dataframe.
data.head(n=10)

# Data Visualization

In [None]:
# Number of movies released per year, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 12))
sns.countplot(data=data, x='Year', ax=ax)
ax.set_title('No. of Movies by Year')


In [None]:
# Bar chart of the number of movies per language.
plt.figure(figsize=(12, 12))
language_counts = data['Language'].value_counts()
language_counts.plot.bar()

In [None]:
# Grid of pairwise plots across the dataset's columns.
sns.pairplot(data=data)

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds text columns (e.g. 'Title', 'Genre', 'Language'). Since
# pandas 2.0, DataFrame.corr() no longer skips non-numeric columns by default
# and raises on them, so restrict to numeric columns first. select_dtypes also
# works on older pandas releases.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True)

In [None]:
# Average IMDB score per premiere month, without a confidence band.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=data, x='Month', y='IMDB Score', ci=None, ax=ax)

In [None]:
# Keep only the films that premiered in December (Month == 12), then plot
# their average IMDB score per year without a confidence band.
movies_year = data.query("Month == 12")
sns.lineplot(
    x='Year',
    y='IMDB Score',
    data=movies_year,
    ci=None,
)

In [None]:
# Scatter plot of runtime (y-axis) against IMDB score (x-axis).
sns.scatterplot(data=data, x='IMDB Score', y='Runtime')
# Observation: most runtimes fall between 50 and 150, and most IMDB scores
# fall between 5 and 8.