In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



Import and show Data

In [None]:
# Read the Netflix titles CSV into `data`, the DataFrame the later cells work from
data = pd.read_csv(filepath_or_buffer="../input/netflix-shows/netflix_titles.csv")

# Preview the first rows
data.head()

# Step 1 - Insights into the Data:

(a) Generate a dataset by keeping only those rows of the dataset for which the value of "release_year" is less than or equal to 2000 (i.e. removing every row with a later release year). Let's call this data set "old_movie_data".

In [None]:
# Keep only the rows whose release_year is 2000 or earlier
released_by_2000 = data['release_year'].le(2000)
old_movie_data = data[released_by_2000]
# Preview the first rows of the restricted dataset
old_movie_data.head()

(b) Use a boxplot to find and remove the outliers from "release_year". Note that based on the boxplot the values greater than the upper-whisker and lower than the lower-whisker are considered as outliers. Let's call the dataset after removing the outliers "cleaned_data".

In [None]:
# Box plot of release_year for old_movie_data; flier (outlier) points use the "r+" marker style
plt.boxplot(old_movie_data['release_year'], sym="r+")

In [None]:
# Build cleaned_data: drop release_year outliers from old_movie_data using the
# 1.5 * IQR rule (values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are outliers).
# NOTE(review): presumably 1.5 matches the whisker length of the box plot — confirm.
Q1 = old_movie_data['release_year'].quantile(0.25)
Q3 = old_movie_data['release_year'].quantile(0.75)
IQR = Q3 - Q1    # interquartile range

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# The mask is deliberately not called `filter`: that would shadow the builtin
# filter() for every cell executed afterwards in this kernel.
within_fences = (old_movie_data['release_year'] >= lower_fence) & (old_movie_data['release_year'] <= upper_fence)
cleaned_data = old_movie_data.loc[within_fences]


cleaned_data

(c) Compare basic statistical features of "release_year" (median, mean, and mode, standard deviation, variance) in the "data" and "cleaned_data" datasets. Interpret the differences for these statistical values between the "cleaned_data" and "data" datasets. Explain why the statistics of these two datasets are different.

In [None]:
# Summary statistics of release_year for the full and the cleaned dataset
describe_data= data.release_year.describe()
describe_cleanedData= cleaned_data.release_year.describe()
print("Dataset data: \n",describe_data ,"\n \ncleaned_data:\n", describe_cleanedData)

# describe() reports mean, std and the median (its 50% row) but neither the mode
# nor the variance, so the five requested statistics are collected side by side here.
release_year_stats = pd.DataFrame({
    dataset_name: {
        "median": years.median(),
        "mean": years.mean(),
        # mode() can return several values on a tie; the smallest one is reported
        "mode": years.mode().iloc[0],
        "standard deviation": years.std(),
        "variance": years.var(),
    }
    for dataset_name, years in (
        ("data", data["release_year"]),
        ("cleaned_data", cleaned_data["release_year"]),
    )
})
release_year_stats

As can be seen, the mean of "release_year" is lower for cleaned_data than for data. This is expected: cleaned_data only contains titles released in 2000 or earlier, and the outliers of that subset have additionally been removed, so it holds far fewer (and older) titles. The standard deviation also decreased in cleaned_data, which means the data is less scattered: after dropping the outliers, the remaining values lie closer together. The minimum has not changed, but the maximum has decreased, because cleaned_data excludes the later titles and the values outside the whisker limits of the box plot have been deleted. Overall, we can conclude that cleaned_data is more tightly clustered and that its mean is lower than that of the whole dataset.


# Step 2: Basic Visualization

(d) Visualize mean and median of "release_year" in the cleaned dataset. Specify the "rating" values for which the mean and median of "release_year" is maximal and for which it is minimal.

In [None]:
# Mean and median of release_year for every rating in cleaned_data.
# groupby leaves out rows whose rating is missing, so no "nan" category with
# NaN statistics can reach the max/min search or the bar chart.
rating_stats = cleaned_data.groupby('rating')['release_year'].agg(['mean', 'median'])

categories = np.array(rating_stats.index, dtype=str)
x = np.arange(len(categories))

mean_surface = rating_stats['mean'].tolist()
median_surface = rating_stats['median'].tolist()

#find & output min & max
max_mean_surface = rating_stats['mean'].max()
max_median_surface = rating_stats['median'].max()

min_mean_surface = rating_stats['mean'].min()
min_median_surface = rating_stats['median'].min()


def ratings_with(statistic, value):
    """Return every rating whose `statistic` ('mean' or 'median') equals `value`, ties included."""
    return rating_stats.index[rating_stats[statistic] == value].tolist()


# Each extreme is printed together with the rating(s) that attain it
print('Maximum of mean =', max_mean_surface, 'for rating(s)', ratings_with('mean', max_mean_surface))
print('Maximum of median =', max_median_surface, 'for rating(s)', ratings_with('median', max_median_surface))
print('Minimum of mean =', min_mean_surface, 'for rating(s)', ratings_with('mean', min_mean_surface))
print('Minimum of median =', min_median_surface, 'for rating(s)', ratings_with('median', min_median_surface))

#setup graph: one pair of side-by-side bars (mean, median) per rating
width = 0.45
fig, ax_bar = plt.subplots(figsize=(20, 10))
ax_bar.bar(x - width / 2, mean_surface, width, label='Mean', align='center')
ax_bar.bar(x + width / 2, median_surface, width, label='Median', align='center')
ax_bar.set_xticks(x)
ax_bar.set_xticklabels(categories)
ax_bar.set_xlabel('rating')
ax_bar.set_ylabel('release_year')
ax_bar.set_title('Mean and median of release_year per rating (cleaned_data)')
ax_bar.legend()

(e) Plot the distribution of "release_year" in the data and cleaned_data datasets.

In [None]:
# Distribution of release_year in the full dataset.
# The title names the frame that is actually plotted (`data`).
sns.displot(data['release_year'])
plt.title("data")

# Distribution of release_year after restricting to <= 2000 and removing outliers
sns.displot(cleaned_data['release_year'])
plt.title("cleaned_data")

(f) Explore the distribution of "release_year" and "rating" together in the data and cleaned_data datasets. Specify the ranges of "release_year" and "rating" for which the frequency of the data is the highest.

In [None]:


# Label-encode every column of `data` independently: each column is fitted and
# transformed on its own, so its values are replaced by integer codes.
data_encoded_for_data = data.apply(lambda column: LabelEncoder().fit_transform(column))

data_encoded_for_data


In [None]:
# Label-encode every column of `cleaned_data` independently (integer codes per column)
data_encoded_for_cleaned_data = cleaned_data.apply(lambda column: LabelEncoder().fit_transform(column))

data_encoded_for_cleaned_data

In [None]:
# Joint distribution (2-D KDE) of release_year and rating for both datasets.
# The encoded frames provide the numeric rating codes the KDE needs, but label
# encoding also replaced every release_year by an integer code. The real years
# are therefore put back before plotting so the x-axis can be read as years.
for plot_title, encoded_frame, source_frame in (
    ("Dataset: data", data_encoded_for_data, data),
    ("Dataset: cleaned_data", data_encoded_for_cleaned_data, cleaned_data),
):
    # to_numpy() assigns by position, so the result does not depend on how the
    # encoded frame's index relates to the source frame's index
    plot_frame = encoded_frame.assign(release_year=source_frame["release_year"].to_numpy())
    sns.displot(plot_frame, x="release_year", y="rating", kind='kde', cbar=True, rug=True)
    plt.title(plot_title)

(g) Which country has produced the most movies and TV shows?

In [None]:
# Count the non-null `type` entries per `country` value and keep the ten largest counts.
# NOTE(review): every distinct `country` string forms its own group; if a cell can
# list several countries, that combination is counted separately — TODO confirm.
count_type = data.groupby(['country']).count()
most_country = (
    count_type['type']
    .to_frame()
    .reset_index()
    .sort_values(by='type', ascending=False)
    .head(10)
)

# Bar chart of the ten most frequent country values
plt.figure(figsize=(15, 5))
sns.set_context('paper', font_scale=1.2)
sns.barplot(data=most_country, x='country', y='type')
plt.xlabel('Country')
plt.ylabel('# of TV & Movie')
plt.show()