In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names_in_folder):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the Netflix titles CSV inside the Kaggle input directory.
netflix_csv_path = '/kaggle/input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv_path)

In [None]:
# Report the size of the loaded frame: row count on the first line, column count on the second.
n_rows, n_cols = df.shape[0], df.shape[1]
print(f"Dataset has {n_rows} rows \nAnd {n_cols} columns")

In [None]:
# Preview the first few rows of the loaded frame as the cell output.
df.head()

# Checking for Missing Values

In [None]:
# Count the nulls in every column once, then keep only the columns that actually have some.
null_counts = df.isnull().sum()
missing_cols_df = null_counts[null_counts > 0].rename("Count of Missing Rows")

# Express each count as a percentage of the total number of rows, rounded to two decimals.
percent_missing_df = (missing_cols_df / len(df) * 100).round(2).rename("Percent of Dataset Missing")

# Put the count and the percentage side by side, one row per incomplete column.
missing_cols_df = pd.concat([missing_cols_df, percent_missing_df], axis=1)
missing_cols_df

## Filling Missing Values
I'm going to write a function that iterates through all of the columns with missing data and fills the missing entries in each column with that column's mode (its most frequent value).

In [None]:
def fill_missing_values(input_df):
    """Return a copy of ``input_df`` with missing values replaced by each column's mode.

    Only columns that contain at least one null are touched. For each of them the
    most frequent non-null value is used (the first one when several values tie),
    and a line naming the column and the fill value is printed.

    Args:
        input_df: DataFrame to impute. It is not modified.

    Returns:
        A new DataFrame with the same shape, index and columns as ``input_df``.
        Columns that are entirely null have no mode and are returned unchanged.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    filled_df = input_df.copy()

    # Compute the per-column null counts once and reuse them for the filter.
    null_counts = filled_df.isnull().sum()
    missing_val_columns = list(null_counts[null_counts > 0].index)

    for col in missing_val_columns:
        modes = filled_df[col].mode()
        if modes.empty:
            # mode() ignores nulls, so an all-null column has nothing to fill with;
            # indexing the empty result would raise IndexError.
            print(f"Skipping {col} column: it has no non-missing values to take a mode from")
            continue
        mode = modes.iloc[0]
        print(f"Filling missing values in {col} column with: {mode}")
        filled_df[col] = filled_df[col].fillna(mode)
    return filled_df

In [None]:
# Rebind df to the frame returned by fill_missing_values (defined above), which fills
# the missing entries of every incomplete column with that column's mode.
df = fill_missing_values(df)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Finding Most Common Movie Ratings


In [None]:
# Ratings with very few titles would be unreadable slivers on a pie chart, so every rating
# whose count is below the threshold is folded into a single 'Other' slice.
threshold = 100
rating_counts = df.groupby('rating')['rating'].count()

# flag the small categories once and reuse the flag for both the sum and the filter
is_rare_rating = rating_counts < threshold
other_count = rating_counts.loc[is_rare_rating].sum()
rating_counts = rating_counts.loc[~is_rare_rating]
rating_counts['Other'] = other_count

The pie chart below shows that the two most common ratings are TV-MA and TV-14.

In [None]:
# Pull every slice slightly away from the centre by the same offset.
slice_offsets = [0.1] * len(rating_counts.index)
_ = rating_counts.plot.pie(explode=slice_offsets)
_ = plt.title("Distribution of Movie Ratings")

# Plotting Lengths of Movies

In [None]:
# getting just the rows that are movies
df_movies = df.loc[df['type'] == "Movie"].copy()

# Strip the literal " min" suffix and parse what is left as a number. Values that are not a
# plain minute count become NaN instead of raising. This matters because fill_missing_values
# imputes with the mode of the whole column, TV shows included, so a movie whose duration was
# missing may have been given a show-style value (the TV-show cell further down reads season
# labels from this same column).
movie_minutes = pd.to_numeric(
    df_movies['duration'].str.replace(" min", "", regex=False),
    errors='coerce',
)

# keeping only the movies whose duration parsed, then storing it as int so that it can be
# split into 10 bins
has_minutes = movie_minutes.notna()
df_movies = df_movies.loc[has_minutes].copy()
df_movies['duration'] = movie_minutes.loc[has_minutes].astype('int')

In [None]:
# Bin the movie durations into 10 intervals, turn the interval labels into plain strings
# (they are used as the bar labels in the plot below), and count the movies in each bin.
durations = (
    pd.cut(df_movies['duration'], 10)
    .astype('string')
    .value_counts()
)

In [None]:
# horizontal bar chart: one bar per duration bin, bar length is the number of movies in it
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(durations.index, durations)
ax.set_title('Frequency of Movie Lengths')
ax.set_xlabel('Frequency')
ax.set_ylabel("Movie Length")
plt.show()

# Creating Chart of the Number of Seasons

In [None]:
# getting just the rows that are TV shows
df_shows = df.loc[df['type'] == "TV Show"].copy()

threshold = 30

# number of shows for each distinct duration label
duration_counts = df_shows['duration'].value_counts().astype('int')

# labels whose count is at or below the threshold get merged into one combined slice
is_small_slice = duration_counts <= threshold
smaller_than_threshold_count = duration_counts.loc[is_small_slice].sum()

# every other label keeps its own slice
duration_counts = duration_counts.loc[~is_small_slice]

# The combined slice is inserted under a placeholder key and then renamed to its display label.
# (Original author's note: assigning the final label directly raised an error saying the label
# was not recognized as a number, hence the two-step insert-then-rename.)
# NOTE(review): the "6+ Seasons" label is fixed text, while which labels get merged is decided
# by the threshold - confirm the two still agree for the current data.
duration_counts["Six+S"] = smaller_than_threshold_count
duration_counts = duration_counts.rename({"Six+S": "6+ Seasons"})

In [None]:
# duration_counts is built from the TV Show rows only, so the title names shows rather than movies.
_ = duration_counts.plot.pie()
_ = plt.title("Distribution of TV Show Season Counts")

# Plotting the Release Years

In [None]:
# number of titles for each release year
titles_per_year = df['release_year'].value_counts()

# There are a lot of years, so only those with more than 30 titles are kept.
# (Original author's note: the years that remain are all 1998 or later.)
release_year_counts = titles_per_year.loc[titles_per_year.gt(30)]

In [None]:
# plotting the number of titles per release year as a vertical bar plot
fig = plt.figure(figsize=(12, 8))
plt.bar(release_year_counts.index, release_year_counts)
# The x axis holds release years and the bar heights are title counts, so the title and
# axis labels describe that instead of movie lengths.
plt.title('Number of Titles per Release Year')
plt.xlabel('Release Year')
plt.ylabel('Number of Titles')
plt.show()

# Plotting Movie Genres
The `listed_in` column can contain several comma-separated genres. By the looks of it, the first genre listed is the closest match for the title, so I'm going to keep only the text before the first comma in order to simplify the analysis.

In [None]:
# new column holding only the first entry of the comma-separated genre list
df['primary_genre'] = df['listed_in'].str.split(",").str.get(0)

# count the titles per primary genre and keep the genres that have more than 30 of them
primary_genre_counts = (
    df['primary_genre']
    .value_counts()
    .loc[lambda genre_totals: genre_totals > 30]
)

In [None]:
# horizontal bar chart: one bar per primary genre, bar length is the number of titles
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(primary_genre_counts.index, primary_genre_counts)
ax.set_title('Most Common Movie Genres')
ax.set_xlabel('Counts')
plt.show()