# Importing libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file found under /kaggle/input, then print each one
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

In [None]:
# Read the Prime TV shows CSV into prime_df, decoding the file as ISO-8859-1
prime_csv_path = '../input/amazon-prime-tv-shows/Prime TV Shows Data set.csv'
prime_df = pd.read_csv(prime_csv_path, encoding='iso-8859-1')

In [None]:
# Display the freshly loaded DataFrame as this cell's output
prime_df

# Getting an overview of the dataset

In [None]:
# Print the DataFrame summary produced by info()
prime_df.info()

We can see there are 503 values for each column except for IMDb rating which shows missing values. Let's check out the percentage of missing values and get insights about why these values are missing.

In [None]:
# Count the missing entries in every column
prime_df.isna().sum()

In [None]:
# Draw missingno's matrix plot for prime_df; the trailing ';' keeps the returned object's repr out of the output
msno.matrix(prime_df);

In [None]:
# Share of rows whose IMDb rating is missing, expressed as a percentage
missing_rating_mask = prime_df['IMDb rating'].isna()
missing_rating_mask.sum() / len(prime_df) * 100

Nearly 56% of the values are missing, possibly because users do not consider it necessary to leave a rating each time they watch a show.

In [None]:
# Number of rows for each value of the Language column
prime_df['Language'].value_counts()

The largest number of missing IMDb ratings is for English shows, possibly because most of the shows are in English.

In [None]:
# Keep only the rows that have no IMDb rating
rating_is_missing = prime_df['IMDb rating'].isna()
nadf = prime_df.loc[rating_is_missing]

In [None]:
# Number of rows with a missing IMDb rating per language
# (nadf holds only the rows whose IMDb rating is null)
nadf['Language'].value_counts()

In [None]:
# Fraction of rows with a missing IMDb rating, per language.
# Averaging the boolean "rating is missing" mask within each language gives the
# fraction directly and yields 0.0 for a language with no missing ratings.
# Dividing the two value_counts() results instead gives NaN for such a language,
# because it does not appear in the counts taken from the missing-rating rows.
prime_df['IMDb rating'].isnull().groupby(prime_df['Language']).mean()

In [None]:
# Show the data type of each column
prime_df.dtypes

In [None]:
# Show the first 10 rows of the dataset
prime_df.iloc[:10]

# Data cleaning and simplification

* The S.no. column is not required so let's drop it

In [None]:
# Drop the S.no. column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError
# for the already-removed column.
prime_df = prime_df.drop(columns=['S.no.'], errors='ignore')
prime_df

# Data Analysis

In [None]:
# Use seaborn's 'whitegrid' style for the plots drawn after this cell
sns.set_style('whitegrid')

In [None]:
# Bar plot of IMDb rating against year of release, restricted to rows that have a rating
rated_shows = prime_df.dropna(subset=['IMDb rating'])
fig, ax = plt.subplots(figsize=(16, 7))
sns.barplot(data=rated_shows, x='Year of release', y='IMDb rating', ax=ax);

The ratings for shows over the years vary slightly in their values and do not follow any particular trend

In [None]:
# Count plot: one horizontal bar per year of release
fig, ax = plt.subplots(figsize=(16, 7))
sns.countplot(data=prime_df, y='Year of release', ax=ax);

The ratings have not changed much over the years, while the number of shows offered on Amazon Prime has increased at a fast rate. The number of shows before the year 2010 is below 10. The number of shows had increased by 7.5 times by the year 2018, indicating that more shows have been released online, although there seems to be a decrease after 2018. The data for 2020 cannot be considered for the analysis as it does not take all the months into account.

In [None]:
# Count plot: one horizontal bar per value of "No of seasons available"
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=prime_df, y='No of seasons available', palette="Blues_d", ax=ax);

Shows with fewer seasons, mostly 1 or 2, are the most common.

In [None]:
# Count plot: one horizontal bar per language
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=prime_df, y='Language', ax=ax);

* The highest number of shows are in English, followed by Hindi.
This might be because of the large target audience, as the most commonly used language is English, followed by Hindi to a considerable extent. Shows in a language particular to a country or region are far fewer.

In [None]:
# Count plot: one horizontal bar per genre
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=prime_df, y='Genre', ax=ax);

It is pretty evident that the number of shows in the genres Kids, Drama and Comedy is very high compared to the others.

In [None]:
# Count plot: one vertical bar per "Age of viewers" category, on a default-sized figure
fig, ax = plt.subplots()
sns.countplot(data=prime_df, x='Age of viewers', ax=ax);

Most shows offered are for people above 16 years or for all age groups

## Highest rated shows on Amazon Prime

In [None]:
# Order all rows from highest to lowest IMDb rating and show the first ten
ranked_desc = prime_df.sort_values('IMDb rating', ascending=False)
ranked_desc.head(10)

In [None]:
# Genre breakdown of the 30 highest rated shows.
# value_counts() on the Genre column gives a single count per genre, sorted by
# frequency; groupby(['Genre']).count() repeated the per-genre non-null count
# in every remaining column.
top30_rated = prime_df.sort_values(by='IMDb rating', ascending=False).head(30)
top30_rated['Genre'].value_counts()

## Lowest rated shows on Amazon Prime

In [None]:
# Order all rows from lowest to highest IMDb rating and show the first ten
ranked_asc = prime_df.sort_values('IMDb rating')
ranked_asc.head(10)

In [None]:
# Genre breakdown of the 30 lowest rated shows.
# value_counts() on the Genre column gives a single count per genre, sorted by
# frequency; groupby(['Genre']).count() repeated the per-genre non-null count
# in every remaining column.
bottom30_rated = prime_df.sort_values(by='IMDb rating').head(30)
bottom30_rated['Genre'].value_counts()

* Among the highest rated shows, most of them belong to the genre Drama.
* A good percentage of the lowest rated shows are of genre Kids.

# Top 10 highest rated shows of 2020

In [None]:
# Ten highest rated shows released in 2020.
# Rows without an IMDb rating are removed before ranking so that unrated shows
# cannot fill the list when fewer than ten rated shows match the filter
# (sort_values places missing ratings at the end instead of excluding them).
(
    prime_df[prime_df['Year of release'] == 2020]
    .dropna(subset=['IMDb rating'])
    .sort_values(by='IMDb rating', ascending=False)
    .head(10)
)

# Top 5 highest rated English shows and Hindi shows

In [None]:
# Five highest rated English shows.
# Rows without an IMDb rating are removed before ranking so that unrated shows
# cannot fill the list when fewer than five rated shows match the filter.
(
    prime_df[prime_df['Language'] == 'English']
    .dropna(subset=['IMDb rating'])
    .sort_values(by='IMDb rating', ascending=False)
    .head(5)
)

In [None]:
# Five highest rated Hindi shows.
# Rows without an IMDb rating are removed before ranking so that unrated shows
# cannot fill the list when fewer than five rated shows match the filter.
(
    prime_df[prime_df['Language'] == 'Hindi']
    .dropna(subset=['IMDb rating'])
    .sort_values(by='IMDb rating', ascending=False)
    .head(5)
)

# Show with maximum number of seasons

In [None]:
# Select every row whose season count equals the largest value in the column
most_seasons = prime_df['No of seasons available'].max()
prime_df.loc[prime_df['No of seasons available'] == most_seasons]

# Highest rated shows with 5 seasons or more
If you want to watch a long series with 5 or more seasons, then you would probably have to choose from the following shows.

In [None]:
# Ten highest rated shows with at least 5 seasons available.
# Rows without an IMDb rating are removed before ranking so that unrated shows
# cannot fill the list when fewer than ten rated shows match the filter.
(
    prime_df[prime_df['No of seasons available'] >= 5]
    .dropna(subset=['IMDb rating'])
    .sort_values(by='IMDb rating', ascending=False)
    .head(10)
)

# The oldest show available on Amazon Prime

In [None]:
# Select every row whose year of release equals the smallest value in the column
earliest_year = prime_df['Year of release'].min()
prime_df.loc[prime_df['Year of release'] == earliest_year]