# Netflix Analytics
**Task**: Analyze the provided Netflix show data and extract insights from it.

![](https://variety.com/wp-content/uploads/2020/05/netflix-logo.png?w=1024)

Start by importing the necessary libraries for data storage and manipulation, graph and visualization generation, and language processing.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import MarkerCluster
from geopy import geocoders
!pip install squarify
import squarify
from textblob import TextBlob

Read the data from the input folder and take a look at it to see what insights can be drawn from it.

In [None]:
# Load the Netflix titles dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/netflix-shows/netflix_titles.csv")
# Leave the DataFrame as the cell's last expression so the notebook renders a preview.
data

<h2>Shows Type and its count</h2> <br>
Show type refers to two categories: Movies and TV Shows. Let us take a look at which of the two Netflix offers more of.


In [None]:
# Count how many titles fall under each show type; np.unique returns the
# distinct labels in sorted order together with their frequencies.
types, counts = np.unique(data["type"], return_counts=True)

# Label the axes first, then draw the bars (kept last so the cell output is
# still the bar container).
plt.ylabel("Count")
plt.xlabel("Show Types")
plt.title("Show Type")
plt.bar(x=types, height=counts)

<h2>Geospatial Data Visualization</h2>
Geospatial data helps us observe which locations the shows come from and which locations have produced more releases. Since there are multiple comma-separated entries in this field, let us split the values and count each entry. Once the processing is done, let us plot the values on a folium map to visualize the geospatial data.


In [None]:
# Split the comma-separated country field into individual country names.
# Empty fragments (e.g. from a leading or trailing comma) are dropped while the
# list is being built, rather than by removing items from a list that is being
# iterated, which silently skips elements.
countries = list(data.country.dropna())
countries_filter = [
    name
    for entry in countries
    for name in (part.strip() for part in entry.split(","))
    if name
]
# Distinct country names (sorted) and the number of titles listing each one.
countries_filter, count = np.unique(countries_filter, return_counts=True)

from geopy.exc import GeopyError
from geopy.geocoders import Nominatim

geocoder = Nominatim(user_agent='Netflix Analytics')

# lat, lon and located_counts are appended to together, so index k of each
# list always describes the same country. Countries that cannot be geocoded
# are skipped in all three, which keeps the markers below correctly paired
# with their counts.
lat = []
lon = []
located_counts = []
for name, n_titles in zip(countries_filter, count):
    try:
        # Geocode once per country (the service is remote and rate limited).
        location = geocoder.geocode(name)
    except GeopyError:
        # Service/network failure for this lookup: treat as not locatable.
        location = None
    if location is None:
        print("Not locatable : ", name)
        continue
    lat.append(location.latitude)
    lon.append(location.longitude)
    located_counts.append(n_titles)

# Plot one clustered circle marker per located country; the popup shows the
# number of titles associated with that country.
world_map = folium.Map(tiles='cartodbpositron')
marker_cluster = MarkerCluster().add_to(world_map)
for latitude, longitude, n_titles in zip(lat, lon, located_counts):
    folium.CircleMarker(
        location=[latitude, longitude],
        radius=5,
        popup=str(n_titles),
        fill=True,
    ).add_to(marker_cluster)
world_map


<h2>Year wise analysis</h2><br>
Netflix was established years ago, but demand for it only grew recently. As time passed since its establishment, more and more shows were uploaded. Let us take a look at when the uploading of content on Netflix peaked.

In [None]:
# Take the second comma-separated field of each non-missing date_added value
# and parse it as the year.
year = [int(added_on.split(",")[1]) for added_on in data['date_added'].dropna()]

# Distinct years (sorted) with the number of titles added in each.
year, count = np.unique(year, return_counts=True)

plt.figure(figsize=(12, 6), dpi=80)
# One tick per year across the observed range.
plt.xticks(np.arange(year.min(), year.max() + 1, 1.0))
plt.plot(year, count)
plt.title("Year wise upload")
plt.ylabel("Count")
plt.xlabel("Year")
plt.show()

<h2>Month wise Analysis</h2><br>
Let us break down the time series analysis into a finer level - Month wise Analysis.

In [None]:
# Extract the month name (first word before the comma) from each non-missing
# date_added value. Splitting on arbitrary whitespace ignores leading spaces,
# so such entries yield their month instead of an empty string, and no value
# has to be dropped blindly afterwards.
month = []
for added_on in data['date_added'].dropna():
    words = added_on.split(",")[0].split()
    if words:  # skip blank entries
        month.append(words[0])

# Distinct month names (alphabetical) with the number of titles added in each.
month, count = np.unique(month, return_counts=True)

# Order the bars by ascending count. A stable sort keeps months with equal
# counts in their alphabetical order.
order = np.argsort(count, kind="stable")
month = month[order]
count = count[order]

plt.figure(figsize=(12, 6), dpi=80)
plt.bar(month, count)
plt.xlabel("Month")
plt.ylabel("Count")
plt.title("Month wise upload")
plt.show()


<h2>Release Year Analysis</h2><br>
Netflix contains a wide variety of movies and TV shows. Let us take a look at the number of shows that Netflix has for each release year.

In [None]:
# Report the earliest and latest release years present in the dataset.
ry = data['release_year']
oldest_release, latest_release = ry.min(), ry.max()
print(oldest_release, latest_release)

# Distinct release years (sorted) with the number of titles per year.
ry, count = np.unique(ry, return_counts=True)

# Treemap: each rectangle's area is proportional to that year's title count.
fig, ax = plt.subplots(nrows=1, figsize=(22, 22))
squarify.plot(sizes=count, label=ry, alpha=0.6, ax=ax)
plt.axis('off')
plt.show()

<h2>Rating Analysis</h2><br>
Let us look at the number of shows per rating to understand what type of shows are in the majority.

In [None]:
rating = data['rating']

# Tally each non-missing rating in a single pass. A dict preserves insertion
# order, so the labels stay in order of first appearance.
rating_totals = {}
for label in rating.dropna():
    rating_totals[label] = rating_totals.get(label, 0) + 1

r_u = list(rating_totals)              # distinct rating labels
c_rating = list(rating_totals.values())  # number of titles per label

plt.figure(figsize=(15, 5), dpi=500)
plt.plot(r_u, c_rating)
plt.show()

<h2>Category Analysis</h2><br>
Movies and Shows are always categorised into certain categories in accordance with their content.

In [None]:
lin = data['listed_in'].dropna()

# Flatten the comma-separated category strings into one list of trimmed
# labels (one entry per title/category pairing, so labels repeat).
lin_unique = [label.strip() for listing in lin for label in listing.split(",")]

# Tally the labels in a single pass; dict insertion order keeps the
# categories in order of first appearance.
category_totals = {}
for label in lin_unique:
    category_totals[label] = category_totals.get(label, 0) + 1

lin_filter = list(category_totals)          # distinct categories
lin_count = list(category_totals.values())  # number of titles per category

# Treemap with one rectangle per category, sized by its title count.
fig, ax = plt.subplots(nrows=1, figsize=(52, 22))
squarify.plot(sizes=lin_count, label=lin_filter)
plt.axis('off')
plt.show()
fig.savefig("temps")

<h2>Show Duration Analysis</h2><br>
Netflix show duration can be measured in minutes as well as in seasons. A season is a group of episodes released periodically until the show ends. Given these mixed units, let us split the duration analysis into minutes and seasons.

In [None]:
# Split the duration column into minute-based and season-based values. The
# leading number of each entry is parsed only when the unit is recognised.
mins = []
seasons = []
for item in data['duration'].dropna():
    if "min" in item:
        mins.append(int(item.split(" ")[0]))
    if "Season" in item:  # also matches the plural "Seasons"
        seasons.append(int(item.split(" ")[0]))

# Averages are taken over every title, not over the distinct duration values,
# so that common durations carry their proper weight.
if mins:
    print("Minimum Episode time : ", min(mins), "mins")
    print("Maximum Episode time : ", max(mins) / 60, "hours")
    print("Average Episode time : {:.2f}".format(np.mean(mins) / 60), "hours")
else:
    print("No durations measured in minutes found")
print()
if seasons:
    print("Minimum Seasons : ", min(seasons), "Season")
    print("Maximum Seasons : ", max(seasons), "Seasons")
    print("Average Seasons : ", int(np.mean(seasons)), "Seasons")
else:
    print("No durations measured in seasons found")

    

<h2>Show sentiment Analysis</h2>
Using the description provided for each show, let us estimate what theme the show is based on - Negative, Positive or Neutral. <br>
<b>Note : </b> <i>This part is not meant to bring down the rating of any show or movie. A negative theme does not necessarily mean that the show or movie is bad.</i>

In [None]:
# Counters for descriptions with positive, neutral and negative polarity.
p_tweets = 0
n_tweets = 0
ng_tweets = 0

descriptions = data['description'].dropna()
for entry in descriptions:
    # Analyse each description once and reuse the score for classification.
    polarity = TextBlob(entry).polarity
    if polarity > 0:
        p_tweets += 1
    elif polarity < 0:
        ng_tweets += 1
    else:
        n_tweets += 1

# Percentages are relative to the descriptions actually analysed (missing
# descriptions are excluded), so the three shares add up to 100%.
analysed = len(descriptions)
if analysed:
    print("Positive Tweets : {:.2f}".format(p_tweets / analysed * 100), "%")
    print("Neutral Tweets  : {:.2f}".format(n_tweets / analysed * 100), "%")
    print("Negative Tweets : {:.2f}".format(ng_tweets / analysed * 100), "%")
else:
    print("No descriptions available for sentiment analysis")

<h2>Country-based querying</h2><br>
So far, we have done all the basic analysis of the dataset. Let us now dig deeper into the data by querying specific data. The parameter we will be using is Country. This helps us understand what content is associated with each country.


In [None]:
# Get the country of interest from the user (surrounding whitespace ignored).
cnt = input("Enter country : ").strip()

# The country column may list several comma-separated countries per title.
# Compare against each whole country name rather than searching for a
# substring, so that e.g. "Niger" does not also select "Nigeria".
# Titles with a missing country never match.
country_names = data['country'].fillna("").str.split(",")
is_match = country_names.apply(
    lambda names: cnt in [name.strip() for name in names]
).astype(bool)
queried_data = data[is_match]

if queried_data.empty:
    print("No titles found for country : ", cnt)
queried_data

With this data, repeat all of the above analytics to get country-wise analytics.

In [None]:
# Count how many of the selected country's titles fall under each show type;
# np.unique returns the distinct labels in sorted order with their frequencies.
types, counts = np.unique(queried_data["type"], return_counts=True)

# Label the axes first, then draw the bars (kept last so the cell output is
# still the bar container).
plt.ylabel("Count")
plt.xlabel("Show Types")
plt.title(f"Show Type Analytics for {cnt}")
plt.bar(x=types, height=counts)

In [None]:
# Take the second comma-separated field of each non-missing date_added value
# in the country subset and parse it as the year.
year = [
    int(added_on.split(",")[1])
    for added_on in queried_data['date_added'].dropna()
]

# Distinct years (sorted) with the number of titles added in each.
year, count = np.unique(year, return_counts=True)

plt.figure(figsize=(12, 6), dpi=80)
# One tick per year across the observed range.
plt.xticks(np.arange(year.min(), year.max() + 1, 1.0))
plt.plot(year, count)
plt.title(f"Year wise upload for {cnt}")
plt.ylabel("Count")
plt.xlabel("Year")
plt.show()

In [None]:
# Extract the month name (first word before the comma) from each non-missing
# date_added value of the country subset. Splitting on arbitrary whitespace
# ignores leading spaces, so such entries yield their month instead of an
# empty string, and no value has to be dropped blindly afterwards (a subset
# may well contain no malformed entry at all).
month = []
for added_on in queried_data['date_added'].dropna():
    words = added_on.split(",")[0].split()
    if words:  # skip blank entries
        month.append(words[0])

# Distinct month names (alphabetical) with the number of titles added in each.
month, count = np.unique(month, return_counts=True)

# Order the bars by ascending count. A stable sort keeps months with equal
# counts in their alphabetical order.
order = np.argsort(count, kind="stable")
month = month[order]
count = count[order]

plt.figure(figsize=(12, 6), dpi=80)
plt.bar(month, count)
plt.xlabel("Month")
plt.ylabel("Count")
plt.title("Month wise upload for "+cnt)
plt.show()


In [None]:
# Report the earliest and latest release years within the country subset.
ry = queried_data['release_year']
oldest_release = ry.min()
latest_release = ry.max()
print("Oldest release : ", oldest_release)
print("Latest release : ", latest_release)

# Distinct release years (sorted) with the number of titles per year.
ry, count = np.unique(ry, return_counts=True)

# Treemap: each rectangle's area is proportional to that year's title count.
fig, ax = plt.subplots(nrows=1, figsize=(22, 22))
squarify.plot(sizes=count, label=ry, alpha=0.6, ax=ax)
plt.title(f"Release year Analysis for {cnt}")
plt.axis('off')
plt.show()

In [None]:
rating = queried_data['rating']

# Tally each non-missing rating in a single pass. A dict preserves insertion
# order, so the labels stay in order of first appearance.
rating_totals = {}
for label in rating.dropna():
    rating_totals[label] = rating_totals.get(label, 0) + 1

r_u = list(rating_totals)              # distinct rating labels
c_rating = list(rating_totals.values())  # number of titles per label

plt.figure(figsize=(15, 5), dpi=500)
plt.title(f"Rating Analysis for {cnt}")
plt.plot(r_u, c_rating)
plt.show()

In [None]:
lin = queried_data['listed_in'].dropna()

# Flatten the comma-separated category strings into one list of trimmed
# labels (one entry per title/category pairing, so labels repeat).
lin_unique = [label.strip() for listing in lin for label in listing.split(",")]

# Tally the labels in a single pass; dict insertion order keeps the
# categories in order of first appearance.
category_totals = {}
for label in lin_unique:
    category_totals[label] = category_totals.get(label, 0) + 1

lin_filter = list(category_totals)          # distinct categories
lin_count = list(category_totals.values())  # number of titles per category

# Treemap with one rectangle per category, sized by its title count.
fig, ax = plt.subplots(nrows=1, figsize=(52, 22))
plt.title(f"Category Analysis for {cnt}")
squarify.plot(sizes=lin_count, label=lin_filter)
plt.axis('off')
plt.show()
fig.savefig("temps")

In [None]:
# Split the duration column of the country subset into minute-based and
# season-based values. The leading number of each entry is parsed only when
# the unit is recognised.
mins = []
seasons = []
for item in queried_data['duration'].dropna():
    if "min" in item:
        mins.append(int(item.split(" ")[0]))
    if "Season" in item:  # also matches the plural "Seasons"
        seasons.append(int(item.split(" ")[0]))

print("Show duration analysis for "+cnt)
# Averages are taken over every title, not over the distinct duration values,
# so that common durations carry their proper weight. A country subset may
# contain only minute-based or only season-based titles, so each group is
# reported only when it has data.
if mins:
    print("Minimum Episode time : ", min(mins), "mins")
    print("Maximum Episode time : ", max(mins) / 60, "hours")
    print("Average Episode time : {:.2f}".format(np.mean(mins) / 60), "hours")
else:
    print("No durations measured in minutes found")
print()
if seasons:
    print("Minimum Seasons : ", min(seasons), "Season")
    print("Maximum Seasons : ", max(seasons), "Seasons")
    print("Average Seasons : ", int(np.mean(seasons)), "Seasons")
else:
    print("No durations measured in seasons found")

    

In [None]:
# Counters for descriptions with positive, neutral and negative polarity.
p_tweets = 0
n_tweets = 0
ng_tweets = 0

descriptions = queried_data['description'].dropna()
for entry in descriptions:
    # Analyse each description once and reuse the score for classification.
    polarity = TextBlob(entry).polarity
    if polarity > 0:
        p_tweets += 1
    elif polarity < 0:
        ng_tweets += 1
    else:
        n_tweets += 1

print("Show sentiment Analysis for "+cnt)
# Percentages are relative to the descriptions actually analysed (missing
# descriptions are excluded), so the three shares add up to 100%. An empty
# selection is reported instead of dividing by zero.
analysed = len(descriptions)
if analysed:
    print("Positive Tweets : {:.2f}".format(p_tweets / analysed * 100), "%")
    print("Neutral Tweets  : {:.2f}".format(n_tweets / analysed * 100), "%")
    print("Negative Tweets : {:.2f}".format(ng_tweets / analysed * 100), "%")
else:
    print("No descriptions available for sentiment analysis")