In [None]:
#Importing libraries

import re
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


In [None]:
#reading the data
data=pd.read_csv("/kaggle/input/tv-shows-on-netflix-prime-video-hulu-and-disney/tv_shows.csv")

In [None]:
data.head()

In [None]:
#looking at the data

data.info()

This dataset lists TV shows together with their ratings and related attributes. The `%` sign must be stripped from the Rotten Tomatoes scores and the `+` sign from the age ratings so that both columns can be treated as numbers. Once the data has been cleaned, we will carry out an overall analysis of the TV shows to understand the trends.

In [None]:
#Converting the percentages to numbers
# Casting to str first keeps this cell safe to re-run: once the column is
# already float, using `.str` directly would raise. Missing values survive
# the round trip (NaN -> 'nan' -> NaN).

data['Rotten Tomatoes'] = (
    data['Rotten Tomatoes'].astype(str).str.rstrip('%').astype('float')
)

In [None]:
#Removing the "+" sign from age rating
# "+" is a regex quantifier, so the literal match is requested explicitly
# instead of relying on the pandas-version-dependent default of `regex`.

data["Age"] = data["Age"].str.replace("+", "", regex=False)

In [None]:
#Conveting it to numeric 

data['Age'] = pd.to_numeric(data['Age'],errors='coerce')

In [None]:
#Final data

data.head()

In [None]:
#Data info

data.info()

In [None]:
# Keep only the rows in which every column has a value
# (kept for later use)

df = data.dropna(axis=0, how='any')

# Age Analysis

In [None]:
# Number of complete rows (no missing column) per age rating
df["Age"].value_counts()

So, 
* 18+ =  376 shows
* 16+ =  359 shows
* 7+  =  177 shows
* 13+ =  7 shows

# Analysis based on Title Names

We will analyze the title names of the TV Shows.

In [None]:
#Taking the values

titles=data["Title"].values

In [None]:
#Joining into a single string

text=' '.join(titles)

In [None]:
len(text)

In [None]:
#How it looks

text[1000:1500]

In [None]:
#Removing the punctuation

text = re.sub(r'[^\w\s]','',text)

In [None]:
len(text)

In [None]:
#Punctuation has been removed

text[1000:1500]

# Title Word Frequency

We will tokenize the words and find out the frequency of each word. This will give us an idea of how many times a word appears in a title, and also give the words which appear maximum in a title.

In [None]:
#Creating the tokenizer
tokenizer = nltk.tokenize.RegexpTokenizer('\w+')

In [None]:
#Tokenizing the text
tokens = tokenizer.tokenize(text)

In [None]:
len(tokens)

In [None]:
#Now the words have been converted to tokens

tokens[1000:1010]

In [None]:
#now we shall make everything lowercase for uniformity
#to hold the new lower case words

words = []

# Looping through the tokens and make them lower case
for word in tokens:
    words.append(word.lower())

In [None]:
#Stop words are generally the most common words in a language.
#English stop words from nltk.

stopwords = nltk.corpus.stopwords.words('english')

In [None]:
words_new = []

#Now we need to remove the stop words from the words variable
#Appending to words_new all words that are in words but not in sw

for word in words:
    if word not in stopwords:
        words_new.append(word)

In [None]:
#The frequency distribution of the words

freq_dist = nltk.FreqDist(words_new)

In [None]:
#Frequency Distribution Plot
plt.subplots(figsize=(20,12))
freq_dist.plot(50)

* The most frequent word is love, so we can say that titles often have romantic/love-oriented names.

* Other frequent words are world, life, adventure, wild, family, story etc.

* These are common elements of day to day life and hence they are common in the titles.

* No stopwords as we removed them.

In [None]:
# Rebuild a single string from the filtered words, skipping purely numeric tokens

res = ' '.join(token for token in words_new if not token.isdigit())

# **Using the regenerated text from the Tokenized words**

In [None]:
#wordcloud

plt.subplots(figsize=(16,10))
wordcloud = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          max_words=100,
                          width=1400,
                          height=1200
                         ).generate(res)


plt.imshow(wordcloud)
plt.title('TV Show Title WordCloud 100 Words')
plt.axis('off')
plt.show()

In [None]:
#wordcloud

plt.subplots(figsize=(16,10))
wordcloud = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          max_words=500,
                          width=1400,
                          height=1200
                         ).generate(res)


plt.imshow(wordcloud)
plt.title('TV Show Title WordCloud 500 Words')
plt.axis('off')
plt.show()

# Key observations-

* Love, world, girl, life, day ,family, secret etc occupy large spaces.

* As they occupy large spaces, this means they have high frequency.

* This shows that large number of tv shows will have some sort of these components.

# Using the raw text we generated earlier

In [None]:
#Let's compare the lengths of the two texts

In [None]:
#Raw text (punctuation removed, stop words still present)

len(text)

In [None]:
#Text rebuilt from the tokenized, filtered words

len(res)

When we rebuilt the text from the tokens, the stop words and purely numeric tokens were removed and all words were converted to lowercase, so the rebuilt text contains fewer words.

This explains why the length of text from tokenized words is much less.

In [None]:

plt.subplots(figsize=(16,10))
wordcloud = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          max_words=100,
                          width=1400,
                          height=1200
                         ).generate(text)


plt.imshow(wordcloud)
plt.title('TV Show Title WordCloud 100 Words')
plt.axis('off')
plt.show()

In [None]:
text[1000:1500]

In [None]:

plt.subplots(figsize=(16,10))
wordcloud = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          max_words=500,
                          width=1400,
                          height=1200
                         ).generate(text)


plt.imshow(wordcloud)
plt.title('TV Show Title WordCloud 500 Words')
plt.axis('off')
plt.show()

A key difference in this case is the appearance of The, Is, the, this etc. These are stop words: in the tokenized-words case we removed them, so they did not appear in the final word cloud. In this case the stop words were not removed, hence the problem.

# Analysis on Numeric Data

Now, let's look at the numeric data.

In [None]:
data.head()

In [None]:
data.info()

In [None]:
#overall year of release analysis

plt.subplots(figsize=(8,6))
sns.distplot(data["Year"],kde=False, color="blue")

Mainly new TV Shows, especially after 2010.

In [None]:
#overall age rating analysis
# histplot replaces the deprecated sns.distplot; with kde=False it draws a
# plain count histogram.

plt.subplots(figsize=(8,6))
sns.histplot(data["Age"], kde=False, color="blue")

Either ~ 5 and above or 15 and above are main trends.

# IMDb ratings

Let us look at the IMDb ratings.

In [None]:
print("TV Shows with highest IMDb ratings are= ")
print((data.sort_values("IMDb",ascending=False).head(20))['Title'])

In [None]:
#barplot of rating
plt.subplots(figsize=(8,6))
sns.barplot(x="IMDb", y="Title" , data= data.sort_values("IMDb",ascending=False).head(20))

In [None]:
print("TV Shows with lowest IMDb ratings are= ")
print((data.sort_values("IMDb",ascending=True).head(20))['Title'])

In [None]:
#barplot of rating
plt.subplots(figsize=(8,6))
sns.barplot(x="IMDb", y="Title" , data= data.sort_values("IMDb",ascending=True).head(20))

In [None]:
#Overall data of IMDb ratings
# One point per row of `data`; the x label now names TV shows, since this
# dataset contains shows rather than movies.

plt.figure(figsize=(16, 6))

sns.scatterplot(data=data['IMDb'])
plt.ylabel("Rating")
plt.xlabel('TV Shows')
plt.title("IMDb Rating Distribution")

The empty spaces are missing data points.

# Rotten Tomatoes Scores

In [None]:
print("TV Shows with highest Rotten Tomatoes scores are= ")
print((data.sort_values("Rotten Tomatoes",ascending=False).head(20))['Title'])

In [None]:
#barplot of rating
plt.subplots(figsize=(8,6))
sns.barplot(x="Rotten Tomatoes", y="Title" , data= data.sort_values("Rotten Tomatoes",ascending=False).head(20))

In [None]:
print("TV Shows with lowest Rotten Tomatoes scores are= ")
print((data.sort_values("Rotten Tomatoes",ascending=True).head(20))['Title'])

In [None]:
#barplot of rating
plt.subplots(figsize=(8,6))
sns.barplot(x="Rotten Tomatoes", y="Title" , data= data.sort_values("Rotten Tomatoes",ascending=True).head(20))

In [None]:
#Overall data of Rotten Tomatoes scores
# One point per row of `data`; the x label now names TV shows, since this
# dataset contains shows rather than movies.

plt.figure(figsize=(16, 6))
sns.scatterplot(data=data['Rotten Tomatoes'])
plt.ylabel("Rotten Tomatoes score")
plt.xlabel('TV Shows')
plt.title("Rotten Tomatoes Score Distribution")

The empty spaces are missing data points.

# Netflix

# For sake of simplicity we will work with just Netflix TV Shows
# If necessary, by changing conditions, we can analyze Hulu, Disney and Amazon as well.

In [None]:
#selecting netflix shows
netflix=data[data["Netflix"]==1]

In [None]:
print("Number of shows on Netflix= ", len(netflix))

In [None]:
plt.subplots(figsize=(8,6))
sns.distplot(netflix["Year"],kde=False, color="blue")

In [None]:
plt.subplots(figsize=(8,6))
sns.distplot(netflix["Age"],kde=False, color="blue")

In [None]:
plt.subplots(figsize=(8,6))
sns.distplot(netflix["IMDb"],kde=False, color="blue")

In [None]:
plt.subplots(figsize=(8,6))
sns.distplot(netflix["Rotten Tomatoes"],kde=False, color="blue")

**Netflix shows based on their ratings on IMDb and Rotten Tomatoes scores.**

In [None]:
print("Netflix Shows with highest IMDb ratings are= ")
print((netflix.sort_values("IMDb",ascending=False).head(10))['Title'])

In [None]:
print("Netflix Shows with lowest IMDb ratings are= ")
print((netflix.sort_values("IMDb",ascending=True).head(10))['Title'])

In [None]:
print("Netflix Shows with highest Rotten Tomatoes score are= ")
print((netflix.sort_values("Rotten Tomatoes",ascending=False).head(10))['Title'])

In [None]:
print("Netflix Shows with lowest Rotten Tomatoes score are= ")
print((netflix.sort_values("Rotten Tomatoes",ascending=True).head(10))['Title'])

# Top Netflix Shows WordCloud based on IMDb rating

In [None]:
#Taking the title and rating data

netflix1=netflix.sort_values("IMDb",ascending=False).head(100)[['Title',"IMDb"]]
netflix1.head()

In [None]:
#Converting it into a tuple

tuples_netflix_imdb = [tuple(x) for x in netflix1.values]

In [None]:
#Looks like this

tuples_netflix_imdb[0:10]

In [None]:
#Making a wordcloud

wordcloud_netflix_imdb = WordCloud(width=1400,height=1200).generate_from_frequencies(dict(tuples_netflix_imdb))

In [None]:
plt.subplots(figsize=(12,12))
plt.imshow(wordcloud_netflix_imdb)
plt.title("TV Shows based on IMDb rating(Top 100)")

# Top Netflix Shows WordCloud based on Rotten Tomatoes score

In [None]:
#Taking the title value and Rotten Tomatoes Score

netflix2=netflix.sort_values("Rotten Tomatoes",ascending=False).head(100)[['Title',"Rotten Tomatoes"]]
netflix2.head()

In [None]:
#Converting to Tuple

tuples_netflix_tomatoes = [tuple(x) for x in netflix2.values]

In [None]:
#Word Cloud generation

wordcloud_netflix_tomatoes = WordCloud(width=1400,height=1200).generate_from_frequencies(dict(tuples_netflix_tomatoes))

In [None]:
plt.subplots(figsize=(12,12))
plt.imshow(wordcloud_netflix_tomatoes)

plt.title("TV Shows based on Rotten Tomatoes Score(Top 100)")

Now we shall analyze the IMDb ratings and the Rotten Tomatoes scores together. We will apply an unsupervised machine learning method, clustering, and perform KMeans clustering using the scikit-learn library.

 # TV Show Clustering based on ratings

In [None]:
#Taking the relevant data

ratings=data[["Title",'IMDb',"Rotten Tomatoes"]]
ratings.head()

In [None]:
len(ratings)

In [None]:
ratings.info()

We see that many values are missing. Clustering needs complete data, so we shall drop the rows that have missing values and work only with the complete rows.

In [None]:
#Removing the rows that have a missing value in any column

ratings = ratings.dropna(axis=0, how='any')

Important thing is that IMDB data is on a scale of 0-10, and Rotten Tomatoes data is on a scale of 0-100. But data on different scales might not lead to proper clustering, so we convert the IMDb data into a scale of 0-100, that is, we shall multiply it by 10.

In [None]:
# Rescale IMDb from 0-10 to 0-100. The values are re-derived from the untouched
# `data` frame (matched on the row index), so re-running this cell does not
# multiply by 10 a second time. assign() returns a fresh frame, which avoids
# writing into a frame derived from a slice of `data`.
ratings = ratings.assign(IMDb=data.loc[ratings.index, "IMDb"] * 10)

In [None]:
#New data

ratings.head()

In [None]:
#Input data

X=ratings[["IMDb","Rotten Tomatoes"]]

In [None]:
X.head()

In [None]:
#Scatterplot of the input data

plt.figure(figsize=(10,6))
sns.scatterplot(x = 'IMDb',y = 'Rotten Tomatoes',  data = X  ,s = 60 )
plt.xlabel('IMDb rating (multiplied by 10)')
plt.ylabel('Rotten Tomatoes') 
plt.title('IMDb rating (multiplied by 10) vs Rotten Tomatoes Score')
plt.show()

KMeans is a simple but popular unsupervised learning algorithm. Here K is the number of clusters the algorithm has to divide the data into. The algorithm starts with an initial group of randomly selected centroids, which are used as the starting points for the clusters, and then performs repeated calculations to optimize the positions of the centroids.

In [None]:
#Importing KMeans from sklearn

from sklearn.cluster import KMeans


Now we calculate the Within-Cluster Sum of Squared Errors (WSS) for different values of k. Next, we choose the k at which the decrease in WSS starts to level off (the "elbow" of the curve).

In [None]:
# WCSS (KMeans inertia) for k = 1..10.
# random_state pins the centroid initialisation so the curve is the same on
# every run; n_init is stated explicitly because its default differs between
# scikit-learn releases.
wcss=[]

for i in range(1,11):
    km=KMeans(n_clusters=i, n_init=10, random_state=42)
    km.fit(X)
    wcss.append(km.inertia_)

In [None]:
#The elbow curve: WCSS against the number of clusters k

plt.figure(figsize=(12,6))

# The series is drawn once, as a red line with a marker at every k
plt.plot(range(1,11),wcss, linewidth=2, color="red", marker ="8")

plt.xlabel("K Value")
plt.xticks(np.arange(1,11,1))
plt.ylabel("WCSS")

plt.show()

In [None]:
#this is known as the elbow graph, the x axis being the number of clusters
#the number of clusters is read off at the elbow, i.e. the bend in the curve
#beyond that point adding more clusters reduces the WCSS only slightly
#4 clusters are used in the cells that follow

In [None]:
#Taking 4 clusters

km=KMeans(n_clusters=4)

In [None]:
#Fitting the input data

km.fit(X)

In [None]:
#predicting the labels of the input data

y=km.predict(X)

In [None]:
#adding the labels to a column named label

ratings["label"] = y

In [None]:
#The new dataframe with the clustering done

ratings.head()

In [None]:
#Scatterplot of the clusters, one colour per cluster label

plt.figure(figsize=(10,6))
cluster_colours = ['green','orange','red',"blue"]
ax = sns.scatterplot(
    data=ratings,
    x='IMDb',
    y='Rotten Tomatoes',
    hue="label",
    palette=cluster_colours,
    legend='full',
    s=60,
)

ax.set_xlabel('IMDb rating(Multiplied by 10)')
ax.set_ylabel('Rotten Tomatoes score')
ax.set_title('IMDb rating(Multiplied by 10) vs Rotten Tomatoes score')
plt.show()

# Analysis

* The cluster at the top are surely the best TV Shows, they have high scores by both IMDb and Rotten Tomatoes.

* The middle two are good and average TV Shows. There are outliers, and in some cases, some TV Shows have been rated high by one, but rated low by the other. 

* The outliers are mainly caused by the fact that, say IMDb rated them well, but Rotten Tomatoes rated them badly.

* The bottom cluster is usually the TV Shows with bad ratings by both, but there are some outliers.

In [None]:
# Print the number of shows assigned to each of the four cluster labels
for cluster_id in range(4):
    print(f'Number of Cluster {cluster_id} TV Shows are=')
    print(len(ratings[ratings["label"] == cluster_id]))
    print("--------------------------------------------")

# Final Results.

The TV Show names printed.

In [None]:
print('TV Shows in cluster 0')

print(ratings[ratings["label"]==0]["Title"].values)

In [None]:
print('TV Shows in cluster 1')

print(ratings[ratings["label"]==1]["Title"].values)

In [None]:
print('TV Shows in cluster 2')

print(ratings[ratings["label"]==2]["Title"].values)

In [None]:
print('TV Shows in cluster 3')

print(ratings[ratings["label"]==3]["Title"].values)

Thank You.