# <center>Netflix - movies and TV shows</center>
<p> This dataset consists of TV shows and movies available on Netflix as of 2019.  </p>

In [None]:
#Load required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.lines as  lines
from IPython.display import HTML
from wordcloud import WordCloud
import datetime
from tqdm import tqdm
from pandas import option_context  #for changing the display column width of dataframe
import calendar

In [None]:
#Visualization settings
# NOTE(review): sns.set() below is called without a `style` argument, so it presumably
# re-applies seaborn's default style on top of this one — confirm set_style() is still needed.
sns.set_style(style='white')
# Theme overrides applied to every figure drawn after this cell.
sns.set(rc={
    'figure.figsize': (12,7),
    'axes.facecolor': 'white',
    'axes.grid': True,
    'grid.color': '.9',
    'axes.linewidth': 1.0,
    'grid.linestyle': u'-'},
    font_scale=1.5)
# Default colour cycle for the plots.
custom_colors=["#3498db", "#95a5a6","#34495e", "#2ecc71", "#e74c3c"]
sns.set_palette(custom_colors)
# Shared off-white background reused by the figures and word clouds below.
background_color='#fbfbfb'

In [None]:
#Read datafile
# The path is relative to the notebook's working directory.
df_input=pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Quick orientation: first rows, shape, column names and the dtype/null summary.
print ("\nSample dataframe\n")
display(df_input.head(3))
print (f"Dataframe shape: {df_input.shape}\n")
# Render the column names as a bold ordered HTML list.
col_tags="<ol><b>" + "".join([f"<li>{col}</li>" for col in df_input.columns]) + "</b></ol>"
display(HTML("<b><u>Feature names</u></b>"))
display(HTML(col_tags))
print ("Dataset summary")
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df_input.info()

<h3 style="background-color:yellow">
    11 String features and 1 integer feature.
    </h3>

In [None]:
#Data frame for missing values
# Null count per column, largest first.
val=df_input.isnull().sum()
val.sort_values(inplace=True, ascending=False)
df=pd.DataFrame(columns=["Features","Missing values"])
df["Features"]=val.index
df["Missing values"]=val.values
# Keep only the columns that actually contain nulls.
df.drop(df[df["Missing values"]==0].index, inplace=True)

#Create plot for missing values 
fig=plt.figure(figsize=(15,7));

# ax0 receives the bar chart; nothing is plotted on ax1 (grid and tick labels are
# switched off) and the description is placed over that half with fig.text().
ax0=fig.add_subplot(1,2,1)
ax1=fig.add_subplot(1,2,2)
ax1.grid(False)
ax1.set_xticklabels([])
ax1.set_yticklabels([])

fig.patch.set_facecolor(background_color)
ax0.set_facecolor(background_color)
#ax0.set_title("Missing values in data")
ax1.set_facecolor(background_color)

sns.barplot(data=df,x="Features",y="Missing values",ax=ax0);
#rotating the ticklabels in x axis
for tick in ax0.get_xticklabels():
    tick.set_rotation(90)

#Draw line in the middle    
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.5.
l1= lines.Line2D([0.5,0.5],[0.1, 0.9],color='black',lw=0.2,transform=fig.transFigure)
fig.lines.extend([l1])

#Description
fig.text(x=0.6,
        y=0.8,
        s="Missing values in data",
        fontweight='bold',
        fontfamily='serif',
         fontsize=17,
        color='grey')
fig.text(x=0.47,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
         fontsize=17,
        color='grey',
        s='''
        Director,Cast,Country,date_added and rating\n
        features have missing values.\n
        30% Director feature having missing value.
        ''')
plt.show()

# Show the same numbers as a table below the figure.
display(df)

# <center>Exploratory Data Analysis</center>

In [None]:
#Discrete feature analysis
def analyze_discrete_feature(fld,display_graph=True):
    """Print a summary of a discrete Series and optionally plot its distribution.

    Parameters
    ----------
    fld : pandas.Series
        Column to profile. Its ``name`` is used in the figure title.
    display_graph : bool, default True
        When truthy, draw a pie chart and a bar chart of the value counts.

    Returns
    -------
    None
        Everything is printed or displayed; nothing is returned.
    """
    print ("Sample data:\n")
    display(fld.head())
    # Build the frequency table once and reuse it for the printout and both charts.
    counts=fld.value_counts()
    df=pd.DataFrame({"Value": counts.index,
                     "Count": counts.values})
    print ("\nNull value count : ", fld.isnull().sum())
    unique_list=fld.unique().tolist()
    print ("\nUnique values: ", unique_list)
    print ("\n Unique values count: ", len(unique_list))
    print ("\nValue counts:\n",    df)
    if display_graph:
        # Create the figure without a default axes so no extra, empty axes can
        # end up underneath the two panels (2x2 grid, top row only).
        fig=plt.figure(figsize=(25,10),facecolor=background_color)
        ax_pie=fig.add_subplot(2,2,1)
        ax_pie.pie(counts,labels=counts.index,autopct=lambda x: f'{x: .2f}%');

        ax_bar=fig.add_subplot(2,2,2)
        sns.barplot(data=df, x="Value",y="Count",ax=ax_bar)
        ax_bar.set_facecolor(background_color)
        ax_bar.tick_params(axis='x', labelrotation=90)
        # str() keeps the title working for unnamed or non-string Series names.
        fig.suptitle(str(fld.name) + " -distribution");

        plt.show()
        plt.close(fig)
    display(HTML("<h4 style='background-color:#fbfbfb;font-family:serif;font-size:160%'>Discrete variable</h4>"))

In [None]:
#Continuous feature analysis
def analyze_continuous_feature(fld):
    """Print summary statistics of a numeric Series and plot its distribution.

    Parameters
    ----------
    fld : pandas.Series
        Column to profile. Its ``name`` is used in the figure title.

    Returns
    -------
    None
        Everything is printed or displayed; nothing is returned.
    """
    print ("Null value count : ", fld.isnull().sum())
    print ("\n", fld.describe())
    # Create the figure without a default axes so no extra, empty axes can end
    # up underneath the two panels (2x2 grid, top row only).
    fig=plt.figure(figsize=(25,10))
    ax_hist=fig.add_subplot(2,2,1)
    ax_hist.hist(fld)
    ax_box=fig.add_subplot(2,2,2)
    # Pass the data by keyword: the meaning of the first positional argument of
    # sns.boxplot differs between seaborn releases.
    sns.boxplot(x=fld,ax=ax_box)
    # The title used to be the literal text "fld.name + -distribution".
    fig.suptitle(str(fld.name) + " -distribution")
    plt.show()
    plt.close(fig)
    display(HTML("<h4 style='background-color:#fbfbfb;font-family:serif;font-size:160%'>Continuous variable</h4>"))

# 1. show_id

In [None]:
# Show the first few values of the show_id column.
print ("Sample data:\n")
display(df_input.show_id.head())

<h4 style='background-color:#fbfbfb;font-family:serif;font-size:160%'>Just a row identifier for each record.</h4>


# 2. Type

In [None]:
# Profile the `type` column with the discrete-feature helper.
analyze_discrete_feature(df_input.type)

# 3. Title

In [None]:
# Display the title column as-is (no aggregation is performed here).
display(df_input.title)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%">
Title of the show. String discrete feature.</h4>

# 4. Director

In [None]:
print ("\n Sample data: \n")
display (df_input.director.dropna().head())
#Get the series of director names.
director_names=df_input.director.dropna().copy()

# Remove spaces from each credit string (presumably so a name renders as one
# unbroken token in the word cloud — TODO confirm), then count identical strings.
director_names=director_names.apply(lambda x: x.replace(" ",""))
director_names=director_names.value_counts()
#Create a dictionary with director names and count of video contents.
# Series.iteritems() was removed in pandas 2.0; to_dict() builds the same
# {name: count} mapping on every pandas version.
director_counts=director_names.to_dict()

wc=WordCloud(background_color=background_color).generate_from_frequencies(director_counts)

# Single-panel figure showing the word cloud image `wc`, with axes decorations hidden.
fig=plt.figure(figsize=(15,9),facecolor=background_color)
gs= fig.add_gridspec(1,1)
gs.update(wspace=0.5, hspace=0.5)
ax0=fig.add_subplot(gs[0,0])
ax0.set_facecolor(background_color)
ax0.imshow(wc,interpolation='bilinear')
ax0.axis('off')
#Draw a separation line
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.92.
l1=lines.Line2D([0.92,0.92],[0.1,0.9], transform=fig.transFigure,color='black',lw=0.2)
fig.lines.extend([l1])

# Heading and commentary placed to the right of the rule (x >= 0.92 in figure fractions).
fig.text(x=0.93,y=0.7,
         s="List of directors based on their frequency in the data",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        Topmost director Raul Campos, Jan Suter did 18 contents.
        There are 3312 directors with single content.
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()
#Display director names and counts
#display(pd.DataFrame(director_names).rename(columns={'director':'Count'}))
#dir_count=pd.DataFrame(director_names).rename(columns={'director':'Count'})
#display (dir_count.value_counts())

# 5. Cast

In [None]:
# Show the first few non-null cast strings.
print ("Sample data:\n")
display (df_input.cast.dropna().head())

In [None]:
cast=df_input.cast.dropna().copy()
#Convert to lower letters
cast=cast.apply(lambda x : x.lower())

# One entry per credited actor. Names are stripped after splitting on ',':
# the separator is followed by a space, so without stripping "x" and " x"
# were counted as two different actors. Empty tokens are discarded.
cast_names=cast.str.split(',').explode().str.strip()
cast_names=cast_names[cast_names!='']

#Dictionary {actor name: number of titles}, built in a single pass
actor_dict=cast_names.value_counts().to_dict()
#Unique actor names
actor_list=list(actor_dict)
print (f"Number of actors: {len(actor_list)}")

#Counts as a Series, most frequent actor first
actors_count=pd.Series(actor_dict).sort_values(ascending=False)


In [None]:
# Word cloud sized by the per-actor counts in actor_dict.
wc=WordCloud(background_color=background_color).generate_from_frequencies(actor_dict)

# Single-panel figure showing the word cloud image, with axes decorations hidden.
fig=plt.figure(figsize=(15,9),facecolor=None)
gs= fig.add_gridspec(1,1)
gs.update(wspace=0.5, hspace=0.5)
ax0=fig.add_subplot(gs[0,0])
ax0.set_facecolor(background_color)
ax0.imshow(wc,interpolation='bilinear')
ax0.axis('off')
#Draw a separation line
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.92.
l1=lines.Line2D([0.92,0.92],[0.1,0.9], transform=fig.transFigure,color='black',lw=0.2)
fig.lines.extend([l1])

fig.text(x=1,y=0.7,
         s="Actors list",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.4,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        Anupam kher is the top most one who acted in 38 video contents,
        followed by Takahiro sakurai, Shah rukh khan, Om puri etc.\n
        There are 35364 actors in the dataset.
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()
# actors_count is sorted in descending order, so head() lists the most frequent actors.
print ("\nTop most actors based on content count:\n")
display(actors_count.head())

# 6. Country

In [None]:
# Show the first five values of the country column (nulls included).
print ("Sample data:\n")
display(df_input.country.head(5))

In [None]:
country=df_input.country.dropna().copy()
country = country.apply(lambda x: x.lower().strip())

# One entry per country. Each name is stripped after splitting on ',': the
# separator is followed by a space, so without stripping "india" and " india"
# were counted separately. Empty tokens (e.g. from a trailing comma) are
# filtered out here instead of being popped afterwards, which raised KeyError
# whenever no empty token existed.
country_names=country.str.split(',').explode().str.strip()
country_names=country_names[country_names!='']

#Dictionary {country: number of titles}
country_dict=country_names.value_counts().to_dict()
#Unique country names
country_list=list(country_dict)

#Counts as a Series, most frequent country first
country_count=pd.Series(country_dict).sort_values(ascending=False)

In [None]:
# Word cloud sized by the per-country counts in country_dict.
wc=WordCloud(background_color=background_color).generate_from_frequencies(country_dict)


# Single-panel figure showing the word cloud image, with axes decorations hidden.
fig=plt.figure(figsize=(15,9),facecolor=None)
gs= fig.add_gridspec(1,1)
gs.update(wspace=0.5, hspace=0.5)
ax0=fig.add_subplot(gs[0,0])
ax0.set_facecolor(background_color)
ax0.imshow(wc,interpolation='bilinear')
ax0.axis('off')
#Draw a separation line
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.92.
l1=lines.Line2D([0.92,0.92],[0.1,0.9], transform=fig.transFigure,color='black',lw=0.2)
fig.lines.extend([l1])

fig.text(x=0.93,y=0.7,
         s="List of Countries based on their frequency in the data",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        Topmost countries in which Netflix released it's contents
        are United states,India and United Kingdom.
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()

# Full per-country table followed by the number of distinct countries.
print ("\nTop most countries based on count:\n")
display(country_count)
print (f"\nUnique country count: {len(country_dict)}")

# 7. Date_Added

In [None]:
# Print the first few non-null date_added strings.
print ("Sample data:\n", df_input.date_added.dropna().head())


In [None]:
# Parse the non-null "Month day, Year" strings into datetime objects.
raw_dates=df_input.date_added.dropna().copy()
date_added=raw_dates.apply(
    lambda text: datetime.datetime.strptime(text.strip(),"%B %d, %Y"))
df=pd.DataFrame(date_added)
# Calendar year in which each title was added.
df["YearAdded"]=[stamp.year for stamp in df.date_added]
# Summarise and plot the distribution of the year added.
analyze_discrete_feature(df.YearAdded)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    Most of the content was added in 2019. <br>
    88% of the content was added between 2017 and 2020.<br>
    Data is available from 2008 to 2021.
    </h4>

# 8. Release Year

In [None]:
# release_year is numeric but is profiled here as a discrete feature (one bucket per year).
analyze_discrete_feature(df_input.release_year)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    Release years range from 1925 to 2021. <br>
    2018 has the largest number of releases.<br>
    </h4>

# 9. Rating

In [None]:
# Profile the `rating` column with the discrete-feature helper.
analyze_discrete_feature(df_input.rating)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    There are 14 distinct ratings in the data. <br>
    TV-MA (Mature Audience Only) is the most common rating, accounting for 36.8% of the ratings in the data.
    </h4>

# 10. Duration

In [None]:
# Text summary only: the charts are skipped via display_graph=False.
analyze_discrete_feature(df_input.duration,display_graph=False)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    There are 216 distinct duration values in the data.<br>
    '1 Season' is the most frequent duration in the dataset.
    </h4>
    

# 11. Listed in

In [None]:
# Bare expression: the notebook renders the listed_in column as the cell output.
df_input.listed_in

In [None]:
listed_in=df_input.listed_in.copy()
listed_in=listed_in.apply(lambda x: x.strip().lower())

# One entry per category. Each label is stripped after splitting on ',': the
# separator is followed by a space, so without stripping "dramas" and " dramas"
# were counted as two different categories. Empty tokens are discarded.
listed_in_names=listed_in.str.split(',').explode().str.strip()
listed_in_names=listed_in_names[listed_in_names!='']

# Dictionary {category: number of titles}
listed_in_dict=listed_in_names.value_counts().to_dict()
# Unique category labels
listed_in_list=list(listed_in_dict)
print (f"Number of listed_in :{len(listed_in_list)}")

# Counts as a Series, most frequent category first
listed_in_count=pd.Series(listed_in_dict).sort_values(ascending=False)

#Now generate word cloud
# Word sizes follow the per-category counts in listed_in_dict.
wc=WordCloud(background_color=background_color).generate_from_frequencies(listed_in_dict)

# Single-panel figure showing the word cloud image, with axes decorations hidden.
fig=plt.figure(figsize=(15,9),facecolor=None)
gs= fig.add_gridspec(1,1)
gs.update(wspace=0.5, hspace=0.5)
ax0=fig.add_subplot(gs[0,0])
ax0.set_facecolor(background_color)
ax0.imshow(wc,interpolation='bilinear')
ax0.axis('off')
#Draw a separation line
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.92.
l1=lines.Line2D([0.92,0.92],[0.1,0.9], transform=fig.transFigure,color='black',lw=0.2)
fig.lines.extend([l1])

fig.text(x=0.93,y=0.7,
         s="Listed in Types",
         fontsize=20,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=18,
        color='grey',
        s='''
        Topmost Listed In type is International movies.
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()

# 12. Description

In [None]:
# Bare expression: the notebook renders the description column as the cell output.
df_input.description

# <center> Feature Relationships</center>

# 1. Director and Type
#### 1a. Which director directed maximum movie contents in India?

In [None]:
# Rows that have a director, restricted to the three columns used below.
df=df_input.loc[:,["director","country","type"]].dropna(subset=["director"])
# group the data by type which having two values - movie and TV Show
# A scalar key is used (not a one-element list) so that get_group() can keep
# taking a plain string; the list form with a scalar get_group key is deprecated.
grouped=df.groupby('type')
# Let's find the movie group
df_type=grouped.get_group('Movie')
# Select for the country, for e.g India (exact match on the country string)
df_country=df_type[df_type["country"]=="India"]
# Most frequent (director, country, type) row, i.e. the top director for this filter.
display (df_country.value_counts().head(1))

#### 1b. Which director directed maximum TV serials in India?

In [None]:
# Reuses `grouped` built in an earlier cell: select the 'TV Show' rows, keep the
# ones whose country string is exactly "India", and show the most frequent row.
df_type=grouped.get_group('TV Show')
df_country=df_type[df_type["country"]=="India"]
display (df_country.value_counts().head(1))

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    Director 'David Dhawan' directed the most movies in India (9).
    </h4>

# 2. Cast and Type

In [None]:
# Drop the temporary frames from the previous section.
del(df,df_type)

In [None]:
#Create dataframe with cast,country and type
df=df_input.loc[:,["cast","country","type"]].copy()
df.dropna(subset=["cast","country"],inplace=True)

# One row per (cast member, country, type). explode() replaces the previous
# row-by-row pd.concat loop, which copied the growing frame on every iteration
# (quadratic time). Names are stripped because the ',' separator is followed by
# a space, which otherwise splits one actor into two spellings.
df1=(df.assign(cast=df["cast"].str.split(','))
       .explode("cast")
       .assign(cast=lambda d: d["cast"].str.strip())
       .loc[lambda d: d["cast"]!="", ["cast","country","type"]]
       .reset_index(drop=True))

# Filter the values based on country name, e.g India (exact match on the country string)
df_cast=df1[df1["country"]=="India"]
# create groups based on type
grouped=df_cast.groupby("type")
# Lets find data based on Movie type
df_type=grouped.get_group('Movie')
print ("\nCast names based on Movies:\n")
display(df_type.value_counts().head(3))
#Let's find data based in type TV show
df_type=grouped.get_group('TV Show')
print ("\nCast based on TV shows:\n")
display(df_type.value_counts().head(3))
# Release the temporaries created in this cell.
del(df,df1,df_cast,df_type,grouped)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    In India: <br>
    Anupam Kher acted in the most movies (35).<br>
    Rajesh Kava and Nishka Raheja each acted in the most TV shows (3).
    </h4>

# 3. Type and Country

In [None]:
# Rows with both type and country present; country strings are lower-cased.
df=(df_input.loc[:,["type","country"]]
      .dropna()
      .assign(country=lambda d: d["country"].str.lower().str.strip()))

#New dataframe for processing purpose
# One row per (type, country). explode() replaces the previous row-by-row
# pd.concat loop (quadratic time). Names are stripped because the ',' separator
# is followed by a space; empty tokens are dropped.
df1=(df.assign(country=df["country"].str.split(','))
       .explode("country")
       .assign(country=lambda d: d["country"].str.strip())
       .loc[lambda d: d["country"]!="", ["type","country"]]
       .reset_index(drop=True))

#get top ten country names
# Computed from df1 itself so the cell does not depend on state left behind by
# another cell and always uses the same name normalisation as the filter below.
top_ten_countries=df1["country"].value_counts().index.tolist()[:10]

#Filtering data only with top 10 countries
df2=df1[df1["country"].isin (top_ten_countries) ].copy()

In [None]:
fig=plt.figure(figsize=(14,7));

# ax0 receives the count plot; nothing is plotted on ax1 (grid and tick labels
# are switched off) and the description is placed over that half with fig.text().
ax0=fig.add_subplot(1,2,1)
ax1=fig.add_subplot(1,2,2)
ax1.grid(False)
ax1.set_xticklabels([])
ax1.set_yticklabels([])

fig.patch.set_facecolor(background_color)
ax0.set_facecolor(background_color)
ax1.set_facecolor(background_color)

#Plot the countplot
# One bar per country in df2, split by type via hue.
sns.countplot(data=df2,x="country",hue="type",ax=ax0);

#rotating the ticklabels in x axis
for tick in ax0.get_xticklabels():
    tick.set_rotation(90)
    

#Draw line in the middle    
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.5.
l1= lines.Line2D([0.5,0.5],[0.1, 0.9],color='black',lw=0.2,transform=fig.transFigure)
fig.lines.extend([l1])

#text content
fig.text(x=0.47,
        y=0.5,
        fontweight='bold',
        fontfamily='serif',
        fontsize=17,
        color='grey',
        s='''
        Country based listing  
        for the count of Movie and TV shows.
        ''')
plt.show()

In [None]:
# Drop this section's temporaries; df_country is the frame created in an earlier director/type cell.
del(df1,df2,df,df_country)

# 4. Director and Cast

In [None]:
# Keep only rows where both director and cast are present.
df=df_input.loc[:,["director","cast"]].copy()
df.dropna(axis=0,inplace=True);
# Lower-case the whole cast string; strip() here only trims the ends of the full string.
df["cast"]=df["cast"].apply(lambda x: x.lower().strip())
# Empty frame with the target columns, populated by a later cell.
df1=pd.DataFrame(columns=["director","cast"])

In [None]:
#Create a new data frame which contains unique director and unique actor per row.
# df1 is rebuilt from df on every run: the previous loop appended to the df1
# left by another cell, so re-running this cell duplicated every pair, and the
# row-by-row pd.concat was quadratic. Names are stripped because the ','
# separator is followed by a space; empty tokens are dropped.
df1=(df.assign(cast=df["cast"].str.split(','))
       .explode("cast")
       .assign(cast=lambda d: d["cast"].str.strip())
       .loc[lambda d: d["cast"]!="", ["director","cast"]]
       .reset_index(drop=True))
#View the statistics
# Number of titles per (director, cast) pair, most frequent pair first.
df2=df1.value_counts().to_frame()
df2.reset_index(inplace=True)
display(df2.head(15))

In [None]:
#Incase we need to double check the actual data

#
#with option_context('display.max_colwidth',400):
#    display(df_input[df_input['director']=='Cathy Garcia-Molina'].loc[:,['type','director','cast','title']])

In [None]:
# Drop the temporary frames from the previous section.
del(df,df1)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    In this dataset, the most frequent director–actor/actress pairing occurs in 6 titles.
    </h4>

# 5. Cast and Rating

In [None]:
# Rows with both cast and rating present; cast strings are lower-cased.
df = df_input.loc[:,['cast','rating']].copy()
df.dropna(axis=0,inplace=True)
df["cast"]=df["cast"].apply(lambda x: x.lower().strip())
#New dataframe for processing purpose
# One row per (rating, cast member). explode() replaces the previous
# row-by-row pd.concat loop (quadratic time). Names are stripped because the
# ',' separator is followed by a space; empty tokens are dropped.
df1=(df.assign(cast=df["cast"].str.split(','))
       .explode("cast")
       .assign(cast=lambda d: d["cast"].str.strip())
       .loc[lambda d: d["cast"]!="", ["rating","cast"]]
       .reset_index(drop=True))
#New data frame stores the value counts
# The count column is named explicitly: its default label changed between
# pandas versions (0 before 2.0, "count" from 2.0), so renaming column 0
# silently did nothing on newer pandas.
df2=df1.value_counts().rename("counts").reset_index()


In [None]:
#Find which actor acted maximum videos with rating TV-MA
# Show the first five df2 rows whose rating is 'TV-MA'.
display(df2[df2["rating"]=="TV-MA"].head(5))

In [None]:
# Drop the temporary frames from the previous section.
del(df,df1,df2)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
Actor Takahiro Sakurai acted in the highest number of 'TV-MA' rated titles.
    </h4>

# 6. Release year, Year added

In [None]:
# Rows with country, release_year and date_added all present.
df=df_input.loc[:,['country','release_year','date_added']].copy()
df.dropna(axis=0,inplace=True)
# Parse "Month day, Year" strings, then compute the gap in years between
# the year a title was added and the year it was released.
df['added_date']=df['date_added'].apply(lambda x: datetime.datetime.strptime(x.strip(), "%B %d, %Y"))
df['added_year']=df['added_date'].apply(lambda x: x.year)
df['year_difference'] =  df['added_year']-df['release_year']
df.drop(columns=['date_added','added_date','added_year','release_year'],inplace=True)

# One row per (country, year_difference). explode() replaces the previous
# row-by-row pd.concat loop (quadratic time). Names are stripped because the
# ',' separator is followed by a space, which otherwise splits one country
# into two groups; empty tokens are dropped.
# year_difference is coerced to numeric so that it can be aggregated.
df1=(df.assign(country=df["country"].str.split(','))
       .explode("country")
       .assign(country=lambda d: d["country"].str.strip(),
               year_difference=lambda d: pd.to_numeric(d["year_difference"]))
       .loc[lambda d: d["country"]!="", ["country","year_difference"]]
       .reset_index(drop=True))


In [None]:
# df1=df1.groupby(['country'],as_index=False).agg({'year_difference':['mean']})   #for checking only mean
# Aggregations are named as strings: passing the builtins min/max to agg() is
# deprecated in recent pandas. The resulting column labels are unchanged.
df2=df1.groupby(['country'],as_index=False).agg({"year_difference":['min','max','mean']})
# Countries with the largest mean gap between release and addition first.
display(df2.sort_values((('year_difference', 'mean')),ascending=False))

In [None]:
# Drop the temporary frames from the previous section.
del(df,df1,df2)

<h4 style="background-color:#fbfbfb;font-family:serif;font-size:160%;">
    For Liechtenstein, some content was added to Netflix 54 years after its release, which is the largest gap between release year and year added.
    </h4>

# 7. Release count per month

In [None]:
# Rows with both date_added and country present.
df=df_input.loc[:,['date_added','country']].copy()
df.dropna(axis=0,inplace=True)
# Parse each "Month day, Year" string once and reuse the parsed values.
parsed_dates=[datetime.datetime.strptime(text.strip(), "%B %d, %Y")
              for text in df['date_added']]
# Year and month in which each title was added.
df['year_added']=[stamp.year for stamp in parsed_dates]
df['month_added']=[stamp.month for stamp in parsed_dates]
# Only country, year_added and month_added are needed from here on.
df=df.drop(columns=['date_added'])

In [None]:
#Create new dataframe and copy the structure
# Only the column labels of df are copied; the new frame has no rows.
df1=pd.DataFrame(columns=df.columns)

In [None]:
#Get individual country names in each rows
# df1 is rebuilt from df on every run: the previous loop appended to the df1
# left by another cell, so re-running this cell duplicated every row, and the
# row-by-row pd.concat was quadratic. Names are stripped because the ','
# separator is followed by a space, which otherwise hides co-produced titles
# from an exact-name lookup; empty tokens are dropped.
df1=(df.assign(country=df["country"].str.split(','))
       .explode("country")
       .assign(country=lambda d: d["country"].str.strip())
       .loc[lambda d: d["country"]!="", ["country","year_added","month_added"]]
       .reset_index(drop=True))
#df1.month_added=df1.month_added.apply(lambda x: calendar.month_name[x])

In [None]:
#Group based on country
groupby=df1.groupby('country')
#Check for movie statistics for a specific country. eg: India
# get_group() needs an exact match on the country label.
df_india=groupby.get_group('India')

In [None]:
#Aggregation process
# Count the rows in each (year_added, month_added) bucket and expose the count
# as the 'VideoCount' column of a flat frame.
df_india=(df_india
          .groupby(['year_added','month_added'])
          .agg({'month_added':'count'})
          .rename(columns={'month_added':'VideoCount'})
          .reset_index())
print ('Sample dataframe after aggregating the video added count based on year and month')
display (df_india.head())

In [None]:
#Visualize data
fig=plt.figure(figsize=(17,7));

# ax0 receives the line plot; nothing is plotted on ax1 (grid and tick labels
# are switched off) and the description is placed over that half with fig.text().
ax0=fig.add_subplot(1,2,1)
ax1=fig.add_subplot(1,2,2)
ax1.grid(False)
ax1.set_xticklabels([])
ax1.set_yticklabels([])

fig.patch.set_facecolor(background_color)
ax0.set_facecolor(background_color)
ax1.set_facecolor(background_color)

#Plot the line plot: one line per year_added, months on the x axis
sns.lineplot(data=df_india,x="month_added",y="VideoCount",hue="year_added",ax=ax0,palette='tab10').set_title('Video count per month in India')
ax0.legend(loc='center', bbox_to_anchor=(1.1, 0.5))
# Month numbers 1..12 are relabelled with month abbreviations.
ax0.set_xticks(np.arange(1,13))
ax0.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])


#rotating the ticklabels in x axis
for tick in ax0.get_xticklabels():
    tick.set_rotation(90)
    

#Draw line in the middle    
# Coordinates are figure fractions (transform=fig.transFigure): a vertical rule at x=0.55.
l1= lines.Line2D([0.55,0.55],[0.1, 0.9],color='black',lw=0.2,transform=fig.transFigure)
fig.lines.extend([l1])

#text content
fig.text(x=0.55,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=17,
        color='grey',
        s='''
        2016 having very less number of videos added.
        Overall more videos added in 2018.
        2019 December having highest number of videos added.
        ''')
plt.show()

In [None]:
# Drop this section's temporaries (a bracketed target list deletes each listed name).
del([df,df_india,df1])