# Detailed EDA to generate insights about the first year of the CTDS (Chai Time Data Science) show.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Lift pandas' display truncation so every column and every row is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load Episodes.csv into the `episode` frame and preview its first rows.
EPISODES_CSV = "/kaggle/input/chai-time-data-science/Episodes.csv"
episode = pd.read_csv(EPISODES_CSV)
episode.head()

In [None]:
# Load Description.csv into the `desc` frame and preview its first rows.
DESCRIPTION_CSV = "/kaggle/input/chai-time-data-science/Description.csv"
desc = pd.read_csv(DESCRIPTION_CSV)
desc.head()

In [None]:
# Show the dtype pandas inferred for each column of `episode`.
episode.dtypes

In [None]:
# Parse both date columns as datetimes, then keep only the calendar-date part.
for date_col in ('release_date', 'recording_date'):
    parsed = pd.to_datetime(episode[date_col])
    episode[date_col] = parsed.dt.date

In [None]:
# Count the missing values in each column of `episode`.
episode.isna().sum()

In [None]:
# Display the row at position 2 in full, as a sample record.
episode.iloc[2]

In [None]:
# Pie chart of the `heroes_gender` column.
fig = px.pie(
    data_frame=episode,
    names='heroes_gender',
    title='Speaker Gender',
)
fig.show()

In [None]:
# Pie chart of the `flavour_of_tea` column.
fig = px.pie(
    data_frame=episode,
    names='flavour_of_tea',
    title='Preferred Tea',
)
fig.show()

In [None]:

# Donut chart (hole=0.5) built from the `heroes_location` column.
location_trace = go.Pie(labels=episode['heroes_location'], hole=.5)
fig = go.Figure(data=[location_trace])
fig.update_layout(title="Location of Speakers")
fig.show()

In [None]:
# Donut chart (hole=0.5) built from the `heroes_nationality` column.
nationality_trace = go.Pie(labels=episode['heroes_nationality'], hole=.5)
fig = go.Figure(data=[nationality_trace])
fig.update_layout(title="Nationality of Speakers")
fig.show()

In [None]:
# Pie chart built from the `recording_time` column, with one slice pulled out.
# NOTE(review): `pull` is a fixed four-entry list; confirm that the slice it
# offsets is the intended one for this data.
recording_trace = go.Pie(
    labels=episode['recording_time'],
    pull=[0, 0, 0.2, 0],
)
fig = go.Figure(data=[recording_trace])
fig.update_layout(title="Recording Time")
fig.show()

In [None]:
# Bar chart keyed on the `heroes_gender` column (no explicit y is supplied).
fig = px.bar(
    data_frame=episode,
    x='heroes_gender',
    title="Gender of Speakers",
)
fig.show()

In [None]:
# Bar chart keyed on the `category` column (no explicit y is supplied).
fig = px.bar(
    data_frame=episode,
    x='category',
    title="Industry Category of Speakers",
)
fig.show()

# What is Average Watch Duration?
Average watch duration is the total watch time of your video divided by the total number of video plays, including replays. This metric measures your video’s ability to engage viewers. If your video can’t engage viewers, they’ll bounce from your video quickly, leaving you with an unimpressive average view duration. But if your video can engage viewers, your average view duration and total watch time will increase at the same time, boosting your search and recommendations rankings. You can also find this metric in your watch time report. (source: https://blog.hubspot.com/marketing/youtube-analytics)

In [None]:
# The 20 episodes with the highest YouTube average watch duration.
topavgwatch = episode.nlargest(n=20, columns=['youtube_avg_watch_duration'])


In [None]:
# One bar per speaker in `topavgwatch`, coloured by category; episode duration
# and impression views are added to the hover tooltip.
fig = px.bar(
    data_frame=topavgwatch,
    x='heroes',
    y='youtube_avg_watch_duration',
    color='category',
    hover_data=['episode_duration', 'youtube_impression_views'],
    height=400,
    title="Top 20 speakers with best youtube_avg_watch_duration time",
)
fig.show()

In [None]:
# The 10 episodes with the most Apple listeners.
apple = episode.nlargest(n=10, columns=['apple_listeners'])


In [None]:

# Plot Apple listeners against Spotify listeners for the `apple` subset,
# with one line per category.
fig = px.line(
    data_frame=apple,
    x='spotify_listeners',
    y='apple_listeners',
    color='category',
    title='Compare Spotify Vs Apple listeners',
)
fig.show()

In [None]:
# The 20 episodes with the most YouTube views.
largestviews = episode.nlargest(n=20, columns=['youtube_views'])

In [None]:
# Subscribers vs. views for the `largestviews` subset; marker size follows the
# average watch duration and the CTR is added to the hover tooltip.
fig = px.scatter(
    data_frame=largestviews,
    x="youtube_subscribers",
    y="youtube_views",
    color="heroes",
    size='youtube_avg_watch_duration',
    hover_data=['youtube_ctr'],
    title="Top 20 Speakers based on YouTube views",
)
fig.show()

# Impressions Click-Through Rate
Impressions click-through rate measures your video’s ability to prompt people to watch your video after seeing it on their homepage, recommendation section, or trending section. A high click-through rate means your title was compelling and your video’s topic resonates with a lot of relevant audiences on YouTube.

In [None]:
# The 20 episodes with the highest YouTube click-through rate.
largestctr = episode.nlargest(n=20, columns=['youtube_ctr'])

In [None]:
# CTR per speaker for the `largestctr` subset, with the like count shown as the
# bar label.
fig = px.bar(
    data_frame=largestctr,
    x='heroes',
    y='youtube_ctr',
    text='youtube_likes',
    title="Top 20 Speakers based on YouTube CTR",
)
# Format the labels with the '.2s' template and place them outside the bars.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [None]:

# Horizontal bars: Apple average listen duration for every speaker.
fig = px.bar(
    data_frame=episode,
    x="apple_avg_listen_duration",
    y="heroes",
    orientation='h',
    title='Apple avg listen duration for Speakers',
)
fig.show()

In [None]:
# Stacked bars per speaker: Apple listeners first, Spotify listeners second.
listener_series = [
    ('Apple Listeners', 'apple_listeners'),
    ('Spotify Listeners', 'spotify_listeners'),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=episode['heroes'], y=episode[column])
    for label, column in listener_series
])

fig.update_layout(
    barmode='stack',
    title="Comparison of Apple and Spotify Listeners for Speakers",
    xaxis_tickangle=45,
)
fig.show()

In [None]:
# Grouped bars per speaker in `largestviews`, one bar series per YouTube metric.
youtube_metrics = [
    ('Likes', 'youtube_likes'),
    ('Dislikes', 'youtube_dislikes'),
    ('Comments', 'youtube_comments'),
    ('Subscribers', 'youtube_subscribers'),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=largestviews['heroes'], y=largestviews[column])
    for label, column in youtube_metrics
])

fig.update_layout(
    barmode='group',
    title="Comparison of YouTube metrics for Top Speakers based on YouTube Views",
)
fig.show()

In [None]:
# Sunburst over location -> episode duration -> watch hours, with sectors sized
# by YouTube views and coloured by speaker.
sunburst_levels = ['heroes_location', 'episode_duration', 'youtube_watch_hours']
fig = px.sunburst(
    data_frame=largestviews,
    path=sunburst_levels,
    values='youtube_views',
    color='heroes',
    title='Hierarchy of Video Duration',
)
fig.show()

In [None]:
# Episode duration (log-scaled x axis) against average watch duration; marker
# size follows YouTube views, capped at size_max=60.
fig = px.scatter(
    data_frame=largestviews,
    x="episode_duration",
    y="youtube_avg_watch_duration",
    size="youtube_views",
    color="heroes",
    hover_name="category",
    log_x=True,
    size_max=60,
    title="Compare Episode duration and YouTube avg watch duration",
)
fig.show()

In [None]:
# Narrow the top-20 subset to the 5 most-viewed episodes and display them.
largest5 = largestviews.nlargest(n=5, columns=['youtube_views'])
largest5

# WordCloud to understand the most common words used during the interview by the top speakers based on YouTube views.

In [None]:
#WordCloud for Jeremy
epi27 = pd.read_csv("/kaggle/input/chai-time-data-science/Cleaned Subtitles/E27.csv")
# Join every subtitle line into one transcript string. Missing rows are dropped
# and values are cast to str so that str.join never receives a non-string.
text27 = " ".join(epi27['Text'].dropna().astype(str))
# Count whitespace-separated tokens; len(text27) alone would be the number of
# characters, not words.
print("There are {} words in the combined transcript of this episode.".format(len(text27.split())))
# Extend the default stop words with filler words so they are left out of the cloud.
stopwords = set(STOPWORDS)
stopwords.update(["you", "know", "so", "think", "to",'of','yeah','want','people','first','And','okay','really','I'])

wordcloud = WordCloud(max_font_size=50, max_words=200, background_color="white", stopwords=stopwords).generate(text27)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#WordCloud for Parul
epi58 = pd.read_csv("/kaggle/input/chai-time-data-science/Cleaned Subtitles/E58.csv")
# Join every subtitle line into one transcript string. Missing rows are dropped
# and values are cast to str so that str.join never receives a non-string.
text58 = " ".join(epi58['Text'].dropna().astype(str))
# Count whitespace-separated tokens; len(text58) alone would be the number of
# characters, not words.
print("There are {} words in the combined transcript of this episode.".format(len(text58.split())))
# Extend the default stop words with filler words so they are left out of the cloud.
stopwords = set(STOPWORDS)
stopwords.update(["you", "know", "so", "think", "to",'of','yeah','want','people','first','And','okay','really','I','lot','work','going'])

wordcloud = WordCloud(max_font_size=50, max_words=200, background_color="white", stopwords=stopwords).generate(text58)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#WordCloud for Abhishek
epi1 = pd.read_csv("/kaggle/input/chai-time-data-science/Cleaned Subtitles/E1.csv")
# Join every subtitle line into one transcript string. Missing rows are dropped
# and values are cast to str so that str.join never receives a non-string.
text1 = " ".join(epi1['Text'].dropna().astype(str))
# Count whitespace-separated tokens; len(text1) alone would be the number of
# characters, not words.
print("There are {} words in the combined transcript of this episode.".format(len(text1.split())))
# Extend the default stop words with filler words so they are left out of the cloud.
stopwords = set(STOPWORDS)
stopwords.update(["you", "know", "So", "think", "to",'of','Yeah','want','people','first','And','okay','really','I'])

wordcloud = WordCloud(max_font_size=50, max_words=200, background_color="white", stopwords=stopwords).generate(text1)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:

# Speakers of the `largestviews` subset plotted against recording date, with
# one line per recording time.
fig = px.line(
    data_frame=largestviews,
    x='recording_date',
    y='heroes',
    color='recording_time',
    title="Recording Date vs Recording Time",
)
fig.show()

In [None]:
# Speakers of the `largestviews` subset plotted against release date, with one
# line per speaker location.
fig = px.line(
    data_frame=largestviews,
    x='release_date',
    y='heroes',
    color='heroes_location',
    title="Location vs Release Date",
)
fig.show()

# Observed Insights
* Male speakers dominated the show.
* Most speakers were located in the USA. However, fewer of them were US nationals; many seem to have moved to the US.
* Masala and Ginger Chai were the most preferred, while Kashmiri Khawa and Tulsi were the least preferred. Speakers seem to choose their tea for taste rather than for health benefits.
* Most episodes were recorded at night and the fewest in the afternoon. Speakers seem to be free for recording at night, as they would be busy with their jobs during the daytime.
* Most speakers were from the Industry category, followed by Kaggle. The fewest were from Research.
* For most speakers, the numbers of Spotify and Apple listeners were inversely proportional.
* Most of the top speakers by YouTube views also had the most YouTube likes, subscribers and comments.
* The top speakers by YouTube CTR did not have proportionally high YouTube likes.
* The top speakers by average YouTube watch duration did not match the top speakers by YouTube views.