# **Speaker Insights Using TED Dataset**

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
import datetime as dt

### **1) Loading Dataset**

In [32]:
# Load the TED talks dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('ted_data.csv')
# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(5, random_state=42)

Unnamed: 0,title,author,date,views,likes,link
781,The greatest mathematician that never lived,Pratik Aghor,July 2020,2300000,70000,https://ted.com/talks/pratik_aghor_the_greates...
1207,Why archaeology needs to transcend borders,Ella Al-Shamahi,April 2016,37000,1100,https://ted.com/talks/ella_al_shamahi_why_arch...
1375,A vision for the future of Sierra Leone,Julius Maada Bio,April 2019,1800000,55000,https://ted.com/talks/julius_maada_bio_a_visio...
146,Are locust plagues unstoppable?,Jeffrey A. Lockwood,November 2021,475000,14000,https://ted.com/talks/jeffrey_a_lockwood_are_l...
1218,How motivation can fix public systems,Abhishek Gopalka,September 2019,2000000,61000,https://ted.com/talks/abhishek_gopalka_how_mot...


### **2) Basic Information about data**

In [33]:
# Column names, non-null counts and dtypes: a quick check for missing values and unexpected types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5440 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5440 non-null   object
 1   author  5439 non-null   object
 2   date    5440 non-null   object
 3   views   5440 non-null   int64 
 4   likes   5440 non-null   int64 
 5   link    5440 non-null   object
dtypes: int64(2), object(4)
memory usage: 255.1+ KB


In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) for the non-object columns.
df.describe(exclude=['object']) # Exclude columns with 'object' dtype

Unnamed: 0,views,likes
count,5440.0,5440.0
mean,2061576.0,62607.62
std,3567098.0,107646.8
min,532.0,15.0
25%,670750.0,20000.0
50%,1300000.0,40500.0
75%,2100000.0,65000.0
max,72000000.0,2100000.0


In [35]:
df.isna().sum() #number of missing values in each column

title     0
author    1
date      0
views     0
likes     0
link      0
dtype: int64

In [36]:
df.duplicated().sum() #number of fully duplicated rows

0

## **3) Exploratory Data Analysis**

#### **3.1) Finding the most popular TED talks :**

In [37]:
# Popularity score: views and likes simply added together
df['popularity_score'] = df['views'] + df['likes']

# Ten highest-scoring talks, rendered as a colour-graded table
popular_ted = (
    df[['title', 'popularity_score']]
    .sort_values(by='popularity_score', ascending=False)
    .head(10)
    .style
    .background_gradient(cmap='PuBuGn')
)
popular_ted


Unnamed: 0,title,popularity_score
5436,Do schools kill creativity?,74100000
4084,Your body language may shape who you are,65900000
2958,Inside the mind of a master procrastinator,61800000
4765,How great leaders inspire action,58700000
4605,The power of vulnerability,57700000
3504,How to speak so that people want to listen,50400000
2168,My philosophy for a happy life,44300000
3251,The next outbreak? We're not ready,44300000
3017,What makes a good life? Lessons from the longest study on happiness,42200000
3994,"Looks aren't everything. Believe me, I'm a model.",39100000


In [38]:
# Popularity score: views and likes simply added together
df['popularity_score'] = df['views'] + df['likes']

# Ten highest-scoring talks as a plain DataFrame (no Styler) so it can be plotted
popular_ted = (
    df[['title', 'popularity_score']]
    .sort_values(by='popularity_score', ascending=False)
    .head(10)
)

# Bar chart: score on the x axis, talk title on the y axis
fig = px.bar(x=popular_ted['popularity_score'], y=popular_ted["title"])
fig.update_layout(
    title_text='Top 10 TED Talks based on Popularity Score',
    xaxis_title="Popularity Score",
    yaxis_title="Title",
)
fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.show()


#### **3.2) Finding the most popular TED talk speakers (in terms of likes)**

In [39]:
# Ten most-liked talks shown with their speaker
# (each row is a single talk, not a per-speaker total)
popular_speaker = (
    df[['author', 'likes']]
    .sort_values(by='likes', ascending=False)
    .head(10)
    .style
    .background_gradient(cmap='PuBuGn')
)
popular_speaker

Unnamed: 0,author,likes
5436,Sir Ken Robinson,2100000
4084,Amy Cuddy,1900000
2958,Tim Urban,1800000
4605,Brené Brown,1700000
4765,Simon Sinek,1700000
3504,Julian Treasure,1400000
2168,Sam Berns,1300000
3251,Bill Gates,1300000
3017,Robert Waldinger,1200000
4990,Mary Roach,1100000


In [40]:
# Ten most-liked talks with their speaker (one row per talk)
popular_speaker = df[['author','likes']].sort_values(by='likes', ascending=False).head(10)

# Visualization: likes on the x axis, one coloured bar per speaker on the y axis
fig = px.histogram(popular_speaker, x='likes', y='author',
                   color='author',
                   color_discrete_sequence=px.colors.qualitative.Set3)  # Use of predefined color sequence
# Title and axis labels describe what is actually plotted: likes per speaker
fig.update_layout(title_text='Top 10 TED Speakers based on number of likes',
                  xaxis_title="Likes", yaxis_title="Speaker")
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.show()


#### **3.3) Month-wise Analysis of TED talk frequency**

In [41]:
# Parse the 'date' strings once, then derive the calendar parts from the parsed values
published = pd.to_datetime(df['date'])
df['Published_year'] = published.dt.year      # calendar year
df['Published_month'] = published.dt.month    # month number (1-12), not the month name
# NOTE(review): the sampled dates look like "July 2020" (no day given), so the
# parsed day is presumably always 1 -- confirm before using this column.
df['Published_day'] = published.dt.day        # day of month

In [42]:
# Drop the 'date' column now that year, month and day have been extracted into new columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second run is a no-op rather than a KeyError.
df = df.drop(columns='date', errors='ignore')

In [43]:
# Count talks per publication month; reset_index turns the month into a regular column
month_wise = (
    df.groupby('Published_month')['title']
    .count()
    .reset_index()
)
month_wise

Unnamed: 0,Published_month,title
0,1,147
1,2,725
2,3,580
3,4,576
4,5,322
5,6,493
6,7,446
7,8,200
8,9,349
9,10,585


In [44]:
# Line chart: number of talks for each publication month
fig = px.line(
    month_wise,
    x='Published_month',
    y='title',
    title='Month-wise Analysis of TED talk frequency',
)
fig.update_layout(xaxis_title='Month', yaxis_title='Number of Talks')
# Marker fill/outline styling and overall trace opacity
fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.show()

#### **3.4) Year-wise Analysis of TED talk frequency**

In [45]:
# Count talks per publication year; reset_index turns the year into a regular column
year_wise = (
    df.groupby('Published_year')['title']
    .count()
    .reset_index()
)
year_wise.head()

Unnamed: 0,Published_year,title
0,1970,2
1,1972,1
2,1983,1
3,1984,1
4,1990,1


In [46]:
# Line chart: number of talks for each publication year
fig = px.line(
    year_wise,
    x='Published_year',
    y='title',
    title='Year-wise Analysis of TED talk frequency',
)
fig.update_layout(xaxis_title='Year', yaxis_title='Number of Talks')
# Marker fill/outline styling and overall trace opacity
fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.show()

#### **3.5) Finding TED talks of your favorite Author**

In [47]:
# function to find the talks given by your favorite author
def fav_author(df,author):

  """
  Return the talks in `df` that were given by `author`.

  String lookups ignore letter case and leading/trailing whitespace, so a
  name typed at a prompt (e.g. ' bill gates') still finds 'Bill Gates'.

  df = provided dataset; must contain 'title' and 'author' columns
  author = provide your fav author

  Returns a DataFrame holding the 'title' and 'author' columns of the
  matching rows, or the string 'No Author found' when nothing matches.
  """

  if isinstance(author, str):
    wanted = author.strip().casefold()
    # Rows with a missing author stay NaN under .str and therefore compare as False
    is_match = df['author'].str.strip().str.casefold().eq(wanted)
  else:
    # Non-string lookups keep plain equality
    is_match = df['author'] == author

  talks = df.loc[is_match, ['title', 'author']]

  if talks.empty:
    return 'No Author found'
  return talks

In [48]:
# Ask for a speaker name; input() blocks until a value is typed
author = input('Enter your favorite author')
# Look the speaker up in the dataset and display the matching talks (or the not-found message)
fav_author(df , author)

Unnamed: 0,title,author
403,The innovations we need to avoid a climate dis...,Bill Gates
789,How the pandemic will shape the near future,Bill Gates
952,How we must respond to the coronavirus pandemic,Bill Gates
3251,The next outbreak? We're not ready,Bill Gates
3878,Teachers need real feedback,Bill Gates
4560,How state budgets are breaking US schools,Bill Gates
4824,Innovating to zero!,Bill Gates
5062,"Mosquitos, malaria and education",Bill Gates


#### **3.6) Finding TED talks with the best view to like ratio**

In [49]:
# Views-per-like for every talk (a lower value means more likes per view)
ratio = df['views'].div(df['likes'])

In [50]:
# Scatter of views against likes: colour encodes the views/likes ratio, marker size scales with views
fig = px.scatter(
    df,
    x='views',
    y='likes',
    color=ratio,
    size='views',
    size_max=50,
    title='Views vs. Likes with Ratio as Color',
)
fig.update_layout(xaxis_title='Views', yaxis_title='Likes')
fig.show()

#### **3.7) Finding TED talks based on tags (like climate)**

In [51]:
## function to search talks by a keyword ("tag") in the title
def tags(tag):
  """
  Return the rows of the notebook-level `df` whose title contains `tag`.

  The match is case-insensitive and treats `tag` as literal text, so input
  containing regex metacharacters (e.g. 'c++' or '(') is searched for as
  typed instead of being interpreted as a pattern.

  tag = keyword to look for in the 'title' column

  Returns the filtered DataFrame, or the string 'No talk found' when no
  title contains the keyword.
  """
  # Lower-case both the keyword and the titles for case-insensitive matching
  needle = tag.lower()
  # regex=False: match the text literally; na=False: a missing title counts as a non-match
  hits = df['title'].str.lower().str.contains(needle, regex=False, na=False)
  filtered_df = df[hits]

  if filtered_df.empty:
    return 'No talk found'
  return filtered_df

In [52]:
# Ask for a keyword and show every talk whose title contains it
tag = input('Enter your favorite tag : ')
tags(tag)

Unnamed: 0,title,author,views,likes,link,popularity_score,Published_year,Published_month,Published_day
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...,416000,2021,12,1
3,Why is China appointing judges to combat clima...,James K. Thornton,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...,439000,2021,10,1
17,The ocean's ingenious climate solutions,Susan Ruffo,522000,15000,https://ted.com/talks/susan_ruffo_the_ocean_s_...,537000,2021,10,1
18,How we're reducing the climate impact of elect...,Tim Dunn,96000,2900,https://ted.com/talks/tim_dunn_how_we_re_reduc...,98900,2022,2,1
34,How to boost nature-based solutions to climate...,Zac Goldsmith,30000,913,https://ted.com/talks/zac_goldsmith_how_to_boo...,30913,2021,10,1
...,...,...,...,...,...,...,...,...,...
4892,The science behind a climate headline,Rachel Pike,1000000,31000,https://ted.com/talks/rachel_pike_the_science_...,1031000,2009,7,1
5226,New thinking on the climate crisis,Al Gore,2200000,68000,https://ted.com/talks/al_gore_new_thinking_on_...,2268000,2008,3,1
5280,A critical look at geoengineering against clim...,David Keith,1400000,44000,https://ted.com/talks/david_keith_a_critical_l...,1444000,2007,9,1
5389,Global priorities bigger than climate change,Bjorn Lomborg,1700000,53000,https://ted.com/talks/bjorn_lomborg_global_pri...,1753000,2005,2,1


#### **3.8) Finding the most popular TED talks Speaker (in terms of number of views) :**

In [53]:
# Total views per speaker, summed over all of their talks
author_views = (
    df.groupby('author')['views']
    .sum()
    .reset_index()
)
# Twenty most-viewed speakers as a colour-graded table
(
    author_views
    .sort_values(by='views', ascending=False)
    .head(20)
    .style
    .background_gradient(cmap='PuBuGn')
)

Unnamed: 0,author,views
148,Alex Gendler,187196000
3902,Sir Ken Robinson,95654000
544,Bill Gates,77800000
3895,Simon Sinek,74800000
594,Brené Brown,72000000
2240,Julian Treasure,64300000
244,Amy Cuddy,64000000
4184,Tim Urban,60000000
1781,Iseult Gillespie,54998000
2914,Mia Nacamulli,44174000


In [54]:
# getting only the top 20 speakers by total views (the count matches the chart title)
author_views = author_views.sort_values(by='views', ascending=False).head(20)

# plotting bar-chart: one bar per speaker, bar height = total views

fig = px.bar(author_views , x='author' , y='views' , title='Top 20 TED speakers based on number of views',
             color="author",
             color_discrete_sequence=px.colors.qualitative.Set2)
fig.update_layout(xaxis_title='Author' , yaxis_title='Views')
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.show()