### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read CSV file

In [None]:
# Location of the TED talks CSV; the file is fetched over the network on every run.
TED_CSV_URL = 'https://raw.githubusercontent.com/sayedul79/python-data-science-code/main/data/ted_main.csv'
ted = pd.read_csv(TED_CSV_URL)

In [None]:
# Preview the first five rows of the loaded frame.
ted.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
ted.shape

In [None]:
# Column labels of the loaded frame.
ted.columns

### Check null values in DataFrame

In [None]:
# Number of missing entries in each column.
ted.isna().sum()

### Column dtypes in the DataFrame

In [None]:
# Data type of each column.
ted.dtypes

In [None]:
# Printed summary of the frame: per-column non-null counts and dtypes, plus memory usage.
ted.info()

### Fill the null values

In [None]:
# Most frequent speaker occupation(s); mode() returns a Series because ties are possible.
ted['speaker_occupation'].mode()

In [None]:
# First entry of the mode Series, i.e. a single most frequent occupation.
ted['speaker_occupation'].mode().iloc[0]

In [None]:
# Impute missing occupations with the most frequent one.
most_common_occupation = ted['speaker_occupation'].mode().iloc[0]
ted['speaker_occupation'] = ted['speaker_occupation'].fillna(most_common_occupation)

In [None]:
# Re-count the missing entries per column after the imputation.
ted.isna().sum()

So there are no missing values left.

### Descriptive analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ted.describe()

# Exploratory data analysis

In [None]:
# Pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on text columns in pandas >= 2.0 (older versions dropped them silently).
ted.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on text columns in pandas >= 2.0 (older versions dropped them silently).
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(ted.select_dtypes(include='number').corr(), annot=True, ax=ax)
ax.set_title('Correlation between numeric columns');

From the heatmap, the number of views correlates well with the `languages` and `comments` columns.

In [None]:
# One histogram per numeric column; the trailing semicolon hides the axes repr.
hist_style = dict(color='#f78f07', edgecolor="white", linewidth=1.2)
ted.hist(figsize=(15, 10), **hist_style);

In [None]:
# How many talks each main speaker has, most frequent first.
ted['main_speaker'].value_counts()

In [None]:
# Distribution of the `languages` column (presumably the number of languages
# each talk is available in -- confirm against the dataset documentation).
# The histogram is drawn from the raw column: calling value_counts() first
# would histogram how often each value occurs rather than the values themselves.
fig, ax = plt.subplots(figsize=(12, 8))
ted['languages'].hist(bins=20, color='#de009f', edgecolor='white', ax=ax)
ax.set_xlabel('languages')
ax.set_ylabel('Number of talks');

### Most commented TED talks

In [None]:
# The ten talks with the highest comment counts, in ascending order of comments.
comment_columns = ['views', 'title', 'comments']
max_comments = ted[comment_columns].sort_values('comments').tail(10)
max_comments

In [None]:
# Horizontal bars of the comment counts for the talks held in max_comments.
fig, ax = plt.subplots(figsize=(15, 10))
ax.barh(max_comments['title'], max_comments['comments'], color='#d65d0d')

title_font = {'fontsize': 20, 'fontweight': 'medium'}
ax.set_title('Most commented topics', fontdict=title_font)

# Put the value ticks on the top edge; hide the bottom/right spines and
# colour the remaining left/top spines blue.
ax.xaxis.set_ticks_position('top')
for hidden_side in ('bottom', 'right'):
    ax.spines[hidden_side].set_visible(False)
for visible_side in ('left', 'top'):
    ax.spines[visible_side].set_color('blue')

In [None]:
# Views divided by comments: a high ratio means a talk drew few comments
# relative to how often it was viewed. Keep the ten highest ratios.
ted['views_per_comment'] = ted['views'] / ted['comments']
normalise_view = (
    ted[['views_per_comment', 'title']]
    .sort_values('views_per_comment')
    .tail(10)
)
normalise_view

In [None]:
# Horizontal bars of views-per-comment for the talks held in normalise_view.
fig, ax = plt.subplots(figsize=(15, 10))
ax.barh(normalise_view['title'], normalise_view['views_per_comment'],
        color='#FFCB0B')

title_font = {'fontsize': 50, 'fontweight': 'medium'}
ax.set_title('Views per comment', fontdict=title_font)

# Hide the top/right spines; draw the left/bottom spines in blue, slightly thicker.
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)
for visible_side in ('left', 'bottom'):
    ax.spines[visible_side].set_color('blue')
for visible_side in ('left', 'bottom'):
    ax.spines[visible_side].set_linewidth(1.5)

In [None]:
# Line plot of the views column against the row index.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(ted.index.tolist(), ted['views'].tolist())

### Views per year

In [None]:
# Peek at the raw published_date values before any conversion.
ted['published_date'].head()

In [None]:
# results don't look right: no unit is given, so pandas falls back to its
# default unit (nanoseconds) when interpreting the numbers
pd.to_datetime(ted['published_date']).head()

In [None]:
# now the results look right: the values are interpreted as seconds
pd.to_datetime(ted['published_date'], unit='s').head()

In [None]:
# Replace the raw published_date values with datetimes (values read as seconds),
# then preview a few columns for the rows labelled up to and including 4.
ted['published_date'] = pd.to_datetime(ted['published_date'], unit='s')
preview_columns = ['num_speaker', 'published_date', 'ratings']
ted.loc[:4, preview_columns]

In [None]:
# Total views grouped by publication year, sorted ascending by total views.
# Relies on published_date already holding datetimes.
publication_year = ted['published_date'].dt.strftime('%Y')
year_view = ted.groupby(publication_year)['views'].sum().sort_values()

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(year_view.index, year_view, color='purple')
title_font = {'fontsize': 20, 'fontweight': 'medium'}
ax.set_title('Views per year', fontdict=title_font)

In [None]:
# Histogram of the comment counts with matplotlib's default binning.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(ted['comments'])

In [None]:
# Count how many talks have more than 1000 comments.
ted['comments'].gt(1000).sum()

In [None]:
# Four views of the comment counts:
#   top-left     -- all talks
#   top-right    -- only talks with fewer than 1000 comments
#   bottom-left  -- same subset, with 20 bins
#   bottom-right -- box plot of the same subset
under_1000 = ted.loc[ted.comments < 1000, 'comments']
bar_style = dict(edgecolor='white', color='#8B072F')

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
ax[0, 0].hist(ted.comments, **bar_style)
ax[0, 1].hist(under_1000, **bar_style)
ax[1, 0].hist(under_1000, bins=20, **bar_style)
ax[1, 1].boxplot(under_1000)

### Plot the number of talks that took place each year

In [None]:
# The dataset documentation describes film_date as the "Unix timestamp of the filming".
ted['film_date'].head()

In [None]:
# results don't look right: no unit is given, so pandas falls back to its
# default unit (nanoseconds) when interpreting the numbers
pd.to_datetime(ted['film_date']).head()

In [None]:
# now the results look right: the values are interpreted as seconds
pd.to_datetime(ted['film_date'], unit='s').head()

In [None]:
# Decode film_date (read as seconds) into a new datetime column, leaving the
# raw column untouched.
ted['film_datetime'] = pd.to_datetime(ted.film_date, unit='s')

# verify that event name matches film_datetime for a random sample;
# the seed is fixed so that Restart & Run All shows the same rows every time
ted[['event', 'film_datetime']].sample(5, random_state=42)

In [None]:
# Re-check the column dtypes now that film_datetime has been added.
ted.dtypes

In [None]:
# The .dt accessor exposes datetime parts such as the year.
ted['film_datetime'].dt.year.head()

In [None]:
# Number of talks per filming year, listed in year order.
film_year = ted['film_datetime'].dt.strftime('%Y')
film_year.value_counts().sort_index()

In [None]:
# Bar chart of the number of talks filmed in each year.
fig, ax = plt.subplots(figsize=(12, 8))
event_per_year = ted.film_datetime.dt.strftime('%Y').value_counts().sort_index()
ax.bar(event_per_year.index, event_per_year, color='#00274C')
# Rotate the labels matplotlib already generated instead of overwriting them:
# set_xticklabels() is discouraged unless tick positions were fixed first with
# set_xticks(), since labels may otherwise land on the wrong ticks (and
# matplotlib may warn about it).
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Year filmed')
ax.set_ylabel('Number of talks');

In [None]:
# Line chart of the number of talks filmed in each year.
# Years are kept as integers so the x axis is a real time axis: with string
# labels matplotlib treats every year as a category and spaces them evenly,
# which would hide any gap between years that have no talks.
fig, ax = plt.subplots(figsize=(12, 8))
event_per_year = ted.film_datetime.dt.year.value_counts().sort_index()
ax.plot(event_per_year.index, event_per_year, color='red', linewidth=1.5)
# Rotate the generated tick labels rather than replacing them with
# set_xticklabels(), which is discouraged when tick positions are not fixed.
ax.tick_params(axis='x', labelrotation=45)
# Start the axis at the first year present. On the previous categorical axis
# set_xlim(0) meant "first category"; on a numeric axis it would mean year 0.
ax.set_xlim(left=event_per_year.index.min())
ax.set_xlabel('Year filmed')
ax.set_ylabel('Number of talks');

In [None]:
# we only have partial data for 2017; the latest filming timestamp shows
# where the data stops
ted['film_datetime'].max()