# Analysis, Recommendation and Text Generation for TED talks using NLP
## 1. Pre-Processing

## Importing relevant libraries

In [32]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.style as style
from sklearn.feature_extraction import text

import warnings
warnings.filterwarnings("ignore")
style.use('dark_background')

## Reading the data file from csv

In [5]:
main_df = pd.read_csv('TED-data/ted_main.csv')
trans_df = pd.read_csv('TED-data/transcripts.csv)

In [7]:
for i in range(len(main_df.columns)):
    i+=1

print("Total Columns : ", i)

main_df.columns

Total Columns :  17


Index(['comments', 'description', 'duration', 'event', 'film_date',
       'languages', 'main_speaker', 'name', 'num_speaker', 'published_date',
       'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title',
       'url', 'views'],
      dtype='object')

### There are 17 feature columns in the main dataset

### About the features:
* __comments__: The Number of comments of the talk
* __description__: A summary of what the talk was about
* __duration__: The Duration of the talk in seconds
* __event__: The Event where the talk took place
* __film_date__: The Date on which the talk was filmed/recorded.
* __languages__: The number of languages in which the TED talk is available.
* __main_speaker__: The main speaker of the talk
* __name__: Includes Title and name of the speaker.
* __num_speaker__: Number of speakers in the talk
* __published_date__: Date when the TED Talk was published
* __ratings__: A Dictionary of various ratings given like Courageous, Inspiring, etc
* __related_talks__: A list of talks recomended to watch next
* __speaker_occupation__: Occupation of the speaker
* __tags__: Different themes related to talk
* __title__: Title of the TED Talk
* __url__: The URL of the TED Talk
* __views__:Number of views on the TED Talk

In [8]:
main_df.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


### Checking for missing or null values in dataset

In [24]:
main_df.isnull().sum()

name                  0
title                 0
description           0
main_speaker          0
speaker_occupation    0
num_speaker           0
duration              0
duration_hr           0
event                 0
film_date             0
published_date        0
comments              0
tags                  0
languages             0
ratings               0
related_talks         0
url                   0
views                 0
dtype: int64

Only "speaker_occupation" column has missing values in 6 rows. In the general case if the missing values type is numeric we fill it with mean values. If its an object we fill missing values with mode.

In [23]:
main_df['speaker_occupation'] = main_df.speaker_occupation.fillna(main_df.speaker_occupation.mode()[0])

### Converting "film_date" and "published_date" columns into D-M-Y format using datetime library 

In [13]:
main_df['film_date'] = main_df['film_date'].apply(lambda x: dt.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))
main_df['published_date'] = main_df['published_date'].apply(lambda x: dt.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))

In [26]:
# verify that event name matches film_date
main_df[['event', 'film_date']].sample(5)

Unnamed: 0,event,film_date
75,TED2002,01-02-2002
1991,TEDWomen 2015,28-05-2015
956,TEDxNorthwesternU,14-12-2010
311,TED2008,01-02-2008
865,TEDWomen 2010,07-12-2010


### Converting "duration" into hours and storing in new column labelled "duration_hrs"

In [18]:
main_df['duration_hr']=main_df['duration']/(60*60)                  # converted duration to hours by dividing by 3600
main_df['duration_hr']=main_df['duration_hr'].astype(float)         # stored time as hours in float datatype
main_df['duration_hr']=main_df['duration_hr'].round(decimals=2)     # rounded to nearest two decimal places

In [22]:
main_df.head(1)

Unnamed: 0,name,title,description,main_speaker,speaker_occupation,num_speaker,duration,duration_hr,event,film_date,published_date,comments,tags,languages,ratings,related_talks,url,views
0,Ken Robinson: Do schools kill creativity?,do schools kill creativity?,sir ken robinson makes an entertaining and pro...,Ken Robinson,Author/educator,1,1164,0.32,TED2006,24-02-2006,26-06-2006,4553,"['children', 'creativity', 'culture', 'dance',...",60,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",https://www.ted.com/talks/ken_robinson_says_sc...,47227110


### Converting Upper case letters in "description" and "title" to lower case

In [15]:
main_df['description']=main_df['description'].str.lower()
main_df['title']=main_df['title'].str.lower()

In [20]:
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

### Re-ordering the data frame for better ordering of features according to personal preference

In [21]:
main_df = main_df[['name', 'title', 'description', 'main_speaker', 'speaker_occupation', 'num_speaker', 'duration','duration_hr', 'event', 'film_date', 'published_date', 'comments', 'tags', 'languages', 'ratings', 'related_talks', 'url', 'views']]

### Copying Dataframe for EDA

In [27]:
md_copy = main_df.copy()

In [28]:
md_copy.dtypes

name                   object
title                  object
description            object
main_speaker           object
speaker_occupation     object
num_speaker             int64
duration                int64
duration_hr           float64
event                  object
film_date              object
published_date         object
comments                int64
tags                   object
languages               int64
ratings                object
related_talks          object
url                    object
views                   int64
dtype: object

### Removing HTML tags from the description text

In [33]:
def tag_removal(string):
    result = re.sub('<.*?>','',string)
    return result

md_copy['description']=md_copy['description'].apply(lambda cw : tag_removal(cw))

## 2. Exploratory Data Analysis 

In [34]:
md_copy.head(5)

Unnamed: 0,name,title,description,main_speaker,speaker_occupation,num_speaker,duration,duration_hr,event,film_date,published_date,comments,tags,languages,ratings,related_talks,url,views
0,Ken Robinson: Do schools kill creativity?,do schools kill creativity?,sir ken robinson makes an entertaining and pro...,Ken Robinson,Author/educator,1,1164,0.32,TED2006,24-02-2006,26-06-2006,4553,"['children', 'creativity', 'culture', 'dance',...",60,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,Al Gore: Averting the climate crisis,averting the climate crisis,with the same humor and humanity he exuded in ...,Al Gore,Climate advocate,1,977,0.27,TED2006,24-02-2006,26-06-2006,265,"['alternative energy', 'cars', 'climate change...",43,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,David Pogue: Simplicity sells,simplicity sells,new york times columnist david pogue takes aim...,David Pogue,Technology columnist,1,1286,0.36,TED2006,23-02-2006,26-06-2006,124,"['computers', 'entertainment', 'interface desi...",26,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,Majora Carter: Greening the ghetto,greening the ghetto,"in an emotionally charged talk, macarthur-winn...",Majora Carter,Activist for environmental justice,1,1116,0.31,TED2006,25-02-2006,26-06-2006,200,"['MacArthur grant', 'activism', 'business', 'c...",35,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,Hans Rosling: The best stats you've ever seen,the best stats you've ever seen,you've never seen data presented like this. wi...,Hans Rosling,Global health expert; data visionary,1,1190,0.33,TED2006,21-02-2006,27-06-2006,593,"['Africa', 'Asia', 'Google', 'demo', 'economic...",48,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",https://www.ted.com/talks/hans_rosling_shows_t...,12005869
