In [1]:
import nltk
import csv
import pandas as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.corpus import stopwords

### Import the dataset as a pandas DataFrame and explore the data.

In [2]:
#Read the TED talks CSV into a pandas DataFrame and report its shape as (rows, columns).
#NOTE(review): 'Numnber' in the printed label below is a typo; it is left as-is so the code still matches the recorded output.
ted = pd.read_csv('ted_main.csv', encoding='utf-8')
print(f'Numnber of rows and columns of the ted data: {ted.shape}')

Numnber of rows and columns of the ted data: (2550, 17)


In [3]:
#Preview the first 3 rows to check the column names and sample values.
#The bare expression on the last line is what the notebook renders as a table.
ted.head(3)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292


### Extract the 'description' column (the 2nd column).

In [4]:
#Select the 'description' column (all rows) as a Series and report its shape.
tedDesc = ted.loc[:, 'description']
print(f'Number of row of the description column: {tedDesc.shape}')

Number of row of the description column: (2550,)


In [5]:
#Show one raw description (row label 1) to see what the text looks like before tokenization.
tedDesc.loc[1]

'With the same humor and humanity he exuded in "An Inconvenient Truth," Al Gore spells out 15 ways that individuals can address climate change immediately, from buying a hybrid to inventing a new, hotter brand name for global warming.'

### [A] Word tokenization (only).

#### (1) Total # of tokens.

In [6]:
#Tokenize every description with word_tokenize, then flatten the per-description
#token lists into one corpus-wide list. No case folding or filtering is applied here.
vocabList1 = [tok for descTokens in map(word_tokenize, tedDesc) for tok in descTokens]

print(f'Total # of tokens - word tokenization only: {len(vocabList1)}')

Total # of tokens - word tokenization only: 152000


#### (2) Size of vocabulary.

In [7]:
#Frequency distribution: each distinct token mapped to its number of occurrences.
vocabDict1 = nltk.FreqDist(vocabList1)

#Iterating the distribution yields its keys, i.e. the distinct token types.
uniqueVocab1 = list(vocabDict1)

#Vocabulary size = number of distinct token types.
print(f'Size of vocabulary - word tokenization only: {len(uniqueVocab1)}')

Size of vocabulary - word tokenization only: 17878


#### (3) Top 30 most common token types with frequency (list in descending order of frequency).

In [8]:
#The 30 highest-frequency (token, count) pairs, most frequent first.
top30Common1 = vocabDict1.most_common(30)

#Heading and list are printed on separate lines.
print('Top 30 most common token types - word tokenization only:', top30Common1, sep='\n')

Top 30 most common token types - word tokenization only:
[(',', 7382), ('.', 5764), ('the', 5395), ('and', 4264), ('of', 3651), ('to', 3528), ('a', 3505), ('in', 1762), ('--', 1485), ('that', 1472), ("'s", 1217), ('for', 1140), ('``', 898), ("''", 893), ('with', 879), ('we', 878), ('is', 834), ('it', 833), ('?', 824), ('this', 812), ('In', 762), ('on', 761), ('how', 746), ('he', 738), ('from', 707), ('talk', 693), ('his', 686), (':', 643), ('about', 630), ('as', 605)]


#### Write output to a file.

In [9]:
#Save the Part A top-30 (token, frequency) pairs as a two-column CSV file.
#newline='' leaves line-ending handling to the csv module, as its documentation recommends.
with open('top30-PartA.csv', 'w', newline='', encoding='utf-8') as outFile:
    csvWriter = csv.writer(outFile)
    #Header
    csvWriter.writerow(["Token", "Frequency"])
    #Data: one row per (token, count) pair.
    csvWriter.writerows(top30Common1)

#Plain string literal: the message has nothing to interpolate, so no f-prefix is needed.
print("Successfully written!")

Successfully written!


#### (4) Percentage of tokens in the dataset that is covered by the top 30 token types.

In [10]:
#Add up the occurrence counts of the 30 most common token types.
totalTokens1_top30 = sum(count for _, count in top30Common1)
print(f'Total number of tokens for top 30: {totalTokens1_top30}')

#Express that sum as a percentage of all tokens in the corpus.
top30Perc1 = totalTokens1_top30 / len(vocabList1) * 100
print(f'Percentage of tokens covered by the top 30 token types - word tokenization only: {top30Perc1:.3f}%')

Total number of tokens for top 30: 54387
Percentage of tokens covered by the top 30 token types - word tokenization only: 35.781%


#### Breakdown table: percentage of all tokens accounted for by each top-30 token type.

In [11]:
#Per-token share of the whole corpus, in percent, for each of the top 30 token types.
top30tokenPerc1 = [
    (tokenType, (count / len(vocabList1)) * 100)
    for tokenType, count in top30Common1
]

#Tabulate the pairs; the bare name on the last line makes the notebook render the table.
df_top30tokenPerc1 = pd.DataFrame(top30tokenPerc1, columns=['Token', 'Percentage'])
df_top30tokenPerc1

Unnamed: 0,Token,Percentage
0,",",4.856579
1,.,3.792105
2,the,3.549342
3,and,2.805263
4,of,2.401974
5,to,2.321053
6,a,2.305921
7,in,1.159211
8,--,0.976974
9,that,0.968421


### [B] Word tokenization + Case folding (lower-case) + Stopword filtering + Non-alphabet filtering.

#### (1) Total # of tokens.

In [12]:
#English stop words held in a set so each membership test is a hash lookup.
stopwordsSet = set(stopwords.words('english'))

#Lower-case each description, tokenize it, and KEEP a token only when it is
#purely alphabetic and is not a stop word; everything else is dropped.
vocabList2 = [
    tok
    for description in tedDesc
    for tok in word_tokenize(description.lower())
    if tok.isalpha() and tok not in stopwordsSet
]

print(f'Total # of tokens - tokenization/lower-case/non stop-word/alphabet: {len(vocabList2)}')

Total # of tokens - tokenization/lower-case/non stop-word/alphabet: 74131


#### (2) Size of vocabulary.

In [13]:
#Frequency distribution over the filtered, lower-cased tokens.
vocabDict2 = nltk.FreqDist(vocabList2)

#Iterating the distribution yields its keys, i.e. the distinct token types.
uniqueVocab2 = list(vocabDict2)

#Vocabulary size = number of distinct token types.
print(f'Size of vocabulary - tokenization/lower-case/non stop-word/alphabet: {len(uniqueVocab2)}')

Size of vocabulary - tokenization/lower-case/non stop-word/alphabet: 14714


#### (3) Top 30 most common token types with frequency (list in descending order of frequency).

In [14]:
#The 30 highest-frequency (token, count) pairs, most frequent first.
top30Common2 = vocabDict2.most_common(30)

#Heading and list are printed on separate lines.
print('Top 30 most common token types - tokenization/lower-case/non stop-word/alphabet:', top30Common2, sep='\n')

Top 30 most common token types - tokenization/lower-case/non stop-word/alphabet:
[('talk', 700), ('us', 643), ('world', 515), ('new', 415), ('says', 411), ('people', 332), ('shares', 326), ('shows', 282), ('life', 274), ('one', 272), ('ted', 254), ('like', 251), ('make', 239), ('way', 227), ('human', 205), ('work', 203), ('could', 200), ('help', 184), ('even', 179), ('story', 179), ('time', 168), ('years', 163), ('makes', 153), ('talks', 148), ('data', 142), ('future', 142), ('change', 140), ('powerful', 139), ('know', 133), ('two', 130)]


#### Write output to a file.

In [15]:
#Save the Part B top-30 (token, frequency) pairs as a two-column CSV file.
#newline='' leaves line-ending handling to the csv module, as its documentation recommends.
with open('top30-PartB.csv', 'w', newline='', encoding='utf-8') as outFile:
    csvWriter = csv.writer(outFile)
    #Header
    csvWriter.writerow(["Token", "Frequency"])
    #Data: one row per (token, count) pair.
    csvWriter.writerows(top30Common2)

#Plain string literal: the message has nothing to interpolate, so no f-prefix is needed.
print("Successfully written!")

Successfully written!


#### (4) Percentage of tokens in the dataset that is covered by the top 30 token types.

In [16]:
#Add up the occurrence counts of the 30 most common token types.
totalTokens2_top30 = sum(count for _, count in top30Common2)
print(f'Total number of tokens for top 30: {totalTokens2_top30}')

#Express that sum as a percentage of all tokens that survived filtering.
top30Perc2 = totalTokens2_top30 / len(vocabList2) * 100
print(f'Percentage of tokens covered by the top 30 token types - tokenization/lower-case/non stop-word/alphabet: {top30Perc2:.3f}%')

Total number of tokens for top 30: 7749
Percentage of tokens covered by the top 30 token types - tokenization/lower-case/non stop-word/alphabet: 10.453%


#### Breakdown table: percentage of all tokens accounted for by each top-30 token type.

In [17]:
#Per-token share of the filtered corpus, in percent, for each of the top 30 token types.
top30tokenPerc2 = [
    (tokenType, (count / len(vocabList2)) * 100)
    for tokenType, count in top30Common2
]

#Tabulate the pairs; the bare name on the last line makes the notebook render the table.
df_top30tokenPerc2 = pd.DataFrame(top30tokenPerc2, columns=['Token', 'Percentage'])
df_top30tokenPerc2

Unnamed: 0,Token,Percentage
0,talk,0.944274
1,us,0.867383
2,world,0.694716
3,new,0.55982
4,says,0.554424
5,people,0.447856
6,shares,0.439762
7,shows,0.380408
8,life,0.369616
9,one,0.366918


### [C] Word tokenization + Case folding (lower-case) + Stopword filtering + Non-alphabet filtering + Porter stemming.

#### (1) Total # of tokens.

In [18]:
#Porter stemmer object.
porterStemmer = nltk.PorterStemmer()

#Part C is Part B's pipeline followed by stemming. vocabList2 (built in Part B)
#already holds the lower-cased, alphabetic, non-stop-word tokens in corpus order,
#so stem those directly instead of rebuilding the stop-word set and tokenizing and
#filtering every description a second time. Filtering still precedes stemming, so
#the resulting list is token-for-token what the repeated pipeline produced.
vocabList3 = [porterStemmer.stem(token) for token in vocabList2]

print(f'Total # of tokens - tokenization/lower-case/non stop-word/alphabet/stemmed: {len(vocabList3)}')

Total # of tokens - tokenization/lower-case/non stop-word/alphabet/stemmed: 74131


#### (2) Size of vocabulary.

In [19]:
#Frequency distribution over the stemmed tokens.
vocabDict3 = nltk.FreqDist(vocabList3)

#Iterating the distribution yields its keys, i.e. the distinct stems.
uniqueVocab3 = list(vocabDict3)

#Vocabulary size = number of distinct stems.
print(f'Size of vocabulary - tokenization/lower-case/non stop-word/alphabet/stemmed: {len(uniqueVocab3)}')

Size of vocabulary - tokenization/lower-case/non stop-word/alphabet/stemmed: 10592


#### (3) Top 30 most common token types with frequency (list in descending order of frequency).

In [20]:
#The 30 highest-frequency (stem, count) pairs, most frequent first.
top30Common3 = vocabDict3.most_common(30)

#Heading and list are printed on separate lines.
print('Top 30 most common token types - tokenization/lower-case/non stop-word/alphabet/stemmed:', top30Common3, sep='\n')

Top 30 most common token types - tokenization/lower-case/non stop-word/alphabet/stemmed:
[('talk', 880), ('us', 643), ('world', 527), ('say', 453), ('make', 449), ('share', 444), ('new', 415), ('show', 371), ('use', 360), ('work', 356), ('peopl', 334), ('human', 330), ('way', 326), ('one', 307), ('stori', 307), ('live', 282), ('help', 281), ('life', 274), ('like', 272), ('power', 262), ('ted', 254), ('design', 240), ('take', 223), ('learn', 221), ('look', 219), ('time', 213), ('year', 210), ('think', 204), ('creat', 203), ('could', 200)]


#### Write output to a file.

In [25]:
#Save the Part C top-30 (stem, frequency) pairs as a two-column CSV file.
#newline='' leaves line-ending handling to the csv module, as its documentation recommends.
with open('top30-PartC.csv', 'w', newline='', encoding='utf-8') as outFile:
    csvWriter = csv.writer(outFile)
    #Header
    csvWriter.writerow(["Token", "Frequency"])
    #Data: one row per (stem, count) pair.
    csvWriter.writerows(top30Common3)

#Plain string literal: the message has nothing to interpolate, so no f-prefix is needed.
print("Successfully written!")

Successfully written!


#### (4) Percentage of tokens in the dataset that is covered by the top 30 token types.

In [22]:
#Add up the occurrence counts of the 30 most common stems.
totalTokens3_top30 = sum(count for _, count in top30Common3)
print(f'Total number of tokens for top 30: {totalTokens3_top30}')

#Express that sum as a percentage of all stemmed tokens.
top30Perc3 = totalTokens3_top30 / len(vocabList3) * 100
print(f'Percentage of tokens covered by the top 30 token types - tokenization/lower-case/non stop-word/alphabet/stemmed: {top30Perc3:.3f}%')

Total number of tokens for top 30: 10060
Percentage of tokens covered by the top 30 token types - tokenization/lower-case/non stop-word/alphabet/stemmed: 13.571%


#### Breakdown table: percentage of all tokens accounted for by each top-30 token type.

In [23]:
#Per-stem share of the stemmed corpus, in percent, for each of the top 30 stems.
top30tokenPerc3 = [
    (tokenType, (count / len(vocabList3)) * 100)
    for tokenType, count in top30Common3
]

#Tabulate the pairs; the bare name on the last line makes the notebook render the table.
df_top30tokenPerc3 = pd.DataFrame(top30tokenPerc3, columns=['Token', 'Percentage'])
df_top30tokenPerc3

Unnamed: 0,Token,Percentage
0,talk,1.187088
1,us,0.867383
2,world,0.710904
3,say,0.61108
4,make,0.605685
5,share,0.59894
6,new,0.55982
7,show,0.500465
8,use,0.485627
9,work,0.480231
