In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/jre_elon_musk.csv")
df.head()

Unnamed: 0,Timestamp,Speaker,Text
0,[00:00:00],Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T..."
1,[00:00:09],Elon Musk,You're welcome.
2,[00:00:10],Joe Rogan,It's very good to meet you.
3,[00:00:11],Elon Musk,Nice to meet you too.
4,[00:00:12],Joe Rogan,And thanks for not lighting this place on fire.


In [5]:
# Print the first ten lines of text
for line in df.loc[:10,'Text']:
    print(line)

Ah, ha, ha, ha. Four, three, two, one, boom. Thank you. Thanks for doing this, man. Really appreciate it.
You're welcome.
It's very good to meet you.
Nice to meet you too.
And thanks for not lighting this place on fire.
You're welcome. That's coming later.
How does one, just in the middle of doing all the things you do, create cars, rockets, all the stuff you're doing,constantly innovating, decide to just make a flamethrower? Where do you have the time for that?
Well, the flame, we didn't put a lot of time into the flamethrower. This was an off-the-cuff thing. It's sort of a hobbycompany called the Boring Company, which started out as a joke, and we decided to make a real, and dig a tunnelunder LA. And then, other people asked us to dig tunnels. And so, we said yes in a few cases.
Now, who-
And then, we have a merchandise section that only has one piece of merchandise at a time. And we started off witha cap. And there was only one thing on, which is BoringCompany.com/hat. That's it. An

In [14]:
import re
re.findall("\d+", df.loc[:0,'Timestamp'][0])

['00', '00', '00']

In [15]:
# Create a function to convert timestamp into seconds
import re
def convert_timestamp_into_seconds(timestamp):
    hour, minu, secs = re.findall("\d+", timestamp)
    return int(secs) + int(minu)*60 + int(hour)*3600

In [16]:
# Convert the Timestamp column using the function defined above

df["Timestamp"] = df["Timestamp"].apply(convert_timestamp_into_seconds)

In [17]:
df

Unnamed: 0,Timestamp,Speaker,Text
0,0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T..."
1,9,Elon Musk,You're welcome.
2,10,Joe Rogan,It's very good to meet you.
3,11,Elon Musk,Nice to meet you too.
4,12,Joe Rogan,And thanks for not lighting this place on fire.
...,...,...,...
1826,9401,Joe Rogan,"I believe it's true too. So, thank you."
1827,9403,Elon Musk,You're welcome.
1828,9404,Joe Rogan,"All you assholes out there, be nice. Be nice, ..."
1829,9410,Elon Musk,"All right, thank you."


In [53]:
# Add a column with the seconds lasted by the text in the row
# for example: the first row lasts 9 seconds, since Elon Musk
# answer at the second 9 (Hint: use shift with period - 1).
# In case of negative numbers, just make them 1 (minimum interval lenght is 1 second)
import numpy as np
def max1(x):
    return np.maximum(1,x)


df["Interval"] = max1(df["Timestamp"].shift(periods=-1)-df["Timestamp"])

In [54]:
df.Speaker.unique()

array(['Joe Rogan', 'Elon Musk', 'Jaime'], dtype=object)

In [55]:
# Total seconds spoken by Joe Rogan
df[df['Speaker']=="Joe Rogan"]['Interval'].sum()


4636.0

In [56]:
# Total seconds spoken by Elon Musk
df[df['Speaker']=="Elon Musk"]['Interval'].sum()


5075.0

In [57]:
# Total seconds spoken by Jaime
df[df['Speaker']=="Jaime"]['Interval'].sum()


45.0

In [58]:
# Average speaking interval for each person
df.groupby(['Speaker'])['Interval'].mean()

Speaker
Elon Musk    5.583058
Jaime        2.647059
Joe Rogan    5.128319
Name: Interval, dtype: float64

Who speaks faster?

Tokenize the text, preprocess the tokens so that you have only words (excluding punctuation) and compute the velocity of each speaker as: number of words per interval / length of interval . Store the result in a column named Velocity and compute the average for each speaker.

In [59]:
# Preprocess the data
import spacy

nlp = spacy.load("en_core_web_sm")

# Create a function to remove punctuation from text

def remove_punctuation(text):
    text = ' '.join([token.text for token in nlp(text) if not token.is_punct])
    return text
        

# Create a function to count the non punctuation token of a text

def count_tokens(text):
    return len(nlp(text))
    
# Create a function to remove stop words from text
    
def remove_stopwords(text):
    text = ' '.join([token.text for token in nlp(text) if not token.is_stop])
    return nlp(text)

df["TextNoPunct"] = df["Text"].apply(remove_punctuation)

In [60]:
# Put the number of tokens of each row in a new column
df["n_tokens"] = df["TextNoPunct"].apply(count_tokens)

In [61]:
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct,n_tokens,Velocity
0,0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...,19,
1,9,Elon Musk,You're welcome.,1.0,You 're welcome,3,0.333333
2,10,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you,7,7.000000
3,11,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too,5,5.000000
4,12,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire,9,9.000000
...,...,...,...,...,...,...,...
1826,9401,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you,9,9.000000
1827,9403,Elon Musk,You're welcome.,1.0,You 're welcome,3,1.500000
1828,9404,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...,18,18.000000
1829,9410,Elon Musk,"All right, thank you.",1.0,All right thank you,4,0.666667


In [62]:
# Compute the velocity and store it in a new column

df["Velocity"] = df["n_tokens"]/df["Interval"]

In [63]:
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct,n_tokens,Velocity
0,0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...,19,2.111111
1,9,Elon Musk,You're welcome.,1.0,You 're welcome,3,3.000000
2,10,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you,7,7.000000
3,11,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too,5,5.000000
4,12,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire,9,9.000000
...,...,...,...,...,...,...,...
1826,9401,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you,9,4.500000
1827,9403,Elon Musk,You're welcome.,1.0,You 're welcome,3,3.000000
1828,9404,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...,18,3.000000
1829,9410,Elon Musk,"All right, thank you.",1.0,All right thank you,4,4.000000


In [64]:
# Inspect the avg velocity of each speaker
df.groupby(['Speaker'])['Velocity'].mean()

Speaker
Elon Musk    2.885230
Jaime        3.630031
Joe Rogan    3.045107
Name: Velocity, dtype: float64

After all, Elon was high...

![image](https://bsmedia.business-standard.com/_media/bs/img/article/2018-09/09/full/1536463138-6668.jpg)

In [None]:
# 