# Analysis

In [None]:
!pip install textstat
!pip install lexicalrichness
!pip install textblob
!pip install spacy

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import statistics
import textstat
from lexicalrichness import LexicalRichness
import re
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Plot pretty figures and avoid blurry images
%config InlineBackend.figure_format = 'retina'

## Load Data

In [None]:
# Columns carried forward into the measure computations
keep_cols = [
    'talk_name', 'event', 'event_cat', 'views', 'duration', 'tags',
    'p_year', 'p_month', 'transcript', 'syll', 'words', 'sent',
]

# Read the preprocessed talks and keep only the columns listed above
ted = pd.read_csv('../data/interim/ted_preprocessed.csv')[keep_cols]

## Compute Measures

### Complexity Measures

The complexity measures computed for this analysis are:
- Syntactic complexity measures
    - Median number of words per sentence (MWS)
    - Flesch-Kincaid grade level readability measure (FKL)
- Semantic complexity measures
    - Measure of textual lexical diversity (MTLD)

### Fluency Measures

- Syllables per minute (SPM)

### TED-specific Measures

- Laughter Frequency (LF): The ratio of the number of times the audience laughs to the total length of the talk in seconds
- Pronominal Measures (PM): The ratio of the number of times a speaker says “I”, “you” or “we” to the total length of the talk in seconds
- Numerical Info Proportion (NIP): The ratio of the number of times a speaker uses a numerical word (relating to a number) to the total length of the talk in seconds




In [None]:
#-----complexity measures-----
def mws(text):
    """Return the median sentence length of ``text``, measured in words.

    The text is cut into sentences wherever ``.``, ``?`` or ``!`` (plus any
    closing quotes/brackets) is followed by a space, a ``|`` or a newline and
    then an uppercase letter. Each piece is sized with
    ``textstat.lexicon_count`` and the median of those sizes is returned.
    """

    # NOTE: inside the character class ``[ |\n]`` the ``|`` is a literal pipe,
    # not an alternation operator.
    boundary = re.compile(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])')

    lengths = []
    for piece in boundary.split(text):
        lengths.append(textstat.lexicon_count(piece))
    return statistics.median(lengths)

def fkl(text):
    """Return the Flesch-Kincaid grade level of ``text``.

    The score is whatever ``textstat.flesch_kincaid_grade`` reports for it.
    """

    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

def mtld(text):
    """Return the MTLD (measure of textual lexical diversity) score of ``text``.

    Delegates to ``LexicalRichness.mtld`` using that method's default settings.
    """

    return LexicalRichness(text).mtld()

#-----Laughter Frequency-----
def laughs(text):
    """Count the audience-laughter annotations in a transcript.

    Only the literal marker ``(Laughter)`` -- parentheses included -- is
    counted. The previous pattern ``'(Laughter)'`` was interpreted as a regex
    *group*, so it matched the bare word "Laughter" anywhere, including when
    the speaker simply said the word.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``(Laughter)``.
    """

    # A plain substring count avoids regex metacharacter pitfalls entirely.
    return text.count('(Laughter)')

#-----Pronominal measures-----
def pm_i(text):
    """Count how many times the pronoun 'I' is used.

    The match is case-sensitive and uses word boundaries, so an "I" followed
    by punctuation or a contraction ("I,", "I.", "I'm", "I've") is counted,
    while capital I inside a longer word ("It", "In") is not. Splitting on
    whitespace, as done previously, missed every "I" that had punctuation
    attached.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    int
        Number of standalone occurrences of 'I'.
    """

    return len(re.findall(r'\bI\b', text))

def pm_we(text):
    """Count how many times the pronoun 'we' is used (case-insensitive).

    The word is matched on word boundaries, so contractions such as "we're"
    and "we've" are counted, but the letters "we" inside other words
    ("were", "well", "power", "answer", "between") are not. The previous
    pattern matched the bare substring and therefore heavily over-counted.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    int
        Number of standalone occurrences of 'we'.
    """

    return len(re.findall(r'\bwe\b', text, flags=re.IGNORECASE))

def pm_you(text):
    """Count how many times the pronoun 'you' is used (case-insensitive).

    The word is matched on word boundaries, so contractions such as "you're"
    and "you'll" are counted, but longer words that merely start with those
    letters ("your", "yours", "young", "youth") are not. The previous pattern
    matched the bare substring and therefore over-counted.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    int
        Number of standalone occurrences of 'you'.
    """

    return len(re.findall(r'\byou\b', text, flags=re.IGNORECASE))

In [None]:
# Transcript-level measures: output column -> function applied to each transcript.
# Insertion order fixes the order in which the columns are appended to `ted`.
transcript_measures = {
    'mws': mws,
    'fkl': fkl,
    'mtld': mtld,
    'cnt_laughs': laughs,
    'cnt_i': pm_i,
    'cnt_we': pm_we,
    'cnt_you': pm_you,
}

for column, measure in transcript_measures.items():
    ted[column] = ted['transcript'].apply(measure)

# Per-second rates: number of occurrences divided by the talk length.
# `duration` is taken to be in seconds, as stated in the measure definitions
# in this notebook.
ted['lf'] = (ted['cnt_laughs'] / ted['duration']).round(2)
ted['pm_i'] = (ted['cnt_i'] / ted['duration']).round(2)
ted['pm_we'] = (ted['cnt_we'] / ted['duration']).round(2)
ted['pm_you'] = (ted['cnt_you'] / ted['duration']).round(2)

# Syllables per minute: convert the duration from seconds to minutes first.
# (The earlier `syll / duration / 60` divided the per-second rate by 60
# instead of multiplying it, giving values 3600x too small.)
ted['spm'] = (ted['syll'] / (ted['duration'] / 60)).round(2)

In [None]:
# Preview the first rows to sanity-check the newly added measure columns
ted.head(n=5)

### Compute NIP measure using Named Entity Recognition from spaCy

In [None]:
# One-time setup: download the trained pipeline with
#     python -m spacy download en_core_web_lg
#
# `spacy.load` accepts either the installed package name or a filesystem path
# to a pipeline directory. The package name is used by default so the notebook
# runs on a fresh kernel; replace it with a local path if the pipeline is
# stored elsewhere.
import spacy

path_to_trained_spacy_pipeline = 'en_core_web_lg'
nlp = spacy.load(path_to_trained_spacy_pipeline)

In [None]:
def get_nip_len(x):
    """Count the numerical named entities found in ``x``.

    ``x`` is run through the module-level ``nlp`` pipeline, and every entity
    labelled PERCENT, QUANTITY, ORDINAL or CARDINAL adds one to the total.
    """

    # entity labels treated as numerical information
    numeric_labels = ('PERCENT', 'QUANTITY', 'ORDINAL', 'CARDINAL')

    total = 0
    for entity in nlp(x).ents:
        if entity.label_ in numeric_labels:
            total += 1
    return total


In [None]:
# compute nip: numerical-entity count per transcript, then divided by duration
nip_counts = ted['transcript'].apply(get_nip_len)
ted['nip_comps_cnt'] = nip_counts
ted['nip'] = ted['nip_comps_cnt'].div(ted['duration']).round(2)

In [None]:
# save the data with the newly computed measures
# `spm` is appended at the end so the fluency measure computed above is also
# written out; the existing columns keep their original order.
measure_columns = [
    'talk_name', 'event', 'event_cat', 'views', 'tags', 'p_year', 'transcript', 'mws',
    'fkl', 'mtld', 'cnt_laughs', 'cnt_i', 'cnt_we', 'cnt_you', 'lf', 'pm_i', 'pm_we', 'pm_you',
    'nip_comps_cnt', 'nip', 'spm',
]
ted[measure_columns].to_csv('../data/processed/ted_measures.csv', index=False)