In [None]:
# Paragraph & token delimiters (regular expressions)
para_pat = r'\n+'          # one or more newlines separate paragraphs
token_pat = r'([\W_]+)'    # runs of non-word characters and/or underscores
db_file = 'ted-talks.db'   # output SQLite filename

# OHCO properties: the index levels of the corpus, ordered coarsest to finest
# (OHCO presumably stands for "Ordered Hierarchy of Content Objects").
OHCO = ['speaker', 'event', 'id', 'para_num', 'sent_num', 'token_num']

# Each name holds the first 1, 2, 3, 4 and 5 OHCO levels respectively.
AUTHS, BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, 6))

# Libraries

In [None]:
import sqlite3
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK resources used by this notebook.
for nltk_resource in (
    'punkt',                       # sentence tokenizer model (nltk.sent_tokenize)
    'averaged_perceptron_tagger',  # POS tagger model (nltk.pos_tag)
    'stopwords',                   # nltk.corpus.stopwords
    'tagsets',
    'wordnet',
):
    nltk.download(nltk_resource)

<h1><a href="http://www.theodorespeaks.com">Theodore Speaks Dataset</a></h1>

In [None]:
# Load the Theodore Speaks export (transcripts, LIWC/MFT scores and view counts).
ot_df = pd.read_csv("ted-talks/TED_Talks_by_ID_plus-transcripts-and-LIWC-and-MFT-plus-views.csv")

# Lower-case every column header.
ot_df = ot_df.rename(columns=str.lower)

# Parse the publication date into a datetime column.
ot_df["date_published"] = pd.to_datetime(ot_df["date_published"])

# pop + assign renames the view-count column and moves it to the last position.
ot_df["views"] = ot_df.pop("views_as_of_06162017")

# Interpret duration as a number of seconds and keep the timedelta's seconds component.
ot_df["duration"] = pd.to_timedelta(ot_df["duration"], unit='s').dt.seconds

# Self-Scraped Dataset

In [None]:
# Load the self-scraped talk table (indexed by talk id) and its LIWC export.
scrape = pd.read_csv("ted-talks/ted-talks-scraped.csv").set_index("id")
scrape_liwc = pd.read_csv("ted-talks/ted-talks-scraped-LIWC.csv")

# pop + assign renames "Source (F)" to "id" AND moves it to the last column,
# which is what lets it survive the "WC"-onward slice below.
scrape_liwc["id"] = scrape_liwc.pop("Source (F)")

print(scrape.columns.tolist())

# Keep only the columns from "WC" onward (the trailing "id" column included).
liwc_col = scrape_liwc.columns.tolist()
liwc_col = liwc_col[liwc_col.index("WC"):]
scrape_liwc = scrape_liwc.loc[:, liwc_col]

# Attach the LIWC columns to the scraped rows by talk id.
scrape_df = scrape.join(scrape_liwc.set_index('id')).reset_index()

# Rename main_speaker -> speaker (moved to the end of the frame by pop + assign).
scrape_df["speaker"] = scrape_df.pop("main_speaker")

# published_date is read as epoch seconds; truncate it to the calendar date
# and store it back as a datetime column.
scrape_df["date_published"] = pd.to_datetime(
    pd.to_datetime(scrape_df.pop("published_date"), unit='s').dt.date
)

# Merge Datasets

In [None]:
# Outer-merge the two datasets on every column name they share, then keep a
# single row per talk id (the first one after sorting by id).
ted_talks = pd.merge(scrape_df, ot_df, how="outer", on=list(ot_df.columns.intersection(scrape_df.columns)))
ted_talks = ted_talks.sort_values('id').drop_duplicates(subset='id', keep='first')

# Collect every parenthesised annotation found in a transcript into a list of
# strings. Talks whose transcript has no such annotation get NaN here.
ted_talks["events"] = (
    ted_talks["transcript"]
    .str.extractall(r'(\([^)]*\))')
    .unstack()
    .apply(lambda x: ','.join(x.dropna()), axis=1)
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # literally by default, which would leave the parentheses in place.
    .str.replace(r"\(|\)", "", regex=True)
    .str.split(",")
)

# Strip the parenthesised annotations and mm:ss-style timestamps from the text.
ted_talks["transcript"] = ted_talks["transcript"].str.replace(r'\([^)]*\)|([01][0-9]):[0-5][0-9]|([0-9]):[0-5][0-9]', '', regex=True)

In [None]:
# Preview the merged rows, restricted to the columns present in scrape_df.
ted_talks.loc[:, scrape_df.columns.values].head()

In [None]:
# Write the scrape_df columns of the merged talks to disk as a JSON array of
# row records.
with open('interactive-visual/ted-talks.json', 'w') as json_out:
    export_cols = scrape_df.columns.values
    records_json = ted_talks[export_cols].to_json(orient='records')
    json_out.write(records_json)

<h1>Create an F4-compliant version of the corpus</h1>

In [None]:
# Document table: one transcript per (speaker, event, id).
T = ted_talks[["speaker", "event", "id", "transcript"]]

# Index by the document-level OHCO keys. A KeyError here means CHAPS no longer
# matches the selected columns; let it surface instead of silently leaving T
# un-indexed, which would only fail later when the index levels are renamed.
T = T.set_index(CHAPS).sort_index()

# Drop talks without usable text. Missing transcripts (NaN) are treated like
# empty ones; a bare `!= ""` comparison would let NaN rows through.
T = T[T["transcript"].fillna("").str.strip() != ""]

In [None]:
# Split every transcript into paragraphs; stacking the expanded columns adds a
# paragraph-number level beneath the document index.
paras = (
    T.transcript
    .str.split(para_pat, expand=True)
    .stack()
    .to_frame()
    .rename(columns={0: 'para_str'})
)
paras.index.names = PARAS

# Normalise whitespace inside each paragraph. regex=True is required: since
# pandas 2.0 str.replace treats the pattern as a literal string by default,
# which would turn both replacements into no-ops.
paras.para_str = paras.para_str.str.strip()
paras.para_str = paras.para_str.str.replace(r'\n', ' ', regex=True)
paras.para_str = paras.para_str.str.replace(r'\s+', ' ', regex=True)

# Discard paragraphs that are empty or whitespace-only.
paras = paras[~paras.para_str.str.match(r'^\s*$')]

In [None]:
# Break each paragraph into sentences with NLTK; stacking the per-paragraph
# results adds a sentence-number level beneath the paragraph index.
sents = (
    paras['para_str']
    .apply(lambda para: pd.Series(nltk.sent_tokenize(para)))
    .stack()
    .to_frame()
    .rename(columns={0: 'sent_str'})
)
sents.index.names = SENTS

# The paragraph table is no longer needed; release it.
del paras

In [None]:
# Split each sentence on whitespace and POS-tag the pieces; every (token, tag)
# tuple becomes one row under a new innermost index level.
# The pattern is a raw string: '\s' in a normal string is an invalid escape.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokens = (
    sents.sent_str
    .apply(lambda sent: pd.Series(nltk.pos_tag(tokenizer.tokenize(sent))))
    .stack()
    .to_frame()
    .rename(columns={0: 'pos_tuple'})
)
tokens.index.names = OHCO

# Unpack the (token, tag) tuples into their own columns.
tokens['pos'] = tokens.pos_tuple.apply(lambda pair: pair[1])
tokens['token_str'] = tokens.pos_tuple.apply(lambda pair: pair[0])
# Use the columns= keyword: the positional axis argument was removed in pandas 2.0.
tokens = tokens.drop(columns='pos_tuple')

# punc = 1 when the token has no word characters; num = 1 when it contains a digit.
tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')

# term_str is the lower-cased token with token_pat matches removed; it stays
# NaN for punctuation-only and digit-bearing tokens. regex=True is required
# because pandas >= 2.0 treats the pattern literally by default.
tokens.loc[(tokens.punc == 0) & (tokens.num == 0), 'term_str'] = (
    tokens.token_str.str.lower().str.replace(token_pat, '', regex=True)
)

# The sentence table is no longer needed; release it.
del sents

In [None]:
# Vocabulary table: one row per distinct term with its corpus frequency n.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result does not depend on how value_counts labels its output (that labelling
# changed in pandas 2.0, which broke the previous rename-by-old-name approach).
vocab = (
    tokens.loc[tokens.punc == 0, 'term_str']
    .value_counts()
    .rename_axis('term_str')
    .reset_index(name='n')
)

# Order terms alphabetically so the positional index can serve as term_id.
vocab = vocab.sort_values('term_str').reset_index(drop=True)

# p = relative frequency of the term among all counted terms.
vocab['p'] = vocab.n / vocab.n.sum()
vocab.index.name = 'term_id'

In [None]:
# Add the Porter stem of every vocabulary term.
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab['term_str'].apply(stemmer.stem)

In [None]:
# Lookup frame indexed by NLTK's English stop words, with a constant x = 1.
sw = pd.DataFrame(
    {'x': 1},
    index=set(nltk.corpus.stopwords.words('english')),
)

# stop = 1 when the term is a stop word, otherwise 0.
vocab['stop'] = (
    vocab['term_str']
    .map(sw['x'])
    .fillna(0)
    .astype('int')
)

In [None]:
# Attach each token's vocabulary id; tokens without a term in vocab get -1.
tokens['term_id'] = (
    tokens['term_str']
    .map(vocab.reset_index().set_index('term_str')['term_id'])
    .fillna(-1)
    .astype('int')
)

In [None]:
# A cell only renders its last bare expression, so a lone `tokens.head()` on
# the first line would be discarded. Show both previews explicitly
# (`display` is provided by the IPython/Jupyter kernel).
display(tokens.head(), vocab.head())

In [None]:
# Persist the document, token and vocabulary tables to SQLite, replacing any
# existing tables of the same name.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection, so close it explicitly.
db = sqlite3.connect(db_file)
try:
    with db:
        T.to_sql('doc', db, if_exists='replace', index=True)
        tokens.to_sql('token', db, if_exists='replace', index=True)
        vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()