# Preparing data

Loading libraries 

In [45]:
import pandas as pd
import numpy as np
import re
import string
import datetime
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer 
# Sentiment, lemmatization
from textblob import TextBlob
from textblob import Word
from operator import itemgetter
import warnings 
warnings.filterwarnings("ignore")

## Loading data
The debate transcripts were obtained from the following pages:  
https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020  
https://www.rev.com/blog/transcripts/donald-trump-joe-biden-final-presidential-debate-transcript-2020

In [4]:
# Load the two previously saved debate transcripts, one DataFrame per debate.
# NOTE(review): unpickling can execute arbitrary code, so these files should
# only ever be ones produced by this project itself -- never untrusted input.
debate_1 = pd.read_pickle('data/Scrap_Trump_Biden_debate_I.pkl')
debate_2 = pd.read_pickle('data/Scrap_Trump_Biden_debate_II.pkl')

In [6]:
debate_1.head(3)

Unnamed: 0,name,time,statement,debate,part
0,Chris Wallace,01:20,Good evening from the Health Education Campus ...,1,1
1,Chris Wallace,02:10,This debate is being conducted under health an...,1,1
2,Vice President Joe Biden,02:49,"How you doing, man?",1,1


In [7]:
debate_2.head(3)

Unnamed: 0,name,time,statement,debate,part
0,Kristen Welker,00:18,"Good evening, everyone. Good evening. Thank yo...",2,1
1,Donald Trump,07:37,How are you doing? How are you?,2,1
2,Kristen Welker,07:58,And I do want to say a very good evening to bo...,2,1


In [8]:
# The first transcript labels the candidates with their formal titles; map
# those onto the plain names. Any other speaker name is left untouched.
_speaker_aliases = {
    'Vice President Joe Biden': 'Joe Biden',
    'President Donald J. Trump': 'Donald Trump',
}
debate_1['name'] = debate_1['name'].apply(
    lambda speaker: _speaker_aliases.get(speaker, speaker)
)

### Calculation of speaking time

In [10]:
def time_to_sec(time):
    """Convert a transcript timestamp into a number of seconds.

    Parameters
    ----------
    time : str
        Timestamp written as ``'SS'``, ``'MM:SS'`` or ``'HH:MM:SS'``.

    Returns
    -------
    int
        Total number of seconds represented by the timestamp.

    Raises
    ------
    ValueError
        If the timestamp has more than three ``:``-separated fields, or if a
        field cannot be parsed as an integer.
    """
    fields = time.split(':')
    if len(fields) > 3:
        raise ValueError('unsupported timestamp format: %r' % (time,))

    total = 0
    # Every field is worth 60 times the field to its right, so folding from
    # left to right handles one, two or three fields with the same code.
    for field in fields:
        total = total * 60 + int(field)
    return total

In [11]:
def _fill_speaking_time(frame, first, stop):
    """Store, for each row label in ``range(first, stop)``, the gap to the next row.

    The gap is the difference in seconds between the timestamp of the next
    row and the timestamp of the current row; a gap of zero is stored as 1.
    The result is written to the ``'seconds'`` column of ``frame`` in place.
    """
    for row in range(first, stop):
        gap = time_to_sec(frame.time[row + 1]) - time_to_sec(frame.time[row])
        frame.loc[row, 'seconds'] = 1 if gap == 0 else gap


# First debate: rows 1-177 and 179-788 are derived from consecutive timestamps.
_fill_speaking_time(debate_1, 1, 178)
_fill_speaking_time(debate_1, 179, 789)

# Rows not covered by the ranges above receive hand-picked values.
debate_1.loc[0, 'seconds'] = 80
debate_1.loc[178, 'seconds'] = 1
debate_1.loc[789, 'seconds'] = 60

# Second debate: rows 1-335 and 337-510 are derived from consecutive timestamps.
_fill_speaking_time(debate_2, 1, 336)
_fill_speaking_time(debate_2, 337, 511)

debate_2.loc[0, 'seconds'] = 18
# Row 88 was already filled by the first range; this overwrites that value.
debate_2.loc[88, 'seconds'] = 1
debate_2.loc[336, 'seconds'] = 1
debate_2.loc[511, 'seconds'] = 1

In [12]:
debate_1.head(3)

Unnamed: 0,name,time,statement,debate,part,seconds
0,Chris Wallace,01:20,Good evening from the Health Education Campus ...,1,1,80.0
1,Chris Wallace,02:10,This debate is being conducted under health an...,1,1,39.0
2,Joe Biden,02:49,"How you doing, man?",1,1,2.0


## Helper functions for cleaning data

In [13]:
def removing(txt):
    """Replace every ``[...]`` span in ``txt`` with a single space.

    The match is non-greedy, so each bracketed span is replaced on its own.
    """
    bracketed = re.compile(r'\[.*?\]')
    return bracketed.sub(' ', txt)

In [14]:
def all_sentences(txt):
    """Return the number of sentences TextBlob finds in ``txt``."""
    blob = TextBlob(txt)
    return len(blob.sentences)

In [15]:
def all_words(txt):
    """Return the number of words TextBlob finds in ``txt``."""
    blob = TextBlob(txt)
    return len(blob.words)

In [16]:
def color(txt):
    """Map a speaker name to a colour name.

    ``'Joe Biden'`` gives ``'blue'``, ``'Donald Trump'`` gives ``'red'`` and
    every other value gives ``'grey'``.
    """
    if txt == 'Joe Biden':
        return 'blue'
    if txt == 'Donald Trump':
        return 'red'
    return 'grey'

In [17]:
def clean_text_round1(text):
    """First cleaning pass over a piece of text.

    Lower-cases the text, removes ``[...]`` spans, removes every character in
    ``string.punctuation`` and removes any word that contains a digit.
    Surrounding whitespace is left as it is, so removed spans can leave runs
    of spaces behind.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # Raw strings: '\[' and '\w' are not valid escapes in a normal literal and
    # trigger an "invalid escape sequence" warning on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round1(x):
    """Apply :func:`clean_text_round1` to ``x`` (convenience for ``Series.apply``)."""
    return clean_text_round1(x)

In [18]:
def clean_text_round2(text):
    """Second cleaning pass: delete curly quotes, ellipsis characters and newlines."""
    without_quotes = re.sub('[‘’“”…]', '', text)
    return re.sub('\n', '', without_quotes)


def round2(x):
    """Apply :func:`clean_text_round2` to ``x`` (convenience for ``Series.apply``)."""
    return clean_text_round2(x)

In [19]:
stop_words = stopwords.words('english')
# Shared stemmer instance; only consulted when lemat() is called with stem=True.
porter_stemmer = PorterStemmer()


def lemat(sentence, stem=False, lemmatize=True):
    """Normalise a text into a space-separated string of kept tokens.

    The text is lower-cased, every non-letter character is replaced with a
    space and the result is tokenised with ``word_tokenize``. Tokens that are
    in ``stop_words`` or are at most two characters long are discarded. Each
    remaining token is stemmed, lemmatised or kept unchanged, depending on
    the flags.

    Parameters
    ----------
    sentence : str
        Text to process.
    stem : bool, default False
        If true, apply the Porter stemmer to each kept token. Takes
        precedence over ``lemmatize``.
    lemmatize : bool, default True
        If true (and ``stem`` is false), lemmatise each kept token with
        ``textblob.Word.lemmatize``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", sentence.lower())

    clear_post = []
    for word in word_tokenize(letters_only):
        if word in stop_words or len(word) <= 2:
            continue
        if stem:
            # Previously referenced an undefined name and raised NameError.
            clear_post.append(porter_stemmer.stem(word))
        elif lemmatize:
            clear_post.append(Word(word).lemmatize())
        else:
            clear_post.append(word)

    return ' '.join(clear_post)


def lemat1(x):
    """Apply :func:`lemat` with its default flags (convenience for ``Series.apply``)."""
    return lemat(x)

### Cleaning data

In [20]:
# Replace bracketed spans in every statement of both debates (see removing()).
for _frame in (debate_1, debate_2):
    _frame['statement'] = _frame['statement'].apply(removing)

In [21]:
# Number of words and sentences in each statement, counted before any
# further cleaning is applied.
for _frame in (debate_1, debate_2):
    _frame['all_words'] = _frame['statement'].apply(all_words)
    _frame['all_sentences'] = _frame['statement'].apply(all_sentences)

In [None]:
# Colour associated with each speaker (see color()).
for _frame in (debate_1, debate_2):
    _frame['color'] = _frame['name'].apply(color)

In [None]:
debate_1.head(3)

Unnamed: 0,name,time,statement,debate,part,seconds,all_words,all_sentences,color
0,Chris Wallace,01:20,Good evening from the Health Education Campus ...,1,1,80.0,126,7,grey
1,Chris Wallace,02:10,This debate is being conducted under health an...,1,1,39.0,104,5,grey
2,Joe Biden,02:49,"How you doing, man?",1,1,2.0,4,1,blue


### Complete transcription preparation

In [24]:
# One frame per candidate and debate. reset_index() renumbers the rows from 0
# and keeps the previous row labels in an 'index' column.
biden1 = debate_1.loc[debate_1['name'] == 'Joe Biden'].reset_index()
trump1 = debate_1.loc[debate_1['name'] == 'Donald Trump'].reset_index()
biden2 = debate_2.loc[debate_2['name'] == 'Joe Biden'].reset_index()
trump2 = debate_2.loc[debate_2['name'] == 'Donald Trump'].reset_index()

In [25]:
# All of Biden's statements from debate 1 in row order, each preceded by a space.
Btxt1 = ''.join(' ' + statement for statement in biden1['statement'])

In [26]:
# All of Biden's statements from debate 2 in row order, each preceded by a space.
Btxt2 = ''.join(' ' + statement for statement in biden2['statement'])

In [27]:
# All of Trump's statements from debate 1 in row order, each preceded by a space.
Ttxt1 = ''.join(' ' + statement for statement in trump1['statement'])

In [28]:
# All of Trump's statements from debate 2 in row order, each preceded by a space.
Ttxt2 = ''.join(' ' + statement for statement in trump2['statement'])

In [29]:
# One full transcript per candidate and debate, plus both debates combined.
# Each value is a one-element list so the dict converts cleanly to a DataFrame.
diction = {
    'Biden_I': [Btxt1],
    'Trump_I': [Ttxt1],
    'Biden_II': [Btxt2],
    'Trump_II': [Ttxt2],
    'Biden': [Btxt1 + ' ' + Btxt2],
    'Trump': [Ttxt1 + ' ' + Ttxt2],
}

In [30]:
# One row per key of `diction`, with the text in a single 'transcript' column.
data_df = pd.DataFrame.from_dict(diction, orient='index', columns=['transcript'])
data_df

Unnamed: 0,transcript
Biden_I,"How you doing, man? I’m well. Well, first of ..."
Trump_I,"How are you doing? Thank you very much, Chris..."
Biden_II,"220,000 Americans dead. You hear nothing else..."
Trump_II,How are you doing? How are you? So as you kno...
Biden,"How you doing, man? I’m well. Well, first of ..."
Trump,"How are you doing? Thank you very much, Chris..."


In [31]:
# Lemmatised, stop-word-free version of every full transcript.
data_df['lemat'] = data_df['transcript'].apply(lemat1)

In [32]:
data_df

Unnamed: 0,transcript,lemat
Biden_I,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...
Trump_I,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...
Biden_II,"220,000 Americans dead. You hear nothing else...",american dead hear nothing else say tonight he...
Trump_II,How are you doing? How are you? So as you kno...,know million people modeled expected die close...
Biden,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...
Trump,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...


### Preparation of transcription with division into partial statements

In [33]:
# Stack both debates; reset_index() renumbers the rows from 0 and keeps the
# per-debate row labels in an 'index' column.
all_part = pd.concat((debate_1, debate_2), axis=0).reset_index(drop=False)

In [34]:
# Remove the helper columns that are no longer needed. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-dropped columns.
all_part = all_part.drop(columns=['index', 'time'], errors='ignore')

In [35]:
all_part

Unnamed: 0,name,statement,debate,part,seconds,all_words,all_sentences,color
0,Chris Wallace,Good evening from the Health Education Campus ...,1,1,80.0,126,7,grey
1,Chris Wallace,This debate is being conducted under health an...,1,1,39.0,104,5,grey
2,Joe Biden,"How you doing, man?",1,1,2.0,4,1,blue
3,Donald Trump,How are you doing?,1,1,1.0,4,1,red
4,Joe Biden,I’m well.,1,1,20.0,4,1,blue
...,...,...,...,...,...,...,...,...
1297,Kristen Welker,"All right. Vice President Biden, same question...",2,2,8.0,25,2,grey
1298,Joe Biden,"I will say, I’m an American President. I repre...",2,2,22.0,87,6,blue
1299,Joe Biden,"We can grow this economy, we can deal with the...",2,2,34.0,115,8,blue
1300,Kristen Welker,"All right, I want to thank you both for a very...",2,2,23.0,66,7,grey


### Partial transcription

In [36]:
# Build the per-statement frame with the first cleaning pass applied to the
# text. Each Series in the list becomes a row, so the result is transposed to
# turn them back into columns.
clean_part = pd.DataFrame(
    [
        all_part['name'],
        all_part['statement'].apply(round1),
        all_part['debate'],
        all_part['seconds'],
        all_part['color'],
    ]
).transpose()
clean_part

Unnamed: 0,name,statement,debate,seconds,color
0,Chris Wallace,good evening from the health education campus ...,1,80.0,grey
1,Chris Wallace,this debate is being conducted under health an...,1,39.0,grey
2,Joe Biden,how you doing man,1,2.0,blue
3,Donald Trump,how are you doing,1,1.0,red
4,Joe Biden,i’m well,1,20.0,blue
...,...,...,...,...,...
1297,Kristen Welker,all right vice president biden same question t...,2,8.0,grey
1298,Joe Biden,i will say i’m an american president i represe...,2,22.0,blue
1299,Joe Biden,we can grow this economy we can deal with the ...,2,34.0,blue
1300,Kristen Welker,all right i want to thank you both for a very ...,2,23.0,grey


In [37]:
# Same layout as before, now with the second cleaning pass applied to the
# text. The colour column is taken from all_part, the other columns from the
# previous clean_part.
clean_part = pd.DataFrame(
    [
        clean_part['name'],
        clean_part['statement'].apply(round2),
        clean_part['debate'],
        clean_part['seconds'],
        all_part['color'],
    ]
).transpose()
clean_part

Unnamed: 0,name,statement,debate,seconds,color
0,Chris Wallace,good evening from the health education campus ...,1,80.0,grey
1,Chris Wallace,this debate is being conducted under health an...,1,39.0,grey
2,Joe Biden,how you doing man,1,2.0,blue
3,Donald Trump,how are you doing,1,1.0,red
4,Joe Biden,im well,1,20.0,blue
...,...,...,...,...,...
1297,Kristen Welker,all right vice president biden same question t...,2,8.0,grey
1298,Joe Biden,i will say im an american president i represen...,2,22.0,blue
1299,Joe Biden,we can grow this economy we can deal with the ...,2,34.0,blue
1300,Kristen Welker,all right i want to thank you both for a very ...,2,23.0,grey


### Total transcription

In [38]:
# First cleaning pass over each full transcript, kept as a one-column frame.
data_clean = data_df['transcript'].apply(round1).to_frame()
data_clean

Unnamed: 0,transcript
Biden_I,how you doing man i’m well well first of all ...
Trump_I,how are you doing thank you very much chris i...
Biden_II,americans dead you hear nothing else i say t...
Trump_II,how are you doing how are you so as you know ...
Biden,how you doing man i’m well well first of all ...
Trump,how are you doing thank you very much chris i...


In [39]:
# Second cleaning pass over the transcripts, placed next to the lemmatised
# text from data_df. Each Series becomes a row, hence the transpose.
data_clean = pd.DataFrame(
    [
        data_clean['transcript'].apply(round2),
        data_df['lemat'],
    ]
).transpose()
data_clean

Unnamed: 0,transcript,lemat
Biden_I,how you doing man im well well first of all t...,man well well first thank looking forward pres...
Trump_I,how are you doing thank you very much chris i...,thank much chris tell simply election election...
Biden_II,americans dead you hear nothing else i say t...,american dead hear nothing else say tonight he...
Trump_II,how are you doing how are you so as you know ...,know million people modeled expected die close...
Biden,how you doing man im well well first of all t...,man well well first thank looking forward pres...
Trump,how are you doing thank you very much chris i...,thank much chris tell simply election election...


## Organization of data

### Corpus

In [40]:
data_df

Unnamed: 0,transcript,lemat
Biden_I,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...
Trump_I,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...
Biden_II,"220,000 Americans dead. You hear nothing else...",american dead hear nothing else say tonight he...
Trump_II,How are you doing? How are you? So as you kno...,know million people modeled expected die close...
Biden,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...
Trump,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...


In [41]:
# The six rows of data_df alternate Biden / Trump, so the name and colour
# labels are the same pair repeated three times.
full_names = ['Joe Biden', 'Donald Trump'] * 3
colors = ['blue', 'red'] * 3
data_df['full_name'] = full_names
data_df['colors'] = colors
data_df

Unnamed: 0,transcript,lemat,full_name,colors
Biden_I,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...,Joe Biden,blue
Trump_I,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...,Donald Trump,red
Biden_II,"220,000 Americans dead. You hear nothing else...",american dead hear nothing else say tonight he...,Joe Biden,blue
Trump_II,How are you doing? How are you? So as you kno...,know million people modeled expected die close...,Donald Trump,red
Biden,"How you doing, man? I’m well. Well, first of ...",man well well first thank looking forward pres...,Joe Biden,blue
Trump,"How are you doing? Thank you very much, Chris...",thank much chris tell simply election election...,Donald Trump,red


### Document-Term Matrix

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
# Document-term matrix of single words, one row per transcript.
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.lemat)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back only on releases that predate it.
_cv_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=_cv_terms)
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abide,ability,able,abraham,absolutely,absorbed,abuse,academic,accept,accepted,...,yapping,yeah,year,yes,yet,york,young,younger,zero,zone
Biden_I,0,2,17,0,3,1,0,0,2,3,...,1,5,9,5,0,0,1,0,1,0
Trump_I,0,0,1,0,3,0,0,1,1,0,...,0,2,26,5,0,4,2,1,0,0
Biden_II,1,1,16,2,1,0,0,0,0,0,...,0,0,15,2,1,2,1,0,3,2
Trump_II,0,1,4,6,1,0,2,0,0,0,...,0,1,36,2,2,7,4,0,0,4
Biden,1,3,33,2,4,1,0,0,2,3,...,1,5,24,7,1,2,2,0,4,2
Trump,0,1,5,6,4,0,2,1,1,0,...,0,3,62,7,2,11,6,1,0,4


In [47]:
# Document-term matrix of two-word sequences, one row per transcript.
cv2 = CountVectorizer(stop_words=stop_words, ngram_range=(2,2))
data_cv2 = cv2.fit_transform(data_clean.lemat)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back only on releases that predate it.
_cv2_terms = (
    cv2.get_feature_names_out()
    if hasattr(cv2, 'get_feature_names_out')
    else cv2.get_feature_names()
)
data_dtm2 = pd.DataFrame(data_cv2.toarray(), columns=_cv2_terms)
data_dtm2.index = data_clean.index
data_dtm2

Unnamed: 0,abide agreed,ability lock,ability money,ability take,ability wealth,able afford,able breathe,able bring,able call,able charge,...,younger people,zero emission,zero ether,zero term,zone good,zone got,zone one,zone south,zone tim,zone took
Biden_I,0,0,1,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
Trump_I,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Biden_II,1,0,0,0,1,0,1,0,1,0,...,0,2,1,0,1,0,0,1,0,0
Trump_II,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,1
Biden,1,0,1,1,1,0,1,1,1,1,...,0,2,1,1,1,0,0,1,0,0
Trump,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,1,0,1,1


In [48]:
# Document-term matrix of three-word sequences, one row per transcript.
cv3 = CountVectorizer(stop_words=stop_words,ngram_range=(3,3))
data_cv3 = cv3.fit_transform(data_clean.lemat)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back only on releases that predate it.
_cv3_terms = (
    cv3.get_feature_names_out()
    if hasattr(cv3, 'get_feature_names_out')
    else cv3.get_feature_names()
)
data_dtm3 = pd.DataFrame(data_cv3.toarray(), columns=_cv3_terms)
data_dtm3.index = data_clean.index
data_dtm3

Unnamed: 0,abide agreed say,ability lock know,ability money able,ability take million,ability wealth well,able afford car,able breathe know,able bring cost,able call ppp,able charge woman,...,zero emission first,zero emission fracking,zero ether talking,zero term energy,zone good relationship,zone got criminal,zone one successful,zone south china,zone tim scott,zone took care
Biden_I,0,0,1,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
Trump_I,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Biden_II,1,0,0,0,1,0,1,0,1,0,...,1,1,1,0,1,0,0,1,0,0
Trump_II,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,1
Biden,1,0,1,1,1,0,1,1,1,1,...,1,1,1,1,1,0,0,1,0,0
Trump,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,1,1


## Export data

In [49]:
# all_part.to_pickle("data/all_part_for_sentiment.pkl")

In [50]:
# clean_part.to_pickle("data/Debate_part_clean.pkl")

In [51]:
# data for sentiment
# data_df.to_pickle("data/all_for_sentiment.pkl")

In [52]:
# data_dtm.to_pickle("data/Debate_DTM_Matrix_1.pkl")

In [53]:
# data_dtm2.to_pickle("data/Debate_DTM_Matrix_2gram.pkl")

In [54]:
# data_dtm3.to_pickle("data/Debate_DTM_Matrix_3gram.pkl")

In [55]:
# data_clean.to_pickle('data/Debate_data_clean.pkl')
# pickle.dump(cv, open("data/Stop_Words_sklearn_cv.pkl", "wb"))