# Apply the doc2vec algorithm from Le and Mikolov (2014)

### Set-up

In [26]:
import glob
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import string  

### preprocessing

In [27]:
# Porter stemmer instance reused by the preprocessing cells below.
ps = PorterStemmer()
# NLTK's English stop-word list, held as a set so it can be extended in place.
STOPWORDS = set(stopwords.words('english'))
# Characters treated as punctuation: ASCII punctuation plus curly quotes and
# the ellipsis character.
# The backslashes are written as "\\" so the literal contains no invalid
# escape sequence (the former "\(" triggers a DeprecationWarning/SyntaxWarning
# on current Python). The runtime value is unchanged: a single backslash
# precedes "(" and another precedes "]".
mypunct = """!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~“”’…'"""

### stopwords

In [28]:
# Text that has had its punctuation stripped contains "im" / "youd" rather
# than "i'm" / "you'd", so register an apostrophe-free twin of every
# contracted stop word.
stop_list = [stop.replace("'", "") for stop in STOPWORDS if "'" in stop]
# Merge the apostrophe-free variants into the shared stop-word set.
STOPWORDS.update(stop_list)

### Import 62 text files (60 ads and 2 aggregated public speech data)
### 60 ad creatives were downloaded from Kantar Media Strategy
### 2 aggregated public speech data (during the primaries) were downloaded from the American Presidency Project

In [32]:
### locations where text files are saved
folder = 'ALL_ADS_AND_SPEECHES_FINAL'
file_loc = 'C:\\Users\\donggwan.kim\\Desktop\\Video_Transcribing_Final_corrected\\' + folder + '\\*.txt'
# glob returns matches in arbitrary, OS-dependent order; sorting pins the
# document order so name_list / string_list line up identically on every run.
file_paths = sorted(glob.glob(file_loc))
print(len(file_paths))


def clean_transcript(raw_text, punct_chars, stemmer, stop_words=()):
    """Turn one raw transcript into a space-separated string of stemmed tokens.

    Processing order:
      1. em dashes and hyphens become spaces, so hyphenated words split;
      2. the text is lower-cased;
      3. runs of digits are removed;
      4. every character listed in ``punct_chars`` is removed;
      5. the text is split on any whitespace (spaces, tabs, newlines, ...);
      6. tokens shorter than two characters, tokens made only of digit
         characters, and tokens found in ``stop_words`` are dropped;
      7. each remaining token is passed through ``stemmer.stem``.

    Parameters
    ----------
    raw_text : str
        Full text of one document.
    punct_chars : str
        Characters to delete. Each character is taken literally.
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    stop_words : container of str, optional
        Tokens to discard before stemming. Empty by default, i.e. no
        stop-word filtering.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" for empty input).
    """
    text = raw_text.replace("—", " ").replace("-", " ").lower()
    text = re.sub(r'\d+', '', text)
    # Delete punctuation character by character; unlike splicing punct_chars
    # into a regex character class, this cannot be broken by "]", "^" or "\".
    text = text.translate(str.maketrans('', '', punct_chars))
    tokens = [
        word for word in text.split()
        if len(word) >= 2 and not word.isdigit() and word not in stop_words
    ]
    return ' '.join(stemmer.stem(word) for word in tokens)


name_list = []
string_list = []
for file in file_paths:
    # The document id is the first 8 characters of the file's base name.
    # Splitting on both separators keeps this working for Windows and POSIX
    # style paths alike.
    file_name = re.split(r'[\\/]', file)[-1][0:8]
    name_list.append(file_name)
    # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark,
    # which would otherwise stay glued to the first token.
    with open(file, encoding="utf-8-sig") as f:
        raw_text = f.read()
    # NOTE(review): the STOPWORDS set built in the earlier cells is not applied
    # here, which matches the existing behaviour. Pass stop_words=STOPWORDS to
    # drop stop words - confirm which is intended.
    string_list.append(clean_transcript(raw_text, mypunct, ps))

62


### tagging

In [33]:
def _as_tagged_document(doc_name, doc_text):
    """Wrap one cleaned document as a TaggedDocument tagged with its name."""
    tokens = word_tokenize(doc_text.lower())
    return TaggedDocument(words=tokens, tags=[str(doc_name)])


# One TaggedDocument per file, pairing each name in name_list with the
# cleaned text at the same position in string_list.
tagged_data = [
    _as_tagged_document(doc_name, doc_text)
    for doc_name, doc_text in zip(name_list, string_list)
]

### Training the model

In [35]:
# Dimensionality of each document vector.
vec_size = 200
# NOTE(review): `alpha` is defined but never passed to Doc2Vec below, so the
# model trains with the library's default learning rate. Left untouched to
# avoid silently changing results - confirm which value was intended.
alpha = 0.0025
# Fixed seed so the learned vectors can be regenerated.
seed = 42
model = Doc2Vec(vector_size = vec_size,
                min_count = 3,   # ignore tokens seen fewer than 3 times
                window = 5,      # context window size
                negative = 5,    # number of negative samples
                sample = 1e-2,   # down-sampling threshold for frequent tokens
                epochs = 300,    # passes over the corpus
                seed = seed,
                # A single worker removes thread-scheduling jitter; gensim
                # requires this (together with a fixed PYTHONHASHSEED) for a
                # fully deterministic run.
                workers = 1)
# Build the vocabulary from the tagged documents, then train on them.
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

### create a table

In [36]:
# Column labels "0" .. str(vec_size - 1), one per vector dimension.
col_names = [str(num) for num in range(vec_size)]
# Numeric form of each document name; int() raises ValueError here if a name
# is not purely numeric.
name_list_2 = [int(name) for name in name_list]
# Stack the learned vectors into a single (n_documents, vec_size) matrix and
# build the frame in one step. This gives numeric columns instead of the
# object-dtype columns produced by filling an empty frame row by row.
# The reshape also covers the empty case (0 rows) and fails loudly if a
# vector does not have vec_size entries.
doc_matrix = np.array([model.docvecs[name] for name in name_list]).reshape(len(name_list), vec_size)
# Rows keep the order of name_list and are indexed 0 .. n_documents - 1.
df_vector = pd.DataFrame(doc_matrix, columns=col_names)
df_vector['FILE_NUM_RA_CODING'] = name_list_2

### save the table

In [37]:
# Destination file for the exported document-vector table.
csv_name = 'C:\\Users\\donggwan.kim\\Desktop\\doc2vec_w_200.csv'
# Write the table, row index included, as comma-separated text.
df_vector.to_csv(path_or_buf=csv_name)