# Apply the doc2vec algorithm from Le and Mikolov (2014)

### Set-up

In [11]:
import glob
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import string  

### preprocessing

In [12]:
# Stemmer instance shared by the preprocessing steps further down.
ps = PorterStemmer()
# NLTK's English stop-word list, held as a set so it can be extended in place
# and queried for membership.
english_stopword_list = stopwords.words('english')
STOPWORDS = set(english_stopword_list)
### Punctuation characters to strip from the transcripts.
# The value is spliced verbatim into a regex character class
# ('[' + mypunct + ']'), so the backslashes in front of "(" and "]" are regex
# escapes rather than characters to remove. A raw string is used so that
# Python does not treat "\(" as an (invalid) string escape sequence; the
# runtime value is unchanged.
mypunct = r"""!"#$%&\()*+,-./:;<=>?@[\]^_`{|}~“”’…'"""

### stopwords

In [13]:
### For every stop word that contains an apostrophe, also register the
### apostrophe-free spelling (e.g. "you'd" -> "youd"), so both forms are in the set.
stop_list = [word.replace("'", "") for word in STOPWORDS if "'" in word]
### Extend the existing set in place with the extra spellings.
STOPWORDS.update(stop_list)

### Import 62 text files (60 ads and 2 files of aggregated public-speech data)

In [14]:
def extract_file_code(path, code_len=8):
    """Return the leading numeric code of the file named by ``path``.

    The code is the first ``code_len`` characters of the final path
    component. Both backslash and forward slash are treated as separators,
    so only the file name itself is inspected and a directory whose name
    happens to start with a digit cannot be picked up by mistake.

    Raises:
        ValueError: if the file name does not start with a digit.
    """
    base_name = re.split(r'[\\/]', path)[-1]
    if not base_name[:1].isdigit():
        raise ValueError('file name does not start with a numeric code: %r' % (path,))
    return base_name[:code_len]


def preprocess_text(raw_text, punct_chars, stemmer):
    """Clean one transcript and return its stemmed tokens joined by spaces.

    Steps: turn dashes into spaces, lower-case, drop digit runs, drop every
    character matched by the regex class ``'[' + punct_chars + ']'``, split
    on any whitespace, discard tokens shorter than two characters or made
    only of digits, and stem what is left.

    Args:
        raw_text: full text of one file.
        punct_chars: body of a regex character class listing the characters
            to delete (already escaped where the regex syntax requires it).
        stemmer: object with a ``stem(word)`` method.

    Returns:
        A single string of stemmed tokens separated by one space.
    """
    # Dashes join words, so they become spaces instead of being deleted.
    text = raw_text.replace("—", " ").replace("-", " ")
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub('[' + punct_chars + ']', '', text)
    # split() with no argument treats newlines, tabs and repeated spaces
    # alike, so no separate whitespace clean-up is needed.
    tokens = [tok for tok in text.split() if len(tok) >= 2 and not tok.isdigit()]
    return ' '.join(stemmer.stem(tok) for tok in tokens)


### locations where text files are saved
# NOTE(review): absolute path on one specific machine; adjust before running elsewhere.
folder = 'ALL_ADS_AND_SPEECHES_FINAL'
file_loc = 'C:\\Users\\donggwan.kim\\Desktop\\Video_Transcribing_Final_corrected\\' + folder + '\\*.txt'
# glob gives no ordering guarantee; sorting keeps the document order (and so
# the row order of anything derived from it) the same on every run.
file_paths = sorted(glob.glob(file_loc))
print(len(file_paths))

### Build two parallel lists: the file code and the cleaned text of each file.
# NOTE(review): STOPWORDS is built in an earlier cell but is not applied here,
# so stop words stay in the text - confirm this is intended.
name_list = []
string_list = []
for txt_path in file_paths:
    name_list.append(extract_file_code(txt_path))
    with open(txt_path, encoding="utf8") as f:
        raw_text = f.read()
    string_list.append(preprocess_text(raw_text, mypunct, ps))

62


### tagging

In [16]:
# Pair each cleaned text with its file code: the text is tokenized into the
# document's words and the code becomes the document's single tag.
tagged_data = []
for doc_tag, doc_text in zip(name_list, string_list):
    doc_tokens = word_tokenize(doc_text.lower())
    tagged_data.append(TaggedDocument(words=doc_tokens, tags=[str(doc_tag)]))

### Training the model

In [18]:
# Dimensionality of each document vector; reused later for the table columns.
vec_size = 200
# NOTE(review): `alpha` is assigned but never passed to Doc2Vec below, so the
# library's default learning rate is what is actually used. Pass alpha=alpha
# if 0.0025 was the intended initial learning rate.
alpha = 0.0025
# Fixed seed so that repeated runs produce the same vectors.
RANDOM_SEED = 42
model = Doc2Vec(vector_size=vec_size,
                min_count=3,
                window=5,
                negative=5,
                sample=1e-2,
                epochs=300,
                seed=RANDOM_SEED,
                # Per the gensim documentation a seed is only honoured
                # deterministically with a single worker thread (and, across
                # interpreter launches, a fixed PYTHONHASHSEED).
                workers=1)
# Build the vocabulary from the tagged documents, then train for the number
# of epochs configured on the model.
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
# Persist the trained model in the current working directory.
model.save("d2v.model")
print("Model Saved")

Model Saved


### Create a table of document vectors

In [22]:
# One column per vector dimension, named "0" .. str(vec_size - 1).
col_names = [str(num) for num in range(vec_size)]
# gensim 4 exposes the document vectors as `model.dv`; earlier releases use
# `model.docvecs`. Use whichever attribute this installation provides.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
# Collect the vector of every document (looked up by its tag) into one
# (n_documents, vec_size) array. Building the frame from the array keeps the
# vectors' numeric dtype, whereas filling an empty frame row by row stores
# the values as generic objects. The reshape also covers an empty name_list.
vector_matrix = np.asarray([doc_vectors[name] for name in name_list])
vector_matrix = vector_matrix.reshape(len(name_list), vec_size)
df_vector = pd.DataFrame(vector_matrix,
                         columns=col_names,
                         index=np.arange(len(name_list)))
# File codes as integers, in the same order as the rows.
name_list_2 = [int(name) for name in name_list]
df_vector['FILE_NUM_RA_CODING'] = name_list_2
df_vector.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,FILE_NUM_RA_CODING
0,2.226,1.23265,3.26522,2.01937,-4.30425,1.06146,0.189681,0.856751,1.16228,0.311134,...,-1.93005,-1.49484,-0.861104,0.925943,1.92489,-0.827523,0.406514,0.679727,1.85692,16667355
1,0.981825,-0.982362,0.907243,1.0505,0.829617,-0.419295,0.595089,-2.21098,2.00307,1.31757,...,0.311418,0.480255,1.0363,0.551928,-1.67862,-0.310181,-1.6797,0.740321,-1.41946,16667854
2,0.144669,0.931877,1.98057,0.0250422,-2.21809,-0.269239,-0.767738,1.0454,2.21797,0.423852,...,-0.13085,-0.188902,-1.21657,1.89988,-1.2813,0.168974,1.56463,1.59248,-2.36729,16668348
3,2.31907,0.800541,0.0715181,3.1806,0.723967,-0.289595,-1.2937,-0.825697,1.81298,-0.151762,...,-1.64074,1.39492,-0.11056,2.09266,0.85668,0.557391,-0.226844,0.888729,-2.0393,16695900
4,-0.80005,0.0364885,0.411981,1.77444,-2.06352,-0.798525,2.1172,-2.13095,3.06475,2.18534,...,-1.06533,0.581341,2.15495,-0.884233,1.38538,-1.15336,-1.04229,0.461974,0.454172,16707654


### save the table

In [25]:
# Destination of the exported table (absolute path on one specific machine).
csv_name = r'C:\Users\donggwan.kim\Box\Documents\Dropbox_RT_New\doc2vec_w_200.csv'
# Write the vector table to that location as CSV.
df_vector.to_csv(path_or_buf=csv_name)

C:\Users\donggwan.kim\Box\Documents\Dropbox_RT_New\doc2vec_w_200.csv
