In [1]:
import pandas as pd
import numpy as np
import gensim

# get embeddings

In [2]:
#FastText
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(tokens[1:])
    return data

vec_wiki = load_vectors('wiki-news-300d-1M.vec')
vec_crawl = load_vectors('crawl-300d-2M.vec')

In [3]:
#Glove
vec_glove = {}
with open('glove.6B.300d.txt', 'r') as file:
    for line in file:
        elements = line.split()
        word = elements[0]
        embeddings = np.array(elements[1:])
        
        vec_glove[word] = embeddings

In [4]:
#Google

vec_google = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

#google.word_vec("dog")

# dataset

In [5]:
data = pd.read_excel('mit_final.xlsx', index_col=0)
data['all'] = data['intro'] + " " + data['leadership'] + " " + data['challenge'] + " " + data['weakness'] + " " + data['whyhire']

In [6]:
ratings = pd.read_csv('turker_scores_full_interview.csv', index_col=0)
ratings['Participant'] = ratings.index
ratings = ratings[ratings['Worker'] == 'AGGR']

In [7]:
data=pd.concat([data, ratings], axis=1)

# get x and y

In [9]:
def get_x(data, column_question):
    
    from nltk.corpus import stopwords
    import string
    
    sws = ['pretty', 'uhm', 'uhmm', 'hmmm', 'uhmmm', 'um', 'umm', 'ummm', 'ummmm', 'mmmmmm', 'uh', 'uhh', 'uhhh', 'ah', 
          'ahh', 'ahhh', 'ok', 'interviewee', 'okay', 'yeah', 'inaudible', 'hmm',
          'laughs', 'alright', 'well', 'heh', 'oh', 'ohh', 'ohhh', 'hm', 'hmm', 'hmmmm', 'yea', 'yes', 'yeah']
#     sws = sw + stopwords.words('english')

    import re
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    regex2 = re.compile('interviewee')
    
    x = data[column_question].values    
    x_clean = []
    
    for line in x:
        line = str(line).lower()
        line = line.encode("ascii", "ignore")
        line = line.decode()

        line = regex.sub('', line)
        line = regex2.sub('', line)
        
        line_temp = []
        for token in line.rstrip().split():
            if token not in sws:
                line_temp.append(token)
        x_clean.append(" ".join(line_temp))
        

        
    return x_clean


In [10]:
y_overall = data['Overall'].values
y_hire = data['RecommendHiring'].values
y_colleague = data['Colleague'].values
y_eng = data['Engaged'].values
y_excit = data['Excited'].values
y_eye = data['EyeContact'].values
y_smiled = data['Smiled'].values
y_rate = data['SpeakingRate'].values
y_nofiller= data['NoFillers'].values
y_friendly = data['Friendly'].values
y_paused = data['Paused'].values
y_engtone = data['EngagingTone'].values
y_str = data['StructuredAnswers'].values
y_calm = data['Calm'].values
y_notstress = data['NotStressed'].values
y_focused = data['Focused'].values
y_auth = data['Authentic'].values
y_notawk = data['NotAwkward'].values
y_total = data['Total'].values

In [11]:
x_i = get_x(data, 'intro')
x_l = get_x(data, 'leadership')
x_c = get_x(data, 'challenge')
x_w = get_x(data, 'weakness')
x_h = get_x(data, 'whyhire')
x_all = get_x(data, 'all')

# vectorize

In [12]:
def vectorize_avg(embedding, x):
    
    vecs = []
    for row in x:
        temp = []
        for token in row.strip().split():
            if token in embedding.keys():
                temp.append(embedding[token])
        vec = np.array(temp).astype(np.float)
        avg = np.mean(vec, axis=0, dtype='float32')
        vecs.append(avg)
    
    return np.array(vecs)

def vectorize_sum(embedding, x):
    
    vecs = []
    for row in x:
        temp = []
        for token in row.strip().split():
            if token in embedding.keys():
                temp.append(embedding[token])
        vec = np.array(temp).astype(np.float)
        avg = np.sum(vec, axis=0, dtype='float32')
        vecs.append(avg)
    
    return np.array(vecs)


def vectorize_avg_google(embedding, x):
    
    vecs = []
    for row in x:
        temp = []
        for token in row.strip().split():
            if token in embedding:
                temp.append(embedding.word_vec(token))
        vec = np.array(temp).astype(np.float)
        avg = np.mean(vec, axis=0, dtype='float32')
        vecs.append(avg)
    
    return np.array(vecs)

def vectorize_sum_google(embedding, x):
    
    vecs = []
    for row in x:
        temp = []
        for token in row.strip().split():
            if token in embedding:
                temp.append(embedding.word_vec(token))
        vec = np.array(temp).astype(np.float)
        avg = np.sum(vec, axis=0, dtype='float32')
        vecs.append(avg)
    
    return np.array(vecs)

## vectors - summed and averaged

In [13]:
# x = x_all
# vecs_sum_glove = vectorize_sum(vec_glove, x)
# vecs_sum_wiki = vectorize_sum(vec_wiki, x)
# vecs_sum_crawl = vectorize_sum(vec_crawl, x)
# vecs_sum_google = vectorize_sum_google(vec_google, x)

In [14]:
x = x_all
vecs_avg_glove = vectorize_avg(vec_glove, x)
vecs_avg_wiki = vectorize_avg(vec_wiki, x)
vecs_avg_crawl = vectorize_avg(vec_crawl, x)
vecs_avg_google = vectorize_avg_google(vec_google, x)

In [15]:
# vectors = [vecs_sum_glove, vecs_sum_wiki, vecs_sum_crawl, vecs_sum_google]
vectors = [vecs_avg_glove, vecs_avg_wiki, vecs_avg_crawl, vecs_avg_google]

In [16]:
ys = [y_overall, y_hire, y_excit, y_eye, y_smiled, y_rate, y_nofiller, y_friendly,
      y_paused, y_engtone, y_str, y_calm, y_notstress, y_focused, y_auth, y_notawk, y_total]

# Build Models

In [17]:
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [18]:
# for hyperparameters tuning

# params = {'alpha':[0.0001, 0.001, 0.01, 1, 10], 'tol':[0.1, 1, 10]}
# params2 = {'gamma': ('scale', 'auto'), 'epsilon':[0.0001, 0.001, 0.01, 1, 10], 'tol':[0.01, 0.1, 1, 10], 'C':[1, 10, 100, 1000]}

# model4 = Lasso()
# model5 = SVR()

# g = GridSearchCV(model4, params, return_train_score=True, cv=5)
# g2 = GridSearchCV(model5, params2, return_train_score=True, cv=5)

In [19]:
#models
lasso = Lasso(alpha=0.001, tol=10)
svr = SVR(gamma='scale')
ridge = Ridge(alpha=10, tol=0.001, normalize=True)

# Run Models

## lasso, svr, ridge (k-fold)

In [20]:
#lasso

alles = {}
k = 5
unit = len(x) // k
for i in range(k):
    results = {}
    for k, x in enumerate(vectors):
        for j, y in enumerate(ys):
            y = np.array(y)
            x_test = x[i*unit:(i+1)*unit]
            y_test = y[i*unit:(i+1)*unit]
            x_train = np.concatenate((x[:i*unit], x[(i+1)*unit:]), axis=0)
            y_train = np.concatenate((y[:i*unit], y[(i+1)*unit:]), axis=0)
            lasso.fit(x_train, y_train)
            preds = lasso.predict(x_test)
            result = np.corrcoef(y_test, preds)
            results[str(k+1)+"_"+str(j+1)] = result
    alles[i] = results

final1 = {}
for k, v in alles.items():
    for k2, v2 in v.items():
        avg = np.mean(v2)
        final1[k2] = avg

print(final1)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


{'1_1': 0.6091864966731233, '1_2': 0.5485187806665622, '1_3': 0.7204331726318829, '1_4': 0.6193490920096315, '1_5': 0.7658300705208396, '1_6': 0.541180544086415, '1_7': 0.6623614772798931, '1_8': 0.7079165026599213, '1_9': 0.7339118345971227, '1_10': 0.7397803639794881, '1_11': 0.6827666064581994, '1_12': 0.7671805577630161, '1_13': 0.598438427896443, '1_14': 0.5296393896006709, '1_15': 0.5495598619123399, '1_16': 0.6438146714614863, '1_17': 0.617901699155902, '2_1': 0.5500167463812615, '2_2': 0.5468472915951086, '2_3': 0.6760105515429495, '2_4': 0.5841426121597162, '2_5': 0.6582998559429718, '2_6': nan, '2_7': 0.5787479929480545, '2_8': 0.7075180964103138, '2_9': 0.6091094580441366, '2_10': 0.6537886430020197, '2_11': 0.6633933819361986, '2_12': 0.6821706323291968, '2_13': 0.5919257972615728, '2_14': 0.6791107812766362, '2_15': nan, '2_16': 0.5457906795749339, '2_17': 0.5480744545173952, '3_1': 0.703456156814745, '3_2': 0.6867361269337753, '3_3': 0.6844819480043709, '3_4': 0.582468595

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


In [21]:
#svr

alles = {}
k = 5
unit = len(x) // k
for i in range(k):
    results = {}
    for k, x in enumerate(vectors):
        for j, y in enumerate(ys):
            y = np.array(y)
            x_test = x[i*unit:(i+1)*unit]
            y_test = y[i*unit:(i+1)*unit]
            x_train = np.concatenate((x[:i*unit], x[(i+1)*unit:]), axis=0)
            y_train = np.concatenate((y[:i*unit], y[(i+1)*unit:]), axis=0)
            svr.fit(x_train, y_train)
            preds = svr.predict(x_test)
            result = np.corrcoef(y_test, preds)
            results[str(k+1)+"_"+str(j+1)] = result
    alles[i] = results

final2= {}
for k, v in alles.items():
    for k2, v2 in v.items():
        avg = np.mean(v2)
        final2[k2] = avg

print(final2)

{'1_1': 0.5676865880123103, '1_2': 0.5786333312762887, '1_3': 0.6704498366342302, '1_4': 0.41660676966717536, '1_5': 0.5910592834513879, '1_6': 0.5123587662081883, '1_7': 0.5190146987516329, '1_8': 0.6026710171814031, '1_9': 0.5544636237239893, '1_10': 0.6503083677269328, '1_11': 0.6447549918667841, '1_12': 0.6216600955606363, '1_13': 0.5532854301442484, '1_14': 0.6307447905494028, '1_15': 0.5749814526432033, '1_16': 0.564806331837378, '1_17': 0.5252342834245997, '2_1': 0.5602808994302797, '2_2': 0.5793184930051287, '2_3': 0.68883713788827, '2_4': 0.43226753283952324, '2_5': 0.6291272246490831, '2_6': 0.4834161817402689, '2_7': 0.5332469546787648, '2_8': 0.6123107298620231, '2_9': 0.5766663393180788, '2_10': 0.6563369082843193, '2_11': 0.6327280066891665, '2_12': 0.660564461768413, '2_13': 0.5973131013056294, '2_14': 0.6359748775885603, '2_15': 0.6447911516371573, '2_16': 0.5579083939962695, '2_17': 0.5254586868434662, '3_1': 0.5978134903658403, '3_2': 0.6039705366323074, '3_3': 0.6955

In [22]:
#ridge

alles = {}
k = 5
unit = len(x) // k
for i in range(k):
    results = {}
    for k, x in enumerate(vectors):
        for j, y in enumerate(ys):
            y = np.array(y)
            x_test = x[i*unit:(i+1)*unit]
            y_test = y[i*unit:(i+1)*unit]
            x_train = np.concatenate((x[:i*unit], x[(i+1)*unit:]), axis=0)
            y_train = np.concatenate((y[:i*unit], y[(i+1)*unit:]), axis=0)
            ridge.fit(x_train, y_train)
            preds = ridge.predict(x_test)
            result = np.corrcoef(y_test, preds)
            results[str(k+1)+"_"+str(j+1)] = result
    alles[i] = results

final3 = {}
for k, v in alles.items():
    for k2, v2 in v.items():
        avg = np.mean(v2)
        final3[k2] = avg

print(final3)

{'1_1': 0.6591838613299403, '1_2': 0.6142053945190981, '1_3': 0.724268004889666, '1_4': 0.5620610437202472, '1_5': 0.728472947031644, '1_6': 0.5709518911389407, '1_7': 0.5834942897286327, '1_8': 0.6795578994304934, '1_9': 0.7147834777737073, '1_10': 0.7288225777719735, '1_11': 0.6746392305013041, '1_12': 0.7100198110111788, '1_13': 0.5931540020887531, '1_14': 0.531132911120908, '1_15': 0.6309980813407969, '1_16': 0.6314329221129144, '1_17': 0.614495575492715, '2_1': 0.6340230106340757, '2_2': 0.6087499110181064, '2_3': 0.7167031740888943, '2_4': 0.5941705324982481, '2_5': 0.6956796802708701, '2_6': 0.5849088349189391, '2_7': 0.581421414412054, '2_8': 0.6757099074753563, '2_9': 0.672817024607046, '2_10': 0.6908094438985702, '2_11': 0.6256424147691411, '2_12': 0.7319218812206777, '2_13': 0.634432472546396, '2_14': 0.5888780493156719, '2_15': 0.7024754530330855, '2_16': 0.6123261082964162, '2_17': 0.5515762327767753, '3_1': 0.661852354573099, '3_2': 0.6177063675377793, '3_3': 0.7343745193

In [23]:
df = pd.DataFrame([final1, final2, final3]).T
# df.to_csv('sg_all_avg.csv')

In [24]:
df

Unnamed: 0,0,1,2
1_1,0.609186,0.567687,0.659184
1_2,0.548519,0.578633,0.614205
1_3,0.720433,0.670450,0.724268
1_4,0.619349,0.416607,0.562061
1_5,0.765830,0.591059,0.728473
...,...,...,...
4_13,0.563865,0.580920,0.611922
4_14,0.597705,0.674073,0.644883
4_15,0.489022,0.595085,0.648324
4_16,0.619334,0.607189,0.674838
