In [53]:
import gensim
import pandas as pd
import numpy as np
import sys
import os
import itertools
import sklearn

from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics
from scipy import stats
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("UTF-8")

In [54]:
def clean_text(raw_text):
    review_text = BeautifulSoup(raw_text).get_text()
    words = review_text.lower().split()
    return(" ".join(words))

In [55]:
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.index2word)
    wordsNotInDict = []
    
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
        else:
            wordsNotInDict.append(word)
    featureVec = np.divide(featureVec,nwords)
    return featureVec, wordsNotInDict

In [56]:
def getAvgFeatureVecs(all_texts, model, num_features):
    counter = 0.
    reviewFeatureVecs = np.zeros((len(all_texts),num_features),dtype="float32")
    lineOfWordsNotInDict = []
    for one_line in all_texts:
       reviewFeatureVecs[counter], wordsNotInDict = makeFeatureVec(one_line, model, num_features)
       lineOfWordsNotInDict.append(wordsNotInDict)
       counter = counter + 1.
    return reviewFeatureVecs, lineOfWordsNotInDict

In [57]:
# Getting the model files: twitter-glove and wikipedia+gigaword-glove
model_files = [ "new.glove.twitter.27B.100d.txt", "new.glove.6B.100d.txt", 
                "new.glove.twitter.27B.200d.txt", "new.glove.6B.200d.txt"]
list_of_num_features = [100, 100, 200, 200]
sources = ["twitter", "wiki-giga", "twitter", "wiki-giga"]

languages = ["english"]
datafiles = ["summary-english-truth.txt"]
tasks = ["age", "gender"]
scoring_function = 'accuracy'
all_results = {}

# poly kernel params to check
poly_degrees = [1,2,3]
poly_C = [10**-1, 10, 10**3]

# rbf kernel params to check

rbf_gammas = [1, 0.001]
rbf_C = [10, 1000]

In [58]:
datafile = "summary-english-truth.txt"
languages = "english"

In [59]:
train = pd.read_csv(datafile, header=0, delimiter="\t", quoting=1)
num_text = train["text"].size
clean_train_data = []

In [60]:
for i in xrange( 0, num_text):
    clean_train_data.append( clean_text( train["text"][i] ) )

In [74]:
def doSVMwithPoly(trainDataVecs, targetVec, source, num_features, task, num_folds=10, degrees=[1,2,3], C=[10**-1, 10, 10**3] ):
    poly_results = {}
    for degree in degrees:
        for one_C in C:
            clf = svm.SVC(kernel='poly', degree=degree, coef0=one_C, gamma=1)
            scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=num_folds, scoring=scoring_function)
                    
            dict_key = "word2vec-source={} dims={} task={} kernel={} degree={} C={}".format(source, num_features, task, "poly", degree, one_C)
            poly_results[dict_key] = scores
    return poly_results

In [75]:
def doSVMwithRBF(trainDataVecs, targetVec, source, num_features, task, num_folds=10, gammas=[1, 0.001], C = [10, 1000]):
    rbf_results = {}
    for g in gammas:
        for one_C in C:
            clf = svm.SVC(kernel='rbf', gamma=g, C=one_C)
            scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=10, scoring=scoring_function)
            
            dict_key = "word2vec-source={} dims={} task={} kernel={} gamma={} C={}".format(source, num_features, task, "rbf",g, one_C)
            rbf_results[dict_key] = scores
    return rbf_results

In [76]:
def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [80]:
all_results = {}
for model_file, num_features, source in zip(model_files, list_of_num_features, sources):
    model = gensim.models.Word2Vec.load_word2vec_format(model_file,binary=False)
    trainDataVecs, trashedWords = getAvgFeatureVecs( clean_train_data, model, num_features )
    
    for task in tasks:
        train_y = train[task]
        poly_results = doSVMwithPoly(trainDataVecs, train_y, source, num_features, task, num_folds=10, degrees=poly_degrees, C=poly_C)
        rbf_results = doSVMwithRBF(trainDataVecs, train_y, source, num_features, task, num_folds=10, gammas=rbf_gammas, C=rbf_C)
        results_one_task = merge_two_dicts(poly_results, rbf_results)
        all_results = merge_two_dicts(results_one_task, all_results)

In [84]:
all_results['word2vec-source=wiki-giga dims=100 task=gender kernel=poly degree=3 C=1000']

array([ 0.625     ,  0.75      ,  0.625     ,  0.6875    ,  0.625     ,
        0.625     ,  0.64285714,  0.92857143,  0.42857143,  0.78571429])

In [85]:
import pickle
pickle.dump( all_results, open( "word2vec-average-d100-d200.pkl", "wb" ) )

In [86]:
# TO OUTPUT FILES
# Part 1: One big excel file to do stat test between experiments

In [100]:
def getSortedKeys(all_results):
    sorted_keys = all_results.keys()
    sorted_keys.sort()
    return sorted_keys

In [101]:
def makePValMatrix(all_results):
    sorted_keys = getSortedKeys(all_results)    
    list_length = len(sorted_keys)
    p_value_matrix = np.zeros((list_length, list_length))
    i = range(0, list_length)
    #sig values
    
    for key_1, x in zip(sorted_keys, i):
        for key_2, y in zip(sorted_keys, i):
            treatment_1 = all_results[key_1]
            treatment_2 = all_results[key_2]
            z_stat, p_val = stats.ranksums(treatment_1, treatment_2)
            p_value_matrix[x,y] = p_val
    return p_value_matrix

In [102]:
def turnPValMatrixToExcel(fileName, all_results):
    p_value_matrix = makePValMatrix(all_results)
    sorted_keys = getSortedKeys(all_results)
    df = pd.DataFrame(data = p_value_matrix, columns=sorted_keys)
    df.index = sorted_keys
    null_disproved = df[df < 0.05]
    null_disproved.to_csv(fileName, sep=',', encoding='utf-8')

In [103]:
turnPValMatrixToExcel("all-null-disproved-p-values.csv", all_results)

In [None]:
# Part 2: Make an excel file for each source, dimension, kernel

In [124]:
def makeAvgAccuraciesMatrixPoly(all_results, num_features, source, task, kernel, top, left):
    if kernel == "poly":
        dict_string="word2vec-source={} dims={} task={} kernel={} degree={} C={}"
    else:
        dict_string="word2vec-source={} dims={} task={} kernel={} gamma={} C={}"
    
    rows = len(left)
    cols = len(top)
    accuracy_matrix = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(cols):
            dict_name=dict_string.format(source, num_features, task, kernel, top[i],left[j])
            accuracy_matrix[i,j] = all_results[dict_name].mean()
            
    filename="word2vec-source={} dims={} task={} kernel={}".format(source, num_features,task, kernel)
    return accuracy_matrix, filename

In [128]:
def writeAvgAccuracies(fileName, accuracy_matrix, top, left):
    df = pd.DataFrame(data = accuracy_matrix, columns=top)
    df.index = left
    df.to_csv(fileName, sep=',', encoding='utf-8')

In [126]:
#x,y = makeAvgAccuraciesMatrixPoly(all_results, 200, "twitter", "age", "poly", poly_degrees, poly_C)

In [132]:
for num_features, source in zip(list_of_num_features, sources):
    for task in tasks:
        # do svm-poly
        accuracy_matrix, dict_name = makeAvgAccuraciesMatrixPoly(all_results, 
                                                                 num_features,
                                                                 source,
                                                                 task,
                                                                 "poly",
                                                                 poly_degrees,
                                                                 poly_C)        
        
        fileName=dict_name+".csv"
        writeAvgAccuracies(fileName, accuracy_matrix, poly_degrees, poly_C)
        # do svm-rbf
        accuracy_matrix, dict_name = makeAvgAccuraciesMatrixPoly(all_results, 
                                                                 num_features,
                                                                 source,
                                                                 task,
                                                                 "rbf",
                                                                 rbf_gammas,
                                                                 rbf_C)
        fileName=dict_name+".csv"
        writeAvgAccuracies(fileName, accuracy_matrix, rbf_gammas, rbf_C)