In [69]:
import gensim
import pandas as pd
import numpy as np
import sys
import os
import itertools

import sklearn
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics

from scipy import stats
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding("UTF-8")

In [70]:
def clean_text(raw_text):
    review_text = BeautifulSoup(raw_text).get_text()
    words = review_text.lower().split()
    return(" ".join(words))

In [71]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    #
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [72]:
def getAvgFeatureVecs(all_texts, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0.
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(all_texts),num_features),dtype="float32")
    #
    # Loop through the reviews
    for one_line in all_texts:
       reviewFeatureVecs[counter] = makeFeatureVec(one_line, model, num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [73]:
# Getting a Glove Model
model_files = ["new.glove.twitter.27B.25d.txt", "new.glove.twitter.27B.50d.txt", 
               "new.glove.twitter.27B.100d.txt", "new.glove.twitter.27B.200d.txt"]
list_of_num_features = [25, 50, 100, 200]

languages = ["english"]
datafiles = ["summary-english-truth.txt"]
tasks = ["age", "gender"]
scoring_function = 'accuracy'
all_results = {}

In [74]:
for model_file, num_features in zip(model_files, list_of_num_features):
    model = gensim.models.Word2Vec.load_word2vec_format(model_file,binary=False)

    for language, datafile in zip(languages, datafiles):
        train = pd.read_csv(datafile, header=0, delimiter="\t", quoting=1)
        num_text = train["text"].size
        clean_train_data = []
    
        for i in xrange( 0, num_text):
            clean_train_data.append( clean_text( train["text"][i] ) )
        
        trainDataVecs = getAvgFeatureVecs( clean_train_data, model, num_features )
        
        for task in tasks:
            targetVec = train[task]
            
            #polynomial
            degrees = [1,2,3]
            C = [10**-3, 1, 10**3]
            
            poly_list_of_scores = []
            for degree in degrees:
                for one_C in C:
                    clf = svm.SVC(kernel='poly', degree=degree, coef0=one_C, gamma=1)
                    scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=10, scoring=scoring_function)
                    poly_list_of_scores.append(scores)
                    
                    dict_key = "dims={} task={} kernel={} degree={} C={}".format(num_features, task, "poly", degree, one_C)
                    all_results[dict_key] = scores
            #rbf
            gammas = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]
            C = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]
            
            rbf_list_of_scores = []
            for g in gammas:
                for one_C in C:
                    clf = svm.SVC(kernel='rbf', gamma=g, C=one_C)
                    scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=10, scoring=scoring_function)
                    rbf_list_of_scores.append(scores)

                    dict_key = "dims={} task={} kernel={} gamma={} C={}".format(num_features, task, "rbf",g, one_C)
                    all_results[dict_key] = scores

In [75]:
avg = []
for one_key in all_results.keys():
    x = all_results[one_key]
    one_avg = x.mean()
    avg.append(one_avg)
a = max(avg)
a

0.76964285714285707

In [76]:
def getAccuracies(all_results):
    accuracies_dictionary = {}
    for each_key in all_results.keys():
        list_of_accuracies = all_results[each_key]
        accuracies = list_of_accuracies.mean()
        accuracies_dictionary[each_key] = accuracies
    return accuracies_dictionary

In [77]:
accuracies_dictionary = getAccuracies(all_results)
all_keys = accuracies_dictionary.keys()
all_keys.sort()

In [92]:
def makePValMatrix(all_results, sorted_keys):
    list_length = len(sorted_keys)
    p_value_matrix = np.zeros((list_length, list_length))
    i = range(0, list_length)
    #sig values
    
    for key_1, x in zip(sorted_keys, i):
        for key_2, y in zip(sorted_keys, i):
            treatment_1 = all_results[key_1]
            treatment_2 = all_results[key_2]
            z_stat, p_val = stats.ranksums(treatment_1, treatment_2)
            p_value_matrix[x,y] = p_val
    return p_value_matrix

In [87]:
p_value_matrix = makePValMatrix(all_results, all_keys)

array([[  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          2.12182871e-04,   2.12182871e-04,   2.12182871e-04],
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          2.12182871e-04,   2.12182871e-04,   2.12182871e-04],
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          2.12182871e-04,   2.12182871e-04,   2.12182871e-04],
       ..., 
       [  2.12182871e-04,   2.12182871e-04,   2.12182871e-04, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  2.12182871e-04,   2.12182871e-04,   2.12182871e-04, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  2.12182871e-04,   2.12182871e-04,   2.12182871e-04, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00]])

(464, 464)

In [89]:
x = [all_results[a] for a in all_keys]
list_of_accuracies = [a.mean() for a in x]

In [90]:
def turnPValMatrixToExcel(fileName, p_value_matrix, list_of_accuracies):
    df = pd.DataFrame(data = p_value_matrix, columns=list_of_accuracies)
    df.index = list_of_accuracies
    null_disproved = df[df < 0.05]
    null_disproved.to_csv(fileName, sep='\t', encoding='utf-8')

In [83]:
turnPValMatrixToExcel("test_matrix.csv", p_value_matrix, list_of_accuracies)
# not such a good idea. very big matrix. compares all in computer but difficult for person to visualize

In [106]:
def writeAvgAccuracies(fileName, accuracy_matrix, degrees, C):
    df = pd.DataFrame(data = accuracy_matrix, columns=degrees)
    df.index = C
    df.to_csv(fileName, sep='\t', encoding='utf-8')

In [124]:
for num_features in list_of_num_features:
    for task in tasks:
        
        kernel = "poly"
        degrees = [1,2,3]
        C = [10**-3, 1, 10**3]
        sorted_keys = ["dims={} task={} kernel={} degree={} C={}".format(num_features, task, "poly", degree, one_C) for degree, one_C in list(itertools.product(degrees, C))]
        sorted_keys.sort()
        
        p_value_matrix = makePValMatrix(all_results, sorted_keys)
        fileName = "avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
        list_of_accuracies = [all_results[a].mean() for a in sorted_keys]
        turnPValMatrixToExcel(fileName, p_value_matrix, sorted_keys)
        
        rows = len(C)
        cols = len(degrees)
        accuracy_matrix = np.zeros((rows, cols))
        
        i = range(0, rows)
        j = range(0, cols)
        
        for degree, x in zip(degrees, i):
            for one_C, y in zip(C, j):
                key = "dims={} task={} kernel={} degree={} C={}".format(num_features, task, "poly", degree, one_C)
                accuracy_matrix[x, y] = all_results[key].mean()
        
        fileName = "accuracy-avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
        writeAvgAccuracies(fileName, accuracy_matrix, degrees, C)
        
        kernel = "rbf"
        gammas = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]
        C = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]
        
        sorted_keys = ["dims={} task={} kernel={} gamma={} C={}".format(num_features, task, "rbf", g, one_C) for g, one_C in list(itertools.product(gammas, C))]
        sorted_keys.sort()
        
        p_value_matrix = makePValMatrix(all_results, sorted_keys)
        fileName = "avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
        list_of_accuracies = [all_results[a].mean() for a in sorted_keys]
        turnPValMatrixToExcel(fileName, p_value_matrix, sorted_keys)
        
        rows = len(C)
        cols = len(gammas)
        accuracy_matrix = np.zeros((rows, cols))
        
        i = range(0, rows)
        j = range(0, cols)
        
        for g, x in zip(gammas, i):
            for one_C, y in zip(C, j):
                key = "dims={} task={} kernel={} gamma={} C={}".format(num_features, task, "rbf", g, one_C)
                accuracy_matrix[x, y] = all_results[key].mean()
        
        fileName = "accuracy-avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
        writeAvgAccuracies(fileName, accuracy_matrix, gammas, C)


In [116]:
num_features=25
task="age"
kernel = "poly"
degrees = [1,2,3]
C = [10**-3, 1, 10**3]
sorted_keys = ["dims={} task={} kernel={} degree={} C={}".format(num_features, task, "poly", degree, one_C) for degree, one_C in list(itertools.product(degrees, C))]
sorted_keys.sort()
        
p_value_matrix = makePValMatrix(all_results, sorted_keys)
fileName = "avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
list_of_accuracies = [all_results[a].mean() for a in sorted_keys]
turnPValMatrixToExcel(fileName, p_value_matrix, sorted_keys)
        
rows = len(C)
cols = len(degrees)
accuracy_matrix = np.zeros((rows, cols))
        
i = range(0, rows)
j = range(0, cols)
        
for degree, x in zip(degrees, i):
    for one_C, y in zip(C, j):
        key = "dims={} task={} kernel={} degree={} C={}".format(num_features, task, "poly", degree, one_C)
        accuracy_matrix[x, y] = all_results[key].mean()
        
fileName = "accuracy-avg-null-disproved-pval-dims{}-task-{}-kernel-{}.csv".format(num_features, task, kernel)
writeAvgAccuracies(fileName, accuracy_matrix, degrees, C)

TypeError: zip argument #2 must support iteration

In [120]:
[type(degrees), type(C), type(x), type(y)]

[list, list, int, int]

In [121]:
x=range(0, 3)
type(x)

list

In [118]:
for degree,x in zip(degrees, x):
    for one_C,y in zip(C, y):
        (degree, x, one_C, y)

TypeError: zip argument #2 must support iteration