In [1]:
import gensim
import pandas as pd
import numpy as np
import sys
import os

import sklearn
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics

from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding("UTF-8")

In [2]:
def clean_text(raw_text):
    review_text = BeautifulSoup(raw_text).get_text()
    words = review_text.lower().split()
    return(" ".join(words))

In [3]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    #
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [4]:
def getAvgFeatureVecs(all_texts, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0.
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(all_texts),num_features),dtype="float32")
    #
    # Loop through the reviews
    for one_line in all_texts:
       reviewFeatureVecs[counter] = makeFeatureVec(one_line, model, num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [5]:
# Getting a Glove Model
model_file = "new.glove.twitter.27B.25d.txt"
num_features = 25
model = gensim.models.Word2Vec.load_word2vec_format(model_file,binary=False)

In [6]:
languages = ["english"]
datafiles = ["summary-english-truth.txt"]
tasks = ["age"]

for language, datafile in zip(languages, datafiles):
    train = pd.read_csv(datafile, header=0, delimiter="\t", quoting=1)
    num_text = train["text"].size
    clean_train_data = []
    
    for i in xrange( 0, num_text):
        clean_train_data.append( clean_text( train["text"][i] ) )
        
trainDataVecs = getAvgFeatureVecs( clean_train_data, model, num_features )

In [7]:
trainDataVecs.shape

(152, 25)

In [8]:
targetVec = train["age"]

In [9]:
degrees = [1,2,3]
C = [10**-3, 1, 10**3]

In [10]:
list_of_scores = []
for degree in degrees:
    for one_C in C:
        clf = svm.SVC(kernel='poly', degree=degree, coef0=one_C, gamma=1)
        scoring_function = 'accuracy'
        scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=10, scoring=scoring_function)
        list_of_scores.append(scores)

In [11]:
list_of_scores

[array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4       ,  0.42857143,  0.42857143]),
 array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4       ,  0.42857143,  0.42857143]),
 array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4       ,  0.42857143,  0.42857143]),
 array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4       ,  0.42857143,  0.42857143]),
 array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4       ,  0.42857143,  0.42857143]),
 array([ 0.52941176,  0.64705882,  0.73333333,  0.73333333,  0.46666667,
         0.6       ,  0.66666667,  0.66666667,  0.42857143,  0.71428571]),
 array([ 0.35294118,  0.35294118,  0.4       ,  0.4       ,  0.4       ,
         0.4       ,  0.4       ,  0.4 

In [12]:
avg = [x.mean() for x in list_of_scores]
avg

[0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.61859943977591036,
 0.39630252100840335,
 0.39630252100840335,
 0.60605042016806732]

In [26]:
gammas = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]
C = [1e-5, 1e-4, 1e-3, 1, 1e3, 1e4, 1e5]

In [27]:
list_of_scores = []
for g in gammas:
    for one_C in C:
        clf = svm.SVC(kernel='rbf', gamma=g, C=one_C)
        scoring_function = 'accuracy'
        scores = cross_validation.cross_val_score(clf, trainDataVecs, targetVec, cv=10, scoring=scoring_function)
        list_of_scores.append(scores)

In [28]:
avg = [x.mean() for x in list_of_scores]
avg

[0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.41011204481792712,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.41011204481792712,
 0.57921568627450981,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.62574229691876748,
 0.6508403361344538,
 0.61686274509803918,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.60016806722689087,
 0.66008403361344536,
 0.66008403361344536,
 0.66008403361344536,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.64826330532212884,
 0.72067226890756297,
 0.72067226890756297,
 0.72067226890756297,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252100840335,
 0.39630252