# Pregunta 4

### Lectura

In [1]:
import pandas as pd
import numpy as np
import tarfile
from urllib import urlretrieve

url = 'http://octopus.inf.utfsm.cl/~ricky/movies.tar.gz'

filename, headers = urlretrieve(url, 'movies.tar.gz')

In [2]:
tar = tarfile.open(filename)

dev_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/dev.x.mm"
test_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/test.x.mm"
dev_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/dev.y.dat"
test_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/test.y.dat"
train_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/train.x.mm"
train_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/train.y.dat"
vocab = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/vocab"

dev_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/dev.x.mm"
test_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/test.x.mm"
dev_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/dev.y.dat"
test_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/test.y.dat"
train_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/train.x.mm"
train_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/train.y.dat"
vocab_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/vocab"

for tarinfo in tar:

    if tarinfo.name == dev_x:
        d_x = tar.extractfile(tarinfo.name)
    if tarinfo.name == dev_y:
        d_y = tar.extractfile(tarinfo.name)
    if tarinfo.name == test_x:
        t_x = tar.extractfile(tarinfo.name)
    if tarinfo.name == test_y:
        t_y = tar.extractfile(tarinfo.name)
    if tarinfo.name == train_x:
        tr_x = tar.extractfile(tarinfo.name)
    if tarinfo.name == train_y:
        tr_y = tar.extractfile(tarinfo.name)
    if tarinfo.name == vocab:
        v = tar.extractfile(tarinfo.name)
        
    if tarinfo.name == dev_x_stars:
        d_x_s = tar.extractfile(tarinfo.name)
    if tarinfo.name == dev_y_stars:
        d_y_s = tar.extractfile(tarinfo.name)
    if tarinfo.name == test_x_stars:
        t_x_s = tar.extractfile(tarinfo.name)
    if tarinfo.name == test_y_stars:
        t_y_s = tar.extractfile(tarinfo.name)
    if tarinfo.name == train_x_stars:
        tr_x_s = tar.extractfile(tarinfo.name)    
    if tarinfo.name == train_y_stars:
        tr_y_s = tar.extractfile(tarinfo.name)
    if tarinfo.name == vocab_stars:
        v_s = tar.extractfile(tarinfo.name)

import scipy.io as sio
from scipy.sparse import csr_matrix


v_array = v.read().split("\n")
v_s_array = v_s.read().split("\n")

d_y_array = np.loadtxt(d_y)
t_y_array = np.loadtxt(t_y)
tr_y_array = np.loadtxt(tr_y)
d_y_s_array = np.loadtxt(d_y_s)
t_y_s_array = np.loadtxt(t_y_s)
tr_y_s_array = np.loadtxt(tr_y_s)

d_x_matrix = csr_matrix(sio.mmread(d_x))
t_x_matrix = csr_matrix(sio.mmread(t_x))
tr_x_matrix = csr_matrix(sio.mmread(tr_x))

d_x_s_matrix = csr_matrix(sio.mmread(d_x_s))
t_x_s_matrix = csr_matrix(sio.mmread(t_x_s))
tr_x_s_matrix = csr_matrix(sio.mmread(tr_x_s))

tar.close()

Es importante mantener el formato __sparse__ de las matrices, por un tema de optimización de memoria. Es necesario tener solo en memoria los datos no nulos de la matriz. Esto es significativo cuando se trabaja con matrices muy grandes, y donde muchos de los elementos son nulos. Asi, con un formato __sparse__, no hay necesidad de desperdiciar memoria en datos que no estan, haciendo mas eficiente el procesamiento de estas matrices.

### Modelo a utilizar
En esta ocación, el modelo a utilizar para modelar este problema es el _ElasticNet_, propuesto en el paper _"Movie Reviews and Revenues: An Experiment in Text Regression"_. Es una regresion lineal que combina las penalizaciones L1 y L2. Para un $\alpha = 0$, equivale a utilizar _Ridge_ y $\alpha = 1$ a _Lasso_. 

$$\theta = argmin \frac{1}{2n} \sum_{i=1}^{n}{(y_i-(\beta_0 + x^T_i\beta) )^2 + \lambda P(\beta)}$$
$$P(\beta) = \sum_{j=1}^p (\frac{1}{2} (1-\alpha) \beta_j^2+\alpha |\beta_j|)$$

Para estimar los mejores parametros, utilizando los datos de entrenamiento, se fueron variando los valores de $\alpha$ y $\lambda$, evaluando la funcion sobre los datos de desarrollo. A medida que se consige un mejor coeficiente de ajuste, se van guardando los paramentos.


In [6]:
from sklearn.linear_model import ElasticNet

def best_params_dev(b):
    alfa = np.logspace(-2, 1, num=15, base=10)
    model = ElasticNet(fit_intercept = False)
    best_a = 0
    best_b = 0
    best_coef = 0

    for a in alfa:
        model.set_params(alpha = a, l1_ratio = b)
        model.fit(tr_x_s_matrix, tr_y_s_array)
        coef = model.score(d_x_s_matrix, d_y_s_array)
        print '{:^15} {:^10} {:^15}'.format(a, b, coef)
        if best_coef < coef:
            best_coef = coef
            best_a = a
            best_b = b
    print "Para b: {0}, el mejor a: {1} con un coeficiente de {2}".format(best_b, best_a, best_coef)        
    
    return a,b

In [7]:
a0,b0 = best_params_dev(0)

     0.01           0      0.582868481063 
0.0163789370695     0      0.615779794337 
0.0268269579528     0      0.618761707432 
0.0439397056076     0      0.613739738906 
0.0719685673001     0      0.614459741224 
0.117876863479      0      0.617550671887 
0.193069772888      0      0.622107859447 
0.316227766017      0      0.621930441703 
0.517947467923      0      0.623057637723 
0.848342898244      0      0.624057426897 
 1.38949549437      0      0.623998436493 
 2.27584592607      0      0.625515761525 
 3.72759372031      0      0.623172288767 
 6.10540229659      0      0.619454577138 
     10.0           0      0.614623372903 
Para b: 0, el mejor a: 2.27584592607 con un coeficiente de 0.625515761525


In [8]:
a1,b1 = best_params_dev(0.2)

     0.01          0.2     0.559482367068 
0.0163789370695    0.2     0.612247854794 
0.0268269579528    0.2     0.617003201363 
0.0439397056076    0.2     0.615259681129 
0.0719685673001    0.2     0.612304009372 
0.117876863479     0.2     0.615820617868 
0.193069772888     0.2     0.619387446976 
0.316227766017     0.2     0.621865703316 
0.517947467923     0.2     0.621839621145 
0.848342898244     0.2      0.62343344878 
 1.38949549437     0.2      0.62374138437 
 2.27584592607     0.2     0.625014384564 
 3.72759372031     0.2     0.624660382142 
 6.10540229659     0.2     0.621220887345 
     10.0          0.2     0.617247327497 
Para b: 0.2, el mejor a: 2.27584592607 con un coeficiente de 0.625014384564


In [9]:
a2,b2 = best_params_dev(0.4)

     0.01          0.4     0.532996655498 
0.0163789370695    0.4     0.580575630501 
0.0268269579528    0.4     0.615687046704 
0.0439397056076    0.4     0.619519258249 
0.0719685673001    0.4     0.613985708999 
0.117876863479     0.4     0.614341822763 
0.193069772888     0.4      0.61744921449 
0.316227766017     0.4     0.621970165633 
0.517947467923     0.4     0.621921702096 
0.848342898244     0.4     0.622964935371 
 1.38949549437     0.4     0.624026389635 
 2.27584592607     0.4     0.623957623102 
 3.72759372031     0.4     0.625511889925 
 6.10540229659     0.4     0.623295132268 
     10.0          0.4     0.619595163601 
Para b: 0.4, el mejor a: 3.72759372031 con un coeficiente de 0.625511889925


In [10]:
a3,b3 = best_params_dev(0.6)

     0.01          0.6     0.514741536079 
0.0163789370695    0.6     0.541454765571 
0.0268269579528    0.6     0.591971128653 
0.0439397056076    0.6     0.615727997649 
0.0719685673001    0.6     0.615155602823 
0.117876863479     0.6     0.613054982867 
0.193069772888     0.6     0.614708238457 
0.316227766017     0.6     0.617984372786 
0.517947467923     0.6     0.622321075444 
0.848342898244     0.6     0.621921749737 
 1.38949549437     0.6     0.623312455783 
 2.27584592607     0.6      0.62407005118 
 3.72759372031     0.6     0.624194505955 
 6.10540229659     0.6     0.625459499423 
     10.0          0.6     0.622672271641 
Para b: 0.6, el mejor a: 6.10540229659 con un coeficiente de 0.625459499423


In [11]:
a4,b4 = best_params_dev(0.8)

     0.01          0.8     0.528393978425 
0.0163789370695    0.8     0.525089891789 
0.0268269579528    0.8     0.522146184865 
0.0439397056076    0.8     0.567511974424 
0.0719685673001    0.8     0.614984370698 
0.117876863479     0.8     0.619859836391 
0.193069772888     0.8      0.61520845309 
0.316227766017     0.8     0.613166521544 
0.517947467923     0.8     0.616748265966 
0.848342898244     0.8     0.620515018886 
 1.38949549437     0.8     0.621833915059 
 2.27584592607     0.8     0.622247047466 
 3.72759372031     0.8     0.623665290761 
 6.10540229659     0.8     0.623764631593 
     10.0          0.8     0.625338921914 
Para b: 0.8, el mejor a: 10.0 con un coeficiente de 0.625338921914


In [12]:
a5,b5 = best_params_dev(1)

     0.01           1      -0.474810808909
0.0163789370695     1       -0.4748095844 
0.0268269579528     1      -0.474807107352
0.0439397056076     1      -0.474803619231
0.0719685673001     1      -0.474798395156
0.117876863479      1      -0.47479103477 
0.193069772888      1      -0.474777144342
0.316227766017      1      -0.474755207871
0.517947467923      1      -0.474710966352
0.848342898244      1      -0.474645219185
 1.38949549437      1      -0.474553357813
 2.27584592607      1      -0.474208237872
 3.72759372031      1      -0.390015582279
 6.10540229659      1      -0.32843855412 
     10.0           1      -0.263461550071
Para b: 0, el mejor a: 0 con un coeficiente de 0


Los mejores parametros encontrados fueron:

$\alpha = 0$ y $\lambda = 2.27584592607$

Correlación obtenida : 

In [1]:
model = ElasticNet(fit_intercept = False)
model.set_params(alpha = a0, l1_ratio = b0)
model.fit(tr_x_s_matrix, tr_y_s_array)

print "R2=%f"%model.score(t_x_s_matrix, t_y_s_array)

NameError: name 'ElasticNet' is not defined