# Pregunta 4

### Lectura

In [65]:
import pandas as pd
import numpy as np
import tarfile
from urllib import urlretrieve

# Remote location of the movies archive.
url = 'http://octopus.inf.utfsm.cl/~ricky/movies.tar.gz'

# Download it into the working directory. urlretrieve hands back the local
# path together with the response headers; the path is what the next cell
# opens with tarfile.
filename, headers = urlretrieve(url, filename='movies.tar.gz')

In [144]:
# Load both feature sets (base and ".stars") from the downloaded archive into
# memory: vocabulary lists, target vectors and sparse design matrices.

import scipy.io as sio
from scipy.sparse import csr_matrix

# Archive member paths for the base feature set: dev / test / train splits
# plus the vocabulary file.
dev_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/dev.x.mm"
test_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/test.x.mm"
dev_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/dev.y.dat"
test_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/test.y.dat"
train_x = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/train.x.mm"
train_y = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/train.y.dat"
vocab = "movies-preproc/ngrams-deprels-origin.runtime.budget.numscreen.ratings.seasons/vocab"

# Same layout for the feature set stored under the "...seasons.stars" directory.
dev_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/dev.x.mm"
test_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/test.x.mm"
dev_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/dev.y.dat"
test_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/test.y.dat"
train_x_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/train.x.mm"
train_y_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/train.y.dat"
vocab_stars = "movies-preproc/ngrams-deprels-fp1-origin.runtime.budget.numscreen.ratings.seasons.stars/vocab"

tar = tarfile.open(filename)
try:
    # Ask the archive for each member by name instead of scanning every entry
    # and comparing names. A missing member now raises KeyError right here,
    # rather than leaving its variable unbound (or, in a notebook, bound to a
    # stale handle from an earlier run) and failing later.
    d_x = tar.extractfile(dev_x)
    d_y = tar.extractfile(dev_y)
    t_x = tar.extractfile(test_x)
    t_y = tar.extractfile(test_y)
    tr_x = tar.extractfile(train_x)
    tr_y = tar.extractfile(train_y)
    v = tar.extractfile(vocab)

    d_x_s = tar.extractfile(dev_x_stars)
    d_y_s = tar.extractfile(dev_y_stars)
    t_x_s = tar.extractfile(test_x_stars)
    t_y_s = tar.extractfile(test_y_stars)
    tr_x_s = tar.extractfile(train_x_stars)
    tr_y_s = tar.extractfile(train_y_stars)
    v_s = tar.extractfile(vocab_stars)

    # Vocabulary files: one list entry per line of the file.
    # NOTE: this relies on read() returning str (Python 2); under Python 3 the
    # bytes would have to be decoded before splitting on "\n".
    v_array = v.read().split("\n")
    v_s_array = v_s.read().split("\n")

    # Target values, parsed from the text .dat files.
    d_y_array = np.loadtxt(d_y)
    t_y_array = np.loadtxt(t_y)
    tr_y_array = np.loadtxt(tr_y)
    d_y_s_array = np.loadtxt(d_y_s)
    t_y_s_array = np.loadtxt(t_y_s)
    tr_y_s_array = np.loadtxt(tr_y_s)

    # Design matrices: read the .mm files with mmread and convert the result
    # to CSR so the data stays in a sparse format.
    d_x_matrix = csr_matrix(sio.mmread(d_x))
    t_x_matrix = csr_matrix(sio.mmread(t_x))
    tr_x_matrix = csr_matrix(sio.mmread(tr_x))

    d_x_s_matrix = csr_matrix(sio.mmread(d_x_s))
    t_x_s_matrix = csr_matrix(sio.mmread(t_x_s))
    tr_x_s_matrix = csr_matrix(sio.mmread(tr_x_s))
finally:
    # Everything above reads through handles backed by the open archive, so it
    # is closed only after the loads, and also when one of them fails.
    tar.close()

Es importante mantener el formato __sparse__ de las matrices por un tema de optimización de memoria: solo es necesario tener en memoria los datos no nulos de la matriz. Esto es significativo cuando se trabaja con matrices muy grandes en las que muchos de los elementos son nulos. Así, con un formato __sparse__, no hay necesidad de desperdiciar memoria en datos que no están, lo que hace más eficiente el procesamiento de estas matrices.

### Modelo a utilizar
En esta ocasión, el modelo a utilizar para este problema es _ElasticNet_, propuesto para esta tarea en el paper _"Movie Reviews and Revenues: An Experiment in Text Regression"_. Es una regresión lineal que combina las normas L1 y L2; es un híbrido entre las penalizaciones de _Ridge_ y _Lasso_. Para $\alpha = 0$ equivale a utilizar _Ridge_ y para $\alpha = 1$ a _Lasso_.

$$(\hat{\beta}_0, \hat{\beta}) = \underset{\beta_0,\,\beta}{\arg\min}\; \frac{1}{2n} \sum_{i=1}^{n}{\left(y_i-(\beta_0 + x^T_i\beta)\right)^2} + \lambda P(\beta)$$
$$P(\beta) = \sum_{j=1}^p \left(\frac{1}{2} (1-\alpha) \beta_j^2+\alpha |\beta_j|\right)$$

Para estimar los mejores parámetros, se ajustó el modelo con los datos de entrenamiento variando los valores de $\alpha$ y $\lambda$, y se evaluó cada ajuste sobre los datos de desarrollo. Cada vez que se consigue un mejor coeficiente de ajuste ($R^2$), se guardan los parámetros correspondientes. En scikit-learn, $\lambda$ corresponde al parámetro `alpha` y $\alpha$ al parámetro `l1_ratio`.

In [None]:
def best_params_dev():
    """Grid-search ElasticNet hyper-parameters against the development set.

    For every ``l1_ratio`` in ``np.arange(0., 1.2, 0.2)`` and every ``alpha``
    in 30 log-spaced values between 1e-2 and 1e1, the model is fitted on the
    module-level training data (``tr_x_matrix``, ``tr_y_array``) and scored
    with ``model.score`` on the development data (``d_x_matrix``,
    ``d_y_array``).

    Returns:
        tuple: ``(alpha, l1_ratio)`` of the combination with the highest
        development score. Both are ``None`` if no score could be compared
        (e.g. every score was NaN).
    """
    ratio = np.arange(0., 1.2, 0.2)
    alfa = np.logspace(-2, 1, num=30, base=10)
    model = ElasticNet(fit_intercept=False)

    best_a = None
    best_b = None
    # The score can be negative, so start below any attainable value; a start
    # of 0 would reject every candidate when all fits score below zero.
    best_coef = -np.inf

    for b in ratio:
        for a in alfa:
            model.set_params(alpha=a, l1_ratio=b)
            model.fit(tr_x_matrix, tr_y_array)
            coef = model.score(d_x_matrix, d_y_array)
            if coef > best_coef:
                best_coef = coef
                best_a = a
                best_b = b

    # Return the tracked optimum, not the loop variables: ``a`` and ``b`` only
    # hold the last grid point visited.
    return best_a, best_b

# Run the search once and keep the returned pair at module level: the final
# model cell passes `a` as alpha and `b` as l1_ratio.
a,b = best_params_dev()

In [None]:
# Refit on the training split using the hyper-parameters selected above
# (a -> alpha, b -> l1_ratio), still without an intercept term.
model = ElasticNet(alpha=a, l1_ratio=b, fit_intercept=False)
model.fit(tr_x_matrix, tr_y_array)

# Report the fitted model's score on the test split.
print("R2=%f" % model.score(t_x_matrix, t_y_array))