In [4]:
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import seaborn.objects as so

from sklearn import linear_model    # Herramientas de modelos lineales
from sklearn.metrics import mean_squared_error, r2_score    # Medidas de desempeño
from sklearn.preprocessing import PolynomialFeatures    # Herramientas de polinomios

from formulaic import Formula

In [3]:
# Si necesitan instalar algún paquete
#pip install gapminder
!pip install formulaic



## Introducción: sistemas de ecuaciones lineales

Sabemos que un fondo de inversión invirtió en acciones de YPF, Santander y Nvidia (y solo en estas acciones) pero no sabemos cuántas acciones compró de cada una. ¿Cómo podemos averiguarlo? 

Suponemos que tenemos disponible:
1. La valorización del fondo al final de cada día.
2. El valor de la acción de cada empresa al cierre de cada día.

In [5]:
# Load the data: one row per day, with the fund's total valuation and the
# closing price of each of the three stocks.
dataDict = {
    'total': [170262, 169929.5, 171064, 169637.35, 164625.45],
    'YPF': [20935, 21030, 20770, 20950, 20750],
    'Santander': [20100, 20500, 21700, 21000, 20316],
    'Nvidia': [37100, 36255, 36000, 35645.5, 33878.5],
}
# Each dictionary key becomes a column of the DataFrame.
data = pd.DataFrame(dataDict)
data

Unnamed: 0,total,YPF,Santander,Nvidia
0,170262.0,20935,20100,37100.0
1,169929.5,21030,20500,36255.0
2,171064.0,20770,21700,36000.0
3,169637.35,20950,21000,35645.5
4,164625.45,20750,20316,33878.5


Nos quedamos con las primeras tres filas y resolvemos el sistema lineal:
$$total = c_1 \cdot YPF + c_2 \cdot Santander + c_3 \cdot Nvidia$$


In [6]:
# Keep only the first three rows: three equations are enough to determine the
# three unknown coefficients. A positional slice keeps working if rows are
# added to `data`, whereas a hard-coded boolean mask must match its length.
data_3rows = data.iloc[:3]
data_3rows

Unnamed: 0,total,YPF,Santander,Nvidia
0,170262.0,20935,20100,37100.0
1,169929.5,21030,20500,36255.0
2,171064.0,20770,21700,36000.0


In [13]:
# Split the 3-row system into its coefficient matrix (stock prices) and its
# right-hand side (fund totals), then show both.
X_3rows = data_3rows.loc[:, ["YPF", "Santander", "Nvidia"]]
y_3rows = data_3rows.loc[:, "total"]
display(X_3rows, y_3rows)

Unnamed: 0,YPF,Santander,Nvidia
0,20935,20100,37100.0
1,21030,20500,36255.0
2,20770,21700,36000.0


0    170262.0
1    169929.5
2    171064.0
Name: total, dtype: float64

Para obtener los valores de $c_1$, $c_2$ y $c_3$ resolvemos el sistema lineal utilizando `np.linalg.solve`.

In [18]:
# Solve the square 3x3 system X_3rows · c = y_3rows for the number of shares
# held of each stock. The pandas objects are converted to plain arrays first.
c = np.linalg.solve(X_3rows.to_numpy(), y_3rows.to_numpy())
print(c)

[3.2 2.  1.7]


In [19]:
# Sanity check: multiplying the prices by the solution should reproduce the
# three totals used to build the system.
X_3rows.dot(c)

0    170262.0
1    169929.5
2    171064.0
dtype: float64

In [20]:
# Check that the remaining equations (the rows left out of the system) are
# satisfied as well, by applying the solution to every row.
X = data.loc[:, ["YPF", "Santander", "Nvidia"]]
X.dot(c)

0    170262.00
1    169929.50
2    171064.00
3    169637.35
4    164625.45
dtype: float64

In [21]:
# ¿Cómo verificamos si coincide con los totales que teníamos?

In [22]:
# Residuals of the full system: predicted total minus observed total per row.
# All zeros means the solution matches every recorded total exactly.
x = data[["YPF", "Santander", "Nvidia"]]
y = data.loc[:, "total"]
(x @ c).sub(y)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
dtype: float64

# Caso de estudio: calorías de alimentos

In [24]:
# Read the nutrition table from disk.
# NOTE(review): the path is relative to the notebook's working directory and
# points into a personal downloads folder; adjust it to where the CSV lives.
df_nutricion = pd.read_csv(filepath_or_buffer='../Descargas/nutrition.csv')
df_nutricion

Unnamed: 0,FDC_ID,Item,Category,Calorias_kcal,Proteinas_g,Carbohidratos_g,GrasaTotal_g,Colesterol_mg,Fibra_g,Agua_g,Alcohol_g,VitaminaC_mg
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",Baked Products,307.0,5.88,41.18,13.24,0.0,1.2,35.50,,
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",Baked Products,330.0,4.34,53.42,11.27,0.0,1.4,27.86,,0.1
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",Baked Products,377.0,6.10,79.80,3.70,,,3.20,,
3,167515,"George Weston Bakeries, Thomas English Muffins",Baked Products,232.0,8.00,46.00,1.80,,,42.60,,
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",Baked Products,273.0,6.58,41.05,9.22,15.0,2.2,40.34,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7788,175300,"Game meat, buffalo, water, cooked, roasted","Lamb, Veal, and Game Products",131.0,26.83,0.00,1.80,61.0,0.0,68.81,,0.0
7789,175301,"Game meat, elk, raw","Lamb, Veal, and Game Products",111.0,22.95,0.00,1.45,55.0,0.0,74.38,,0.0
7790,175302,"Game meat, elk, cooked, roasted","Lamb, Veal, and Game Products",146.0,30.19,0.00,1.90,73.0,0.0,66.28,,0.0
7791,175303,"Game meat, goat, raw","Lamb, Veal, and Game Products",109.0,20.60,0.00,2.31,57.0,0.0,75.84,,0.0


Vemos que el DataFrame contiene muchos datos "NaN" (not a number). 

En este ejemplo consideramos que los datos faltantes representan que el alimento no contiene ese ingrediente y lo convertimos a 0.

In [25]:
# Replace every missing value (NaN) with 0, treating "not reported" as
# "not present in the food".
df_nutricion = df_nutricion.fillna(value=0)

In [26]:
# Preview the first five rows after filling the missing values.
df_nutricion.head(n=5)

Unnamed: 0,FDC_ID,Item,Category,Calorias_kcal,Proteinas_g,Carbohidratos_g,GrasaTotal_g,Colesterol_mg,Fibra_g,Agua_g,Alcohol_g,VitaminaC_mg
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",Baked Products,307.0,5.88,41.18,13.24,0.0,1.2,35.5,0.0,0.0
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",Baked Products,330.0,4.34,53.42,11.27,0.0,1.4,27.86,0.0,0.1
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",Baked Products,377.0,6.1,79.8,3.7,0.0,0.0,3.2,0.0,0.0
3,167515,"George Weston Bakeries, Thomas English Muffins",Baked Products,232.0,8.0,46.0,1.8,0.0,0.0,42.6,0.0,0.0
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",Baked Products,273.0,6.58,41.05,9.22,15.0,2.2,40.34,0.0,0.0


In [28]:
# Count the missing values left in each column (all zeros are expected).
df_nutricion.isnull().sum()

FDC_ID             0
Item               0
Category           0
Calorias_kcal      0
Proteinas_g        0
Carbohidratos_g    0
GrasaTotal_g       0
Colesterol_mg      0
Fibra_g            0
Agua_g             0
Alcohol_g          0
VitaminaC_mg       0
dtype: int64

Proponemos un modelo lineal, donde la variable respuesta es combinación lineal de todas las otras variables.

Construimos las matrices X e y utilizando Formulaic

In [27]:
# Build the response (y) and design matrix (X) from a formula: calories as a
# linear combination of every other nutritional variable.
y, X = Formula(
    'Calorias_kcal ~ Proteinas_g + Carbohidratos_g + GrasaTotal_g + Colesterol_mg + Fibra_g + Agua_g + Alcohol_g + VitaminaC_mg'
).get_model_matrix(df_nutricion)

In [33]:
# The design matrix gained an Intercept column (a constant independent term).
# Question for the reader: is an intercept reasonable in this model?
X.head(5)

Unnamed: 0,Intercept,Proteinas_g,Carbohidratos_g,GrasaTotal_g,Colesterol_mg,Fibra_g,Agua_g,Alcohol_g,VitaminaC_mg
0,1.0,5.88,41.18,13.24,0.0,1.2,35.5,0.0,0.0
1,1.0,4.34,53.42,11.27,0.0,1.4,27.86,0.0,0.1
2,1.0,6.1,79.8,3.7,0.0,0.0,3.2,0.0,0.0
3,1.0,8.0,46.0,1.8,0.0,0.0,42.6,0.0,0.0
4,1.0,6.58,41.05,9.22,15.0,2.2,40.34,0.0,0.0


In [34]:
# Preview the first five values of the response (calories).
y.head(5)

Unnamed: 0,Calorias_kcal
0,307.0
1,330.0
2,377.0
3,232.0
4,273.0


**Observación**

En este caso sencillo, podemos obtener lo mismo (excepto por la columna Intercept) utilizando una lista de columnas.


In [25]:
#X = df_nutricion[["Proteinas_g", "Carbohidratos_g", "GrasaTotal_g", "Colesterol_mg","Fibra_g", "Agua_g","Alcohol_g","VitaminaC_mg"]]
#y = df_nutricion["Calorias_kcal"]

Ajustamos el modelo lineal

In [29]:
# Create a linear regression model. X already carries an Intercept column,
# so the estimator is told not to add an intercept of its own.
modelo = linear_model.LinearRegression(fit_intercept=False)

# Fit the model to the full design matrix.
modelo.fit(X, y)

Analizamos la "bondad" del ajuste.

In [30]:
# Predictions on the same data the model was fitted on.
y_pred = modelo.predict(X)

# Coefficient of determination (R^2).
r2 = r2_score(y_true=y, y_pred=y_pred)
print('R^2: ', r2)

# Mean squared error; its square root is reported.
ecm = mean_squared_error(y_true=y, y_pred=y_pred)
print('Raiz cuadarada del ECM: ', np.sqrt(ecm))

R^2:  0.9957957009157294
Raiz cuadarada del ECM:  10.941275361853487


A priori es un buen modelo: tenemos 7792 observaciones y obtenemos un $R^2$ casi igual a 1 con solo 9 coeficientes (8 variables más el intercept).

Analizamos el modelo y veamos si podemos obtener una fórmula práctica.

In [57]:
# Fitted coefficients, one per column of X (presumably in X's column order,
# i.e. Intercept first — confirm against X.columns).
modelo.coef_

array([[-1.99226673e+01,  4.34651763e+00,  4.18103540e+00,
         9.05028271e+00,  5.63009593e-03, -1.66584211e+00,
         2.12557302e-01,  7.05470045e+00, -2.51921498e-02]])

**Ejercicio:** ¿Cómo podemos obtener estos coeficientes usando la función `solve`?

In [59]:
# Recordemos que para minimizar el error de Xc = y, tenemos que resolver X^T X c = X^T y


Analizando los coeficientes vemos que las variables Proteinas_g, Carbohidratos_g y GrasaTotal_g están entre las que tienen mayor peso en el modelo.
Veamos si podemos quedarnos solo con ellas y eliminar las demás.

**Spoiler 1:** No podemos determinar la importancia de las variables en el modelo solo mirando los coeficientes, porque no sabemos si las variables están en escalas similares. Necesitamos primero normalizar las variables para hacer este análisis!

In [50]:
# Try it anyway and see what happens: rebuild the matrices keeping only
# proteins, carbohydrates and total fat as explanatory variables.
y2, X2 = Formula(
    'Calorias_kcal ~ Proteinas_g + Carbohidratos_g + GrasaTotal_g'
).get_model_matrix(df_nutricion)

In [51]:
# Create a linear regression model. The formula used for X2 has no `- 1`
# term, so X2 carries the Intercept column and the estimator must not add one.
modelo = linear_model.LinearRegression(fit_intercept=False)

# Fit the reduced model.
modelo.fit(X2, y2)

In [52]:
# Predictions of the reduced model on the data it was fitted on.
y_pred = modelo.predict(X2)

# Coefficient of determination (R^2). Score against y2, the response built
# together with X2, instead of relying on `y` left over from an earlier cell.
r2 = r2_score(y2, y_pred)
print('R^2: ', r2)

# Mean squared error against the same response; its square root is reported.
ecm = mean_squared_error(y2, y_pred)
print('Raiz cuadarada del ECM: ', np.sqrt(ecm))

R^2:  0.9897535862208672
Raiz cuadarada del ECM:  17.080756108170736


In [44]:
# Fitted coefficients of the most recently fitted `modelo` (presumably in the
# column order of its design matrix — confirm against the matrix columns).
modelo.coef_

array([[4.07996204, 4.00204437, 3.79862627, 8.80413501]])

In [47]:
# What if we drop the intercept? The trailing `- 1` in the formula removes
# the Intercept column from the design matrix.
y3, X3 = Formula(
    'Calorias_kcal ~ Proteinas_g + Carbohidratos_g + GrasaTotal_g - 1'
).get_model_matrix(df_nutricion)
X3

Unnamed: 0,Proteinas_g,Carbohidratos_g,GrasaTotal_g
0,5.88,41.18,13.24
1,4.34,53.42,11.27
2,6.10,79.80,3.70
3,8.00,46.00,1.80
4,6.58,41.05,9.22
...,...,...,...
7788,26.83,0.00,1.80
7789,22.95,0.00,1.45
7790,30.19,0.00,1.90
7791,20.60,0.00,2.31


In [48]:
# Create a linear regression model with no intercept at all: X3 was built
# with `- 1`, so it has no Intercept column, and the estimator is told not to
# add one either.
modelo = linear_model.LinearRegression(fit_intercept=False)

# Fit the intercept-free model and predict on the same data.
modelo.fit(X3, y3)
y_pred = modelo.predict(X3)

# Coefficient of determination (R^2). Score against y3, the response built
# together with X3, instead of relying on `y` left over from an earlier cell.
r2 = r2_score(y3, y_pred)
print('R^2: ', r2)

# Mean squared error against the same response; its square root is reported.
ecm = mean_squared_error(y3, y_pred)
print('Raiz cuadarada del ECM: ', np.sqrt(ecm))

R^2:  0.9896069199334809
Raiz cuadarada del ECM:  17.202567994195224


**Spoiler 2**: ¿cómo elegimos con cuál modelo quedarnos? Un modelo con más variables va a dar siempre un error menor en los datos, pero eso ¿significa que el modelo es mejor? ¿Podemos asegurar que en otros datos nuevos también va a dar mejor?

Para ver cuál predice mejor tenemos que diseñar un esquema de validación de modelos, separar los datos en entrenamiento y testeo.