# Scikit-learn

## PCA

In [6]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [7]:
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 

# Regularization

In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [11]:
# Read the CSV at ./data/felicidad.csv (path is relative to the notebook's
# working directory) into the DataFrame used by the cells below.
dataset_path = "./data/felicidad.csv"
df = pd.read_csv(dataset_path)

In [12]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [14]:
# Predictor columns fed to the regressions.
feature_columns = [
    "gdp",
    "family",
    "lifexp",
    "freedom",
    "corruption",
    "generosity",
    "dystopia",
]
# The target is selected with a list so that `y` stays a one-column DataFrame
# (2-D), not a Series.
target_columns = ["score"]

x = df[feature_columns]
y = df[target_columns]

In [15]:
# Hold out 25% of the rows for evaluation. The split is shuffled, so a fixed
# random_state is passed to make the partition (and therefore every loss and
# coefficient computed from it) identical on each Restart & Run All.
# Numbers from earlier unseeded runs will not match exactly.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [17]:
# Baseline: ordinary least squares without any penalty term.
modelLinear = LinearRegression()
modelLinear.fit(x_train, y_train)
y_predict_linear = modelLinear.predict(x_test)

# Lasso (L1 penalty), regularization strength alpha=0.02.
modelLasso = Lasso(alpha=0.02)
modelLasso.fit(x_train, y_train)
y_predict_lasso = modelLasso.predict(x_test)

# Ridge (L2 penalty), same alpha so the three models can be compared.
modelRidge = Ridge(alpha=0.02)
modelRidge.fit(x_train, y_train)
y_predict_ridge = modelRidge.predict(x_test)

In [19]:
# Losses: mean squared error of each model's predictions on the test split.

linear_loss = mean_squared_error(y_true=y_test, y_pred=y_predict_linear)
lasso_loss = mean_squared_error(y_true=y_test, y_pred=y_predict_lasso)
ridge_loss = mean_squared_error(y_true=y_test, y_pred=y_predict_ridge)


In [20]:
# Report the three test losses, one per line, in the order the models were fit.
for label, loss in (
    ("Linear Loss: ", linear_loss),
    ("Lasso Loss: ", lasso_loss),
    ("Ridge Loss: ", ridge_loss),
):
    print(label, loss)

Linear Loss:  1.2018451762921518e-07
Lasso Loss:  0.06810434720920565
Ridge Loss:  5.739269544000182e-06


In [27]:
# Print each model's fitted coefficients under a 100-dash separator so the
# effect of the L1 / L2 penalties can be compared side by side.
separator = "-" * 100
for title, model in (
    ("Coef Linear: ", modelLinear),
    ("Coef Lasso: ", modelLasso),
    ("Coef Ridge: ", modelRidge),
):
    print(separator)
    print(title)
    print(model.coef_)

----------------------------------------------------------------------------------------------------
Coef Linear: 
[[1.00025726 0.99986265 0.9996218  1.00012427 0.99966901 1.00021152
  0.99990603]]
----------------------------------------------------------------------------------------------------
Coef Lasso: 
[1.39649214 0.91227749 0.30349073 0.69139304 0.         0.15203574
 0.88806108]
----------------------------------------------------------------------------------------------------
Coef Ridge: 
[[1.00413412 0.99883315 0.99335731 0.99882266 0.98662905 0.9937571
  0.99888869]]


# Regresiones robustas

## Identificación de valores atípicos

#### Métodos estadísticos
1. Z-score
2. DBSCAN
3. Rango intercuartílico (IQR): un valor $q$ es atípico si $q < Q_1 - 1.5 \cdot IQR$ o $q > Q_3 + 1.5 \cdot IQR$

En Scikit-learn hay varias opciones para realizar regresiones robustas:

##### RANSAC (Random Sample Consensus)

Realiza un muestreo aleatorio de los datos, buscando la muestra que tenga más "valores buenos" (inliers). Supone que los valores atípicos no tienen patrones específicos.


##### Huber Regressor
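Reduce el peso de los valores atípicos, disminuyendo su influencia en el modelo. Utiliza un parámetro epsilon (el valor recomendado estadísticamente es 1.35, que es el valor por defecto en Scikit-learn; debe ser mayor o igual a 1.0).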