# Scikit-learn

## PCA

In [47]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 

# Regularization

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv("./data/felicidad.csv")

In [5]:
df.head()

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [6]:
x = df[["gdp", "family", "lifexp", "freedom", "corruption", "generosity", "dystopia"]]
y = df[["score"]]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [8]:
modelLinear = LinearRegression().fit(x_train, y_train)
y_predict_linear = modelLinear.predict(x_test)

modelLasso = Lasso(alpha=0.02).fit(x_train, y_train)
y_predict_lasso = modelLasso.predict(x_test)

modelRidge = Ridge(alpha=0.02).fit(x_train, y_train)
y_predict_ridge = modelRidge.predict(x_test)

In [9]:
# perdidas

linear_loss = mean_squared_error(y_test, y_predict_linear)
lasso_loss = mean_squared_error(y_test, y_predict_lasso)
ridge_loss = mean_squared_error(y_test, y_predict_ridge)


In [10]:
print("Linear Loss: ", linear_loss)
print("Lasso Loss: ", lasso_loss)
print("Ridge Loss: ", ridge_loss)

Linear Loss:  8.145783653455077e-08
Lasso Loss:  0.03851775568489931
Ridge Loss:  4.472890885791426e-06


In [11]:
print("-"*100)
print("Coef Linear: ")
print(modelLinear.coef_)
print("-"*100)
print("Coef Lasso: ")
print(modelLasso.coef_)
print("-"*100)
print("Coef Ridge: ")
print(modelRidge.coef_)

----------------------------------------------------------------------------------------------------
Coef Linear: 
[[1.00008309 0.99993268 0.99993731 0.99996598 0.99986044 1.00021989
  0.99991989]]
----------------------------------------------------------------------------------------------------
Coef Lasso: 
[1.31424058 0.901335   0.41930847 0.71924042 0.         0.39484805
 0.9223729 ]
----------------------------------------------------------------------------------------------------
Coef Ridge: 
[[1.0036438  0.99843124 0.99458349 0.99966228 0.98374243 0.99607552
  0.99915629]]


# Regresiones robustas

## Identificación de valores atípicos

#### Métodos estadísticos
1. Z-score
2. DBSCAN
3. si q < Q1-1.5*IQR o 1 > Q3+1.5*IQR 

En Scikit-learn hay dos varias opciones para realizar regresiones robustas:

##### RANSAC (Random Sample Consensus)

Muestreo aleatorio buscando la que tenga más "valores buenos". Supone que los valores atípicos no tienen patriones específicos.


##### Huber Reggresor
Disminuye los valores atípicos, disminuyendo su influencia en el modelo. Utiliza un epsilon (el mejor estadísticamente es 0.35)

# Métodos de ensamble
Combinan diferentes modelos y se aplican un método para lograr consenso.

## Algoritmo de Bagging (Bootstrap AGGgragation)
Se relizan particiones y se entrenan por separado (con el mismo o con diferentes modelos) y al final se elige la opción que tenga más votos (haya salido más) o el promedio de las soluciones.

## Algoritmo de Boosting (impulsar/propulsar)
El output (resultado y error) es dado a otro método junto con los datos iniciales

In [13]:
## Bagging
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
df_heart = pd.read_csv("./data/heart.csv")

X = df_heart.drop(["target"], axis=1)
y = df_heart["target"]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [18]:
knn_class = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn_class.predict(X_test)

print(accuracy_score(knn_pred, y_test))

0.7142857142857143


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [19]:
bag_class = BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=50).fit(X_train, y_train)
bag_pred = bag_class.predict(X_test)
print(accuracy_score(bag_pred, y_test))

0.7597402597402597


In [24]:
# Boosting, gradient tree
from sklearn.ensemble import GradientBoostingClassifier

boost = GradientBoostingClassifier(n_estimators=50).fit(X_train, y_train)
boost_pred = boost.predict(X_test)

print(accuracy_score(boost_pred, y_test))


0.9090909090909091


# Clustering

In [27]:
# Batch k-means

from sklearn.cluster import MiniBatchKMeans

df = pd.read_csv("./data/candy.csv")

X = df.drop("competitorname", axis=1)

kmeans = MiniBatchKMeans(n_clusters=4, batch_size=8).fit(X)

kmeans.predict(X)



array([1, 1, 3, 0, 2, 2, 2, 3, 0, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2, 3, 2, 0,
       2, 2, 2, 2, 3, 2, 1, 0, 0, 2, 1, 1, 0, 2, 1, 2, 1, 2, 2, 2, 1, 1,
       3, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 3, 1, 3, 0, 3, 1, 2, 0, 0, 1, 2,
       2, 2, 1, 3, 3, 3, 3, 2, 2, 0, 0, 2, 2, 1, 0, 0, 0, 0, 2])

In [28]:
df["group"]= kmeans.predict(X)

## Mean-Shift
Para determinar cantidad de grupoa.

In [32]:
from sklearn.cluster import MeanShift

In [34]:
meanshift = MeanShift().fit(X)
meanshift.labels_

array([2, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 2, 1, 2, 0, 1, 2, 0, 0, 1, 2, 2, 0, 1, 2, 2, 2, 1, 1, 1, 2, 2,
       0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 1, 0, 0, 2, 2,
       2, 1, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 0, 0, 1],
      dtype=int64)

# Validación cruzada (cross-validation)

In [41]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold

In [42]:
df_felicidad = pd.read_csv("./data/felicidad.csv")

In [43]:
X = df_felicidad.drop(["country", "score"], axis=1)
y = df_felicidad["score"]

In [45]:
model = DecisionTreeRegressor()
score = cross_val_score(model, X, y,cv=3, scoring="neg_mean_squared_error")
score

array([-0.8214714 , -0.15421774, -0.74427378])

In [48]:
np.mean(score)

-0.5733209746042852

In [49]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for train, test in kf.split(df_felicidad):
    print(train)

[  0   1   2   3   4   5   6   7   8  10  13  14  16  17  20  21  23  25
  28  32  33  34  35  37  38  39  40  41  43  44  46  47  48  49  50  52
  53  54  57  58  59  61  62  63  64  67  70  71  72  73  74  77  80  83
  87  88  89  91  92  94  97  98  99 100 101 102 103 104 105 106 107 108
 110 111 112 113 114 115 116 120 121 123 125 127 128 129 130 132 134 135
 136 139 140 143 144 145 146 148 149 150 151 152 154]
[  1   2   3   6   8   9  11  12  13  14  15  17  18  19  20  21  22  24
  26  27  29  30  31  36  37  38  42  45  48  50  51  52  54  55  56  57
  58  59  60  63  65  66  68  69  71  72  74  75  76  78  79  81  82  83
  84  85  86  87  88  89  90  91  92  93  95  96  99 100 102 103 106 107
 109 112 115 116 117 118 119 120 121 122 124 126 128 129 130 131 132 133
 135 137 138 139 140 141 142 145 147 149 152 153 154]
[  0   4   5   7   9  10  11  12  15  16  18  19  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  39  40  41  42  43  44  45  46  47
  49  51  53  55

# Optimización de parámetros
Modulo de optimización paramétrica

- manual
- grilla de parámetros (GridSearchCV)
- parametrización aleatorizada (RandomizedSearchCV)

In [50]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [52]:
df_felicidad.head()

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [54]:
reg = RandomForestRegressor()

parametros = {
    "n_estimators": range(4,16),
    "criterion": ["mse", "mae"],
    "max_depth": range(2,11)
}

rand_est = RandomizedSearchCV(reg, parametros, n_iter=100, cv=3, scoring="neg_mean_absolute_error").fit(X_train, y_train)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [55]:
rand_est.best_estimator_

RandomForestRegressor(criterion='mse', max_depth=9, n_estimators=5)