# Boosting para el dataset EuroSAT

# 1. Acceso a drive

In [None]:
# Mount Google Drive at /content/drive; the dataset path used further down
# lives under that mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Importando módulos

In [None]:
import ____ as pd
import os
from sklearn.preprocessing import _____ #Codificación
from sklearn.preprocessing import _____ #Escalador
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import ____ as np
import itertools
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as ____
import cv2
import random
import _____ as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

# 3. Lectura de los datos

In [None]:
# Shared-drive folder with the course material and, inside it, the CSV that
# is loaded as the training table (presumably EuroSAT features - confirm).
path = r'/content/drive/Shareddrives/Data Science para Geociencias/6. Métodos de ML/6.2 Árbol de decisión'
train_path = os.path.join(path,'EUROSAT_TRAIN_FEAT.csv')

In [None]:
# The blanks (_____ / ____) are exercise placeholders to be completed.
# Load the CSV into a DataFrame, collect the distinct values of the 'label'
# column and preview the first rows.
train_df = pd.read_csv(_____)
clases = ____['label'].unique()
train_df.head()

In [None]:
# Class names in sorted order. LabelEncoder (used further down) numbers the
# classes following the sorted order of the labels, and `pesos` is built with
# sort_index(); unique() alone returns the labels in order of first
# appearance, which can attach the wrong name to a code in the plots and
# metric tables. Sorting keeps names, codes and weights aligned.
clases = sorted(train_df['label'].unique().tolist())
# Number of classes (blank left for the exercise).
n_clases = len(____)
print(clases)

Cálculo de los pesos: proporción de muestras de cada clase respecto al total

In [None]:
# Weight of each class = number of rows with that label (ordered by label via
# sort_index) divided by the total number of rows. Dividing the list by a
# NumPy scalar yields a NumPy array.
pesos = train_df['label'].value_counts().sort_index().tolist()/ np.sum(train_df['label'].value_counts().tolist())
# Blank left for the exercise.
print(____)

# 3. Codificación de las etiquetas y escalamiento

In [None]:
# Fit the encoder on the label column, then overwrite that column with the
# resulting integer codes. `le` is kept so the mapping stays available.
le = LabelEncoder().fit(train_df['label'].to_numpy())
train_df['label'] = le.transform(train_df['label'].to_numpy())

In [None]:
# Rescale every column except 'label' to the [0, 1] range and write the
# result back in place. The scaler method name is an exercise blank.
scaler = MinMaxScaler(feature_range=(0, 1))
train_df.loc[:, train_df.columns != 'label'] = scaler._____(train_df.loc[:, train_df.columns != 'label'])

# 4. Análisis de componentes principales

In [None]:
# A fractional n_components asks PCA to keep as many components as needed to
# reach that share of explained variance (here 80%).
pca = PCA(0.8)
# Pick the features by name, exactly as the scaling cell does, instead of by
# position: the label can then never leak into the PCA input, whatever its
# column position is.
pc = pca.fit_transform(train_df.loc[:, train_df.columns != 'label'])
# Reuse train_df's index so the concat below pairs every projected row with
# its own label even if the index is not the default 0..n-1.
df_pca = pd.DataFrame(data=pc,
                      columns=range(pc.shape[1]),
                      index=train_df.index)
# Append the encoded label as the last column of the PCA table.
df_pca = pd.concat([df_pca, train_df['label']], axis=1)

In [None]:
# Report how many components were kept and the variance they explain in
# total. The blanks in the second print are exercise placeholders.
print('Número de componentes principales: %s'%len(pca.explained_variance_ratio_))
print('Varianza acumulada con %s componentes: %s'%(len(____.explained_variance_ratio_),np.sum(____.explained_variance_ratio_)))

In [None]:
# Preview the first rows of the PCA table (components followed by the label).
df_pca.head()

# 5. Evaluación para los modelos

In [None]:
def plot_confusion_matrix(cm, classes, tit, normalize=False):
    """Draw a confusion matrix as an annotated heat map, save it and show it.

    Args:
        cm: 2-D NumPy array of counts. Rows are drawn on the axis labelled
            'Clase Verdadera' and columns on the axis labelled
            'Clase Predicha'.
        classes: Tick labels, one per row/column of ``cm``.
        tit: Figure title (and output file stem) used when ``normalize`` is
            False.
        normalize: When True every row is divided by its own total, cells are
            printed with two decimals and the fixed title
            'Matriz de confusión normalizada' is used instead of ``tit``.

    Side effects:
        Writes ``<title>.png`` in the current working directory and displays
        the figure.
    """
    if normalize:
        # keepdims=True keeps the totals as a column vector, so row i is
        # divided by the total of row i. Dividing by the flat
        # cm.sum(axis=1) broadcasts along the last axis and scales column j
        # by the total of row j instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row without samples would give 0/0; dividing by 1 leaves it at 0.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        title, fmt = 'Matriz de confusión normalizada', '.2f'
    else:
        title, fmt = tit, 'd'

    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar(pad=0.05)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=40)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Clase Verdadera')
    plt.xlabel('Clase Predicha')
    # Finalise grid and layout before saving so the PNG matches what is shown
    # (axis labels included in the layout, no grid lines over the cells).
    plt.grid(False)
    plt.tight_layout()
    plt.savefig(title + '.png')
    plt.show()

def _safe_ratio(num, den):
  """Return num/den as a float, or 0.0 when the denominator is zero."""
  return float(num) / float(den) if den else 0.0


def sens_spec(cls_names, y_true, y_pred, pesos):
  """Compute one-vs-rest metrics per class plus their weighted average.

  Args:
    cls_names: Iterable with the class values as they appear in the labels.
    y_true: True labels (array-like).
    y_pred: Predicted labels (array-like, same length as ``y_true``).
    pesos: One weight per class, in the same order as ``cls_names``.

  Returns:
    Five lists of length ``n_classes + 1``; the last entry of each is the
    ``pesos``-weighted sum of the per-class values (NaN for the last list):
      * sensitivity: TP / (TP + FN).
      * specificity: despite the name this holds the false positive rate
        FP / (TN + FP), i.e. 1 - specificity; callers in this file label
        it 'FPR', so the value is kept as is.
      * prec: precision TP / (TP + FP).
      * fscore: harmonic mean of precision and sensitivity.
      * sd: absolute deviation of each class sensitivity from the weighted
        average sensitivity.
    A ratio whose denominator is zero (class absent or never predicted) is
    reported as 0.0 instead of NaN.
  """
  # Arrays are required for the element-wise comparisons below; plain lists
  # would compare as a whole and yield a single bool.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  sensitivity, specificity, prec, fscore = [], [], [], []
  for name in cls_names:
    is_true = y_true == name
    is_pred = y_pred == name
    TP = np.sum(is_true & is_pred)
    TN = np.sum(~is_true & ~is_pred)
    FP = np.sum(~is_true & is_pred)
    FN = np.sum(is_true & ~is_pred)

    recall = _safe_ratio(TP, TP + FN)
    precision = _safe_ratio(TP, TP + FP)
    sensitivity.append(recall)
    specificity.append(_safe_ratio(FP, TN + FP))
    prec.append(precision)
    fscore.append(_safe_ratio(2 * precision * recall, precision + recall))

  n_classes = len(sensitivity)
  # Append the weighted average as an extra trailing entry of every list.
  for values in (sensitivity, specificity, prec, fscore):
    values.append(sum(v * w for v, w in zip(values, pesos)))

  sd = [abs(s - sensitivity[-1]) for s in sensitivity[:n_classes]]
  sd.append(np.nan)  # no deviation is defined for the average row itself
  return sensitivity, specificity, prec, fscore, sd

In [None]:
# Feature matrix and target vector for the models. Features are selected by
# name (same mask as the scaling cell) so the label column is excluded
# whatever its position. To train on the principal components instead, build
# X and y from df_pca in the same way.
X = np.array(train_df.loc[:, train_df.columns != 'label'])
# Blank left for the exercise.
y = np.array(_____['label'])

# 6. Árbol de decisión (baseline)

In [None]:
# Baseline classifier (decision tree limited to depth 5) and a shuffled
# 10-fold splitter.
# NOTE(review): neither object sets random_state, so results vary between runs.
cart =  DecisionTreeClassifier(max_depth=5)
kfold = KFold(n_splits=10, shuffle=True)

In [None]:
# Fold-wise accuracy of the baseline tree using the shuffled splitter.
score = cross_val_score(estimator=cart, X=X, y=y, scoring="accuracy", cv=kfold)

# Out-of-fold prediction for every sample (its own 10-fold split, cv=10).
y_pred = cross_val_predict(estimator=cart, X=X, y=y, cv=10)

# float32 copies of the labels; the metrics cell below reuses both arrays.
predictions = y_pred.astype(np.float32)
true_labels = y.astype(np.float32)

# Confusion matrix over the encoded classes 0..n_clases-1, drawn with the
# class names as tick labels.
tit = 'Matriz de confusión CART'
cnf_matrix = confusion_matrix(y_true=true_labels, y_pred=predictions,
                              labels=range(n_clases))
plot_confusion_matrix(cm=cnf_matrix, classes=clases, tit=tit, normalize=False)

In [None]:
# Per-class metrics of the baseline plus the weighted-average entry that
# sens_spec appends at the end of every list.
sensitivity, FPR, prec, fscore, sd = sens_spec(
    range(n_clases), true_labels, predictions, pesos)

# One row per class and a final 'Promedio' row for the average.
ind = clases + ['Promedio']
d = {'TPR': sensitivity, 'FPR': FPR, 'Acc': prec, 'F-score': fscore}
df = pd.DataFrame(data=d, index=ind)

# zip stops at the shorter sequence, so when pesos has one entry per class
# the trailing average stored in prec is left out of this sum.
precprom = sum(p * w for p, w in zip(prec, pesos))
index = df.index
index.name = 'Acc Pond: %s' % precprom

# Show the table as an annotated heat map.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.heatmap(df, annot=True)

# ¿Cuántos weak learners son necesarios?

In [None]:
# Cross-validated accuracy of AdaBoost for a growing number of weak learners.
Xclassifiers = []   # ensemble sizes that were evaluated
Yscore = []         # mean accuracy obtained for each size
kfold = KFold(n_splits=2, shuffle=True)
cart =  DecisionTreeClassifier(max_depth=5)
# Step through the sizes directly (1, 51, 101, ..., 451) instead of looping
# 500 times and skipping most iterations.
for i in range(0, 500, 50):
  # The weak learner is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in newer scikit-learn releases, and the
  # positional form is accepted by both. The blank is part of the exercise.
  boost = AdaBoostClassifier(____, n_estimators=i+1)
  # Accuracy on each of the two folds (blank left for the exercise).
  score = cross_val_score(____, X, y, cv=kfold, scoring="accuracy")
  # Keep the mean over the folds: the plot's y axis is 'Score Promedio' and
  # the progress line prints the mean, so one value per size is what the
  # curve needs (appending the raw array draws one line per fold).
  mean_score = np.mean(score)
  Yscore.append(mean_score)
  Xclassifiers.append(i+1)
  print("iter: %s, score: %s"%(i+1,mean_score))


In [None]:
# Plot the score stored for every ensemble size and print the best value.
# NOTE(review): if Yscore holds per-fold arrays rather than one mean per
# size, plot() draws one line per fold and np.max() is taken over all folds.
plt.figure()
plt.plot(Xclassifiers, Yscore, lw=2)
plt.xlabel('Clasificadores')
plt.ylabel('Score Promedio')
plt.title('Score vs Classificadores: AdaBoost + CART')
print(np.max(Yscore))
plt.grid()
plt.show() 

# 7. Boosting con CART

In [None]:
# Final ensemble: 500 boosted depth-5 trees, scored with a shuffled 2-fold split.
cart = DecisionTreeClassifier(max_depth=5)
kfold = KFold(n_splits=2, shuffle=True)
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted by both. The blank is part of the exercise.
boost = AdaBoostClassifier(____, n_estimators=500)

In [None]:
# Accuracy on each fold of the splitter defined above; the estimator to
# evaluate is an exercise blank.
score = cross_val_score(____, X, y, cv=kfold, scoring="accuracy")
print(score)

[0.833      0.84222222]


In [None]:
# Out-of-fold predictions of the boosted model for every sample.
# NOTE(review): cv=2 builds its own split instead of reusing `kfold`.
y_pred = cross_val_predict(boost, X, y, cv=2)

In [None]:
# float32 copies of the labels; the metrics cell below reuses both arrays.
predictions = y_pred.astype(np.float32)
true_labels = y.astype(np.float32)

# Confusion matrix over the encoded classes 0..n_clases-1, drawn with the
# class names as tick labels.
tit = 'Matriz de confusión AdaBoost'
cnf_matrix = confusion_matrix(y_true=true_labels, y_pred=predictions,
                              labels=range(n_clases))
plot_confusion_matrix(cm=cnf_matrix, classes=clases, tit=tit, normalize=False)

In [None]:
# Per-class metrics of the boosted model plus the weighted-average entry that
# sens_spec appends at the end of every list.
sensitivity, FPR, prec, fscore, sd = sens_spec(
    range(n_clases), true_labels, predictions, pesos)

# One row per class and a final 'Promedio' row for the average.
ind = clases + ['Promedio']
d = {'TPR': sensitivity, 'FPR': FPR, 'Acc': prec, 'F-score': fscore}
df = pd.DataFrame(data=d, index=ind)

# zip stops at the shorter sequence, so when pesos has one entry per class
# the trailing average stored in prec is left out of this sum.
precprom = sum(p * w for p, w in zip(prec, pesos))
index = df.index
index.name = 'Acc Pond: %s' % precprom

# Show the table as an annotated heat map.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.heatmap(df, annot=True)