# 1. Árbol de decisión para clasificación

**Objetivo:** entrenar y probar un modelo de árbol de decisión para clasificar tipo de uso de suelo a partir de imágenes satelitales.


Este dataset es usado para clasificar el uso de suelo en imágenes geoespaciales. 
https://www.kaggle.com/apollo2506/eurosat-dataset

**Información de las características**
Este dataset contiene imágenes que pertenecen al dataset de EuroSAT. Hay 10 carpetas (una por clase):
* 0 AnnualCrop
* 1 Forest
* 2 HerbaceousVegetation
* 3 Highway
* 4 Industrial
* 5 Pasture
* 6 PermanentCrop
* 7 Residential
* 8 River
* 9 SeaLake


**Número de instancias:** 27000

# 2. Autenticación a Drive

In [1]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 3. Importando librerías

In [None]:
import pandas as ____
import os
import numpy as ____
import itertools
from sklearn.metrics import confusion_matrix
import ____.pyplot as ____
import cv2
import random
import _____ as sns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import metrics

# 4. Lectura del archivo

In [None]:
# Base folder on the shared drive that contains the feature CSV files.
path = r'/content/drive/Shareddrives/Data Science para Geociencias/6. Métodos de ML/6.3 Árbol de decisión'
# Both CSV locations are derived from the same base folder in one step.
train_path, test_path = (
    os.path.join(path, csv_name)
    for csv_name in ('EUROSAT_TRAIN_FEAT.csv', 'EUROSAT_TEST_FEAT.csv')
)

In [None]:
# Load the training and test tables into DataFrames, then preview the first
# rows of the training table.
train_df = pd.read_csv(____)
test_df = pd.______(____)
train_df.head()

In [None]:
# Distinct class names, in sorted order. Series.unique() alone returns values
# in order of first appearance, whereas LabelEncoder (used later on this
# column) numbers classes in sorted order. Sorting first keeps clases[i]
# aligned with encoded label i wherever `clases` is used to label plots and
# tables.
clases = train_df['label'].sort_values().unique()

# 5. Exploración de datos (EDA)

In [None]:
# Preview of the raw images: one row per class folder, `n_samples` randomly
# chosen images per row, with the folder name as the row label.
train_path_or = r"/content/drive/My Drive/EUROSAT/train"
# Sorted so the row order is the same on every run (os.listdir order is arbitrary).
folders = sorted(os.listdir(train_path_or))
n_samples = 7  # images displayed per class

plt.figure(figsize=(15,15))
for i,folder in enumerate(folders):
    path_folder = os.path.join(train_path_or, folder)
    imgs_list = os.listdir(path_folder)
    random.shuffle(imgs_list)
    # Slicing (rather than indexing 0..n_samples-1) tolerates folders that
    # hold fewer than n_samples images.
    for j, img_name in enumerate(imgs_list[:n_samples]):
        img_path = os.path.join(path_folder, img_name)
        # The subplot grid is sized from the data instead of a fixed 10x10,
        # so no columns are left empty and any number of folders fits.
        plt.subplot(len(folders), n_samples, i*n_samples+j+1)
        img = plt.imread(img_path)
        plt.imshow(img)
        plt.tick_params(axis='both',which='both', bottom=False, top=False, left=False, right=False,
                        labelbottom=False, labelleft=False)
        # grid() takes a boolean: the string 'off' is truthy and would switch
        # the grid ON. It also acts on the current axes only, so it is called
        # once per subplot.
        plt.grid(False)
        if j==0:
            plt.ylabel(folder)
plt.show()

# 6. Limpieza de datos

#### a) Escalamiento

In [None]:
# Min-max scale every feature column (all columns except 'label') to [0, 1].
# The scaler is fitted on the training features here so the same fitted
# scaler can be reused on the other splits.
scaler = MinMaxScaler(feature_range=(0, 1))
____.loc[:, train_df.columns != 'label'] = scaler.fit_transform(train_df.loc[:, train_df.columns != 'label'])

In [None]:
# Apply the scaler fitted on the training features to the other splits
# (transform only, no refitting).
test_df.loc[:, test_df.columns != 'label'] = scaler.transform(test_df.loc[:, test_df.columns != 'label'])
# NOTE(review): val_df is not created in any cell visible in this notebook
# (only train_df and test_df are read) — confirm where the validation split
# is loaded, otherwise this line raises NameError.
val_df.loc[:, val_df.columns != 'label'] = scaler.transform(val_df.loc[:, val_df.columns != 'label'])

#### b) Codificación de etiquetas

In [None]:
# Encode the training labels with a LabelEncoder and store the result back
# in the 'label' column.
le = LabelEncoder()
train_df['label'] = le._____(train_df.label.values)

In [None]:
# Encode the labels of the other splits with the same encoder object.
test_df['label'] = le.transform(test_df.label.values)
# NOTE(review): val_df is not created in any cell visible in this notebook —
# confirm where the validation split is loaded before running this line.
val_df['label'] = le._____(val_df.label.values)

#### c) Selección de características

In [None]:
# Keep the k=50 features ranked best by the f_classif score against the label.
fs = SelectKBest(score_func=f_classif, k=50)
# Assumes 'label' is the last column of train_df: every column before it is
# treated as a feature, and the last one as the target.
np_X = train_df.iloc[:,:-1].to_numpy()
np_Y = train_df.iloc[:,-1]._____()
X_selec = fs.fit_transform(np_X, np_Y)
attr_names = train_df.columns.values.tolist()
# Boolean mask with one entry per feature column: True where the feature was kept.
mask = fs.get_support()
# Names of the K best features. zip() stops at the end of `mask`, so the
# trailing 'label' entry of attr_names is never paired. A comprehension is
# used so that no loop variable (previously one named `bool`, which shadowed
# the builtin for every later cell) is left behind in the notebook namespace.
new_features = [feature for selected, feature in zip(mask, attr_names) if selected]
new_train_df = pd.DataFrame(X_selec, columns=new_features)
# X_selec rows follow train_df row order, so the labels are attached by
# position rather than by index alignment.
new_train_df['label'] = train_df['label'].to_numpy()

In [None]:
# Test split restricted to the selected feature columns, with its label
# column re-attached as the last column.
new_test_df = test_df[new_features].assign(label=test_df['label'])

In [None]:
# Same column subset for the validation split, followed by its labels.
# NOTE(review): val_df is not created in any cell visible in this notebook —
# confirm where the validation split is loaded before running this cell.
new_val_df = val_df[new_features].____()
new_val_df['label'] = val_df['label']

# 7. Modelo de árbol de decisión CART usando Holdout validation


a) Transformación de data frames a arreglos de numpy

In [None]:
# Split each table into features (every column but the last) and labels
# (the last column).
Xtrain = new_train_df.iloc[:,:-1].to_numpy() 
Xtest = new_test_df.iloc[:,:-1].______() 
Ytrain = new_train_df.iloc[:,-1].to_numpy() 
Ytest = new_test_df.iloc[:,-1].to_numpy() 

b) Definiendo la semilla aleatoria y el número de clases

In [None]:
# Fixed random seed.
# NOTE(review): presumably intended for the classifier's random_state — confirm.
seed = 6

In [None]:
# Number of land-use classes (the ten class folders listed in the introduction).
n_classes = 10

c) Instanciando un árbol de decisión

In [None]:
# Decision tree classifier with its depth capped at 3.
dectree = DecisionTreeClassifier(random_state=____, max_depth = 3)

d) Ajustando el árbol con los datos de entrenamiento

In [None]:
# Fit the tree; the value returned by fit() is bound back to `dectree`.
dectree = dectree.fit(____, _____)

In [None]:
# Names of the feature columns (every column of new_train_df except the last).
feat_names =  new_train_df.iloc[:,:-1].columns.tolist()
print(_____)

In [None]:
# Draw the fitted tree. Class names are taken from the fitted LabelEncoder so
# that entry i is the name of encoded label i (the order plot_tree expects),
# and they are passed as a plain list rather than a NumPy array.
plt.figure(figsize = (25,12))
plot_tree(dectree, feature_names = feat_names, class_names = le.classes_.tolist(), filled = True)
plt.show()

# 8. Prediciendo para los datos de prueba

In [None]:
# Predict class labels with the fitted tree.
y_pred = dectree.predict(____)

a) Calculando el rendimiento general del modelo

In [None]:
# Overall accuracy against the true test labels, then print it.
score = metrics.accuracy_score(Ytest, ____)
print("Test Acc: %s"%____)

b) Predicciones vs etiquetas verdaderas

In [None]:
# Cast to float32.
# NOTE(review): presumably the predicted and the true test labels, going by
# the variable names — confirm when filling the placeholders.
predictions = np.float32(____)
true_labels = np.float32(____)

c) Matriz de confusión para evaluar los errores

In [None]:
def plot_confusion_matrix(cm, classes, tit, normalize=False):
    """Draw a confusion matrix as an annotated heat map, save it and show it.

    Args:
        cm: 2-D array of counts. Rows are labelled as the true class and
            columns as the predicted class.
        classes: Tick labels, one per row/column, in matrix order.
        tit: Figure title used when ``normalize`` is False.
        normalize: When True, every row is divided by its own total so each
            cell holds the fraction of that true class. The title is then
            fixed to 'Matriz de confusión normalizada' and ``tit`` is ignored.

    Side effects:
        Saves the figure as ``<title>.png`` in the current working directory
        and displays it.
    """
    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float')
        # keepdims=True keeps the totals as a column vector, so each ROW is
        # divided by its own total. A flat vector would broadcast along the
        # columns and divide column j by the total of row j.
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without samples stay at 0 instead of becoming NaN.
        row_totals[row_totals == 0] = 1.0
        cm = cm / row_totals
        title, fmt = 'Matriz de confusión normalizada', '.2f'
    else:
        title, fmt = tit, 'd'
    plt.figure(figsize=(10,8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar(pad=0.05)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=40)
    plt.yticks(tick_marks, classes)
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max()/2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('Clase Verdadera')
    plt.xlabel('Clase Predicha')
    # Called after the axis labels are set so the layout leaves room for them.
    plt.tight_layout()
    plt.savefig(title+'.png')
    plt.show()

In [None]:
# Confusion matrix restricted to the label ids given by `labels`, drawn with
# raw counts (normalize=False).
# NOTE(review): the tick labels come from `clases`; verify its order matches
# the encoded label ids 0..n-1.
cnf_matrix = confusion_matrix(_____, _____, labels=range(_____))
tit = 'Matriz de confusión árbol de decisión (CART)'
plot_confusion_matrix(cnf_matrix,clases, tit, normalize=False)

d) Otras métricas para evaluar el rendimiento

In [None]:
def sens_spec(cls_names, y_true, y_pred, pesos):
    """Compute one-vs-rest metrics per class plus their weighted average.

    Args:
        cls_names: Iterable of class identifiers, compared with ``==`` against
            the entries of ``y_true`` and ``y_pred``.
        y_true: True labels (array-like).
        y_pred: Predicted labels (array-like, same length as ``y_true``).
        pesos: One weight per class, used for the weighted average.

    Returns:
        Tuple ``(sensitivity, fpr, prec, fscore, sd)`` of lists. The first
        four hold one value per class followed by the weighted average:
        sensitivity is TP/(TP+FN); fpr is FP/(TN+FP), i.e. the false-positive
        rate, which is 1 - specificity and not specificity itself, despite
        the function name; prec is TP/(TP+FP); fscore is the harmonic mean of
        prec and sensitivity. ``sd`` holds the absolute deviation of each
        class sensitivity from the weighted average, followed by NaN.

        Any ratio whose denominator is zero (for example the precision of a
        class that is never predicted) is reported as 0.0 instead of NaN, so
        a single such class no longer turns the averages into NaN.
    """
    # Plain lists would make `y_true == name` a single bool, not a mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def _ratio(num, den):
        # Zero-safe division returning a plain float.
        return float(num) / float(den) if den else 0.0

    sensitivity = []
    fpr = []
    prec = []
    fscore = []
    for name in cls_names:
        is_true = y_true == name
        is_pred = y_pred == name
        TP = np.sum(is_true & is_pred)
        TN = np.sum(~is_true & ~is_pred)
        FP = np.sum(~is_true & is_pred)
        FN = np.sum(is_true & ~is_pred)
        recall = _ratio(TP, TP + FN)
        precision = _ratio(TP, TP + FP)
        sensitivity.append(recall)
        fpr.append(_ratio(FP, TN + FP))
        prec.append(precision)
        fscore.append(_ratio(2 * precision * recall, precision + recall))

    # Append the weighted average as the last entry of every metric list.
    for values in (sensitivity, fpr, prec, fscore):
        values.append(sum(x * w for x, w in zip(values, pesos)))

    mean_sensitivity = sensitivity[-1]
    sd = [abs(value - mean_sensitivity) for value in sensitivity[:-1]]
    sd.append(np.nan)  # no deviation is defined for the average row itself
    return sensitivity, fpr, prec, fscore, sd

In [None]:
# Per-class metrics with equal weights (1/10 for each of the ten classes);
# the last entry of every list is the weighted average.
sensitivity, FPR, prec, fscore, sd = sens_spec(range(_____), ______, ______, [1.0/10]*10)
# `prec` holds TP/(TP+FP), i.e. precision, so the column is labelled
# 'Precision' (it was previously shown as 'Acc', which reads as accuracy).
d = {'TPR':sensitivity, 'FPR':FPR, 'Precision':prec, 'F-score':fscore}
# One row per class name plus a final row for the average.
ind = clases.tolist()+['Promedio']
df = pd.DataFrame(d, index=ind)
index = df.index
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Heat map of the table with each cell annotated with its value.
sns.heatmap(df, annot=True)