### **Packages and dataset load**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from scipy.stats import pearsonr
from tqdm import tqdm
# import dataframe_image as df___i

color = {"granate":"#BA4A00",
         "amarillo":"#F5B041",
         "verde":"#148F77",
         "blue":"#0051A2",
         "red": "#DD1717"}
color_palette = [color["blue"], 'darkorchid', color['verde'], color['amarillo'],'gray', 'cornflowerblue', color['red']]
sb.set_style('white')

In [2]:
df = pd.read_csv('Data/Dry_Bean_Dataset.csv')

### **Auxiliar functions**

Cross validation

In [3]:
def dataset_creator(models_names: list, columns_names: list, k1: int):
    header = pd.MultiIndex.from_product([models_names, columns_names])
    df = pd.DataFrame(columns=header)
    df['KFold'] = np.arange(1, k1+1)
    df.set_index('KFold', inplace=True)
    return df

def twolevelcv(X: np.array, y: np.array, k1: int, k2: int, models: list, params: dict, rs: int):
    """Allows to compute two level crossvalidation.

    Args:
        X (np.array): Features (numeric)
        y (np.array): Class (objective variable)
        k1 (int): Nº of outer folds
        k2 (int): Nº of inner folds
        models (list): List of models for comparison
        params (dict): Dictionary including the set of parameters. In this case we only tune 1 parameter per model.
        rs (int): Random state
    Returns:
        df: Dataframe
    """
    test_error_dict = {}
    k = 0
    names = [type(m).__name__ for m in models]
    col_names = ['Param. Value', 'Error']
    df = dataset_creator(names, col_names, k1)
    kf1 = StratifiedKFold(k1, shuffle = True, random_state=rs)
    # first level split
    for train_idx1, test_idx1 in kf1.split(X, y):
        k += 1
        kf2 = StratifiedKFold(k2, shuffle = True, random_state=rs)
        print(f'Computing KFold {k}/{k1}...')
        # second level split
        for train_idx2, test_idx2 in tqdm(kf2.split(X[train_idx1, :], y[train_idx1]), total = k2):
            X_train = X[train_idx2, :]
            y_train = y[train_idx2]
            X_test = X[test_idx2, :]
            y_test = y[test_idx2]
            for name, model in zip(names, models):
                if name != 'DummyClassifier':
                    pname = list(params[name].keys())[0]
                    error_test = []
                    for p_ in params[name][pname]:
                        pdict = {pname: p_}
                        model = model.set_params(**pdict)
                        # train the model
                        model.fit(X_train, y_train)
                        # evaluate performance
                        pred2_test = model.predict(X_test)
                        error_test.append(np.sum(pred2_test != y_test)/ y_test.shape[0])
                    min_param = params[name][pname][np.argmin(error_test)]
                else:
                    model.fit(X_train, y_train)
                    pred2_test = model.predict(X_test)
                    error_test = np.sum(pred2_test != y_test)/ y_test.shape[0]
                    min_param = np.NaN
                df.loc(axis = 1)[name, 'Error'][k] = np.min(error_test)
                df.loc(axis = 1)[name, 'Param. Value'][k] = min_param
    return df, test_idx1

# **1 - Regression**

In [4]:
columns = df.columns.values
X = df.drop(columns='Class').values
y = df['roundness']


print('· NUMBER OF FEATURES:', X.shape[1])
print('\n· FEATURES:\n', columns[:-1])
print('\n· NUMBER OF DATA POINTS:', X.shape[0])

· NUMBER OF FEATURES: 16

· FEATURES:
 ['Area' 'Perimeter' 'MajorAxisLength' 'MinorAxisLength' 'AspectRation'
 'Eccentricity' 'ConvexArea' 'EquivDiameter' 'Extent' 'Solidity'
 'roundness' 'Compactness' 'ShapeFactor1' 'ShapeFactor2' 'ShapeFactor3'
 'ShapeFactor4']

· NUMBER OF DATA POINTS: 13611


### **Part A. *Linear regression.***

### **Part B. *Other models. Evaluation.***

### ANN

In [41]:
from matplotlib.pylab import figure, plot, xlabel, ylabel, legend, ylim, show
import sklearn.linear_model as lm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


device = "cuda" if torch.cuda.is_available() else 'cpu'
print("device: ", device)

x = df.drop(columns='Class').drop(columns='MajorAxisLength').values
x = np.array(x)
print(x.shape)
y = df['roundness']
y = np.array(y)
y = y.reshape((-1,1))

class ANN(nn.Module):
    def __init__(self, num_input, num_output, num_hidden):
        super(ANN, self).__init__()
        self.Net = nn.Sequential(
            nn.Linear(num_input, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_output),
            # nn.Sigmoid()
        )
        
    def forward(self, input):
        return self.Net(input)

model = ANN(x.shape[1], y.shape[1], 32).to(device)

batch_size = 64
x = x.astype(np.float32)
y = y.astype(np.float32)
x_train = torch.from_numpy(x).to(device)
y_train = torch.from_numpy(y).to(device)
train_set = TensorDataset(x_train, y_train)
train_set = DataLoader(train_set, batch_size=batch_size, shuffle=True)

epochs = 1000
loss_func = torch.nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=1e-4)

loss_ = []

for i in range(epochs):
    for x, y in train_set:
        output = model(x)
        loss = loss_func(output, y)
        loss_.append(loss.cpu().detach().numpy())
        optim.zero_grad()
        loss.backward()
        optim.step()
    if i % 100 == 0:
        print(loss.cpu().detach().numpy())

device:  cpu
(13611, 15)
169.77568


KeyboardInterrupt: 

In [40]:
output = model(x_train)
print(output.cpu().detach().numpy()[0], y[0])

[0.84749925] tensor([0.7396])


# **2 - Classification**

### **Dataset preparation**

In [5]:
# Copying object without editing the original
df_ = df.copy(deep=True)
# Doing this we can choose to use outliers filter or not

In [6]:
columns = df_.columns.values
X = df_.drop(columns='Class').values
y = df_['Class']
le = LabelEncoder()
y_ = le.fit_transform(y)
classes = y.unique()

print('· NUMBER OF FEATURES:', X.shape[1])
print('\n· FEATURES:', columns[:-1])
print('\n· NUMBER OF DATA POINTS:', X.shape[0])
print('\n· CLASSES:', classes)
print('\n· NUMBER OF CLASSES:', len(classes))

· NUMBER OF FEATURES: 16

· FEATURES: ['Area' 'Perimeter' 'MajorAxisLength' 'MinorAxisLength' 'AspectRation'
 'Eccentricity' 'ConvexArea' 'EquivDiameter' 'Extent' 'Solidity'
 'roundness' 'Compactness' 'ShapeFactor1' 'ShapeFactor2' 'ShapeFactor3'
 'ShapeFactor4']

· NUMBER OF DATA POINTS: 13611

· CLASSES: ['SEKER' 'BARBUNYA' 'BOMBAY' 'CALI' 'HOROZ' 'SIRA' 'DERMASON']

· NUMBER OF CLASSES: 7


#### **Transformations**

Outliers removal

In [7]:
Threshold_ = 3
outlier_index = []
df_ = pd.DataFrame(columns=df.columns)
index = 0
for K in classes:
    outlier_index = []
    a = df.loc[df["Class"] == K]
    value = a.drop(columns='Class').values
    for j in range(16):
        std = np.std(value[:, j])
        mean = np.mean(value[:, j])
        for i in range(value[:, j].shape[0]):
            if (value[i, j] - mean) / std > Threshold_:
                outlier_index.append(i + index)
    index = i + index + 1
    outlier_index = np.unique(outlier_index)
    a = a.drop(outlier_index)
    df_ = pd.concat([df_,a])
df_.reset_index(drop=True, inplace=True)
print(f'Filtered outliers: {df.shape[0] - df_.shape[0]}')

Filtered outliers: 298


Standarization

In [8]:
# Standarization of the dataset
sc = StandardScaler()
X_stdz = sc.fit_transform(X)
df_stdz = pd.DataFrame(columns = columns[:-1], data = X_stdz)
df_stdz['Class'] = y_

### **2.2 Logistic regression *vs.* Neural Network *vs.* Baseline**

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier 
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
random_state = 1

#### Logistic regression

In [10]:
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [11]:
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate the model and collect the scores
# n_scores = cross_val_score(model, X_stdz, y_, scoring = 'accuracy', cv=cv, n_jobs=-1)
# # report the model performance
# print('Mean Accuracy: %.3f (+-%.3f)' % (np.mean(n_scores), np.std(n_scores)))

---

### **2.3 Cross-Validation table**

In [22]:
params = {}
lam = np.logspace(-6, 2, 100)
C = 1/ lam
# C = [200000000, 10000000, 0.1519911082952933, 0.2848035868435805 ]
params['LogisticRegression'] = {'C': C}
params['DummyClassifier'] = [None]
params['MLPClassifier'] = {'hidden_layer_sizes': [(8, ), (16, ), (20, )]}
models = [LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000000, random_state= random_state, tol = 0.003, n_jobs = -1),
        DummyClassifier(strategy='most_frequent', random_state=random_state),
        MLPClassifier(solver='adam', activation='logistic', alpha=1e-4, random_state=random_state, max_iter=1000, 
        early_stopping=True, validation_fraction=0.2, warm_start=True, verbose=False, learning_rate ='adaptive', learning_rate_init=0.01)]
k1 = 10
k2 = 10
Table, test_set_outer = twolevelcv(X = X_stdz, y = y_, k1 = k1, k2 = k2, models = models, params = params, rs = random_state)
Table.to_csv('Results/Test2_saga.csv')

Computing KFold 1/10...


 70%|███████   | 7/10 [04:28<01:59, 39.67s/it]

In [19]:
Table

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,DummyClassifier,DummyClassifier,MLPClassifier,MLPClassifier
Unnamed: 0_level_1,Param. Value,Error,Param. Value,Error,Param. Value,Error
KFold,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,200000000.0,0.040033,,0.788399,"(16,)",0.039216
2,200000000.0,0.034286,,0.78449,"(8,)",0.035102
3,200000000.0,0.034286,,0.78449,"(8,)",0.034286
4,200000000.0,0.034286,,0.78449,"(8,)",0.035918
5,200000000.0,0.034286,,0.78449,"(8,)",0.035918
6,0.151991,0.035918,,0.790204,"(8,)",0.040816
7,0.151991,0.035918,,0.790204,"(8,)",0.04
8,0.151991,0.033469,,0.790204,"(8,)",0.041633
9,0.151991,0.033469,,0.790204,"(8,)",0.042449
10,0.151991,0.042449,,0.788571,"(8,)",0.044082


In [29]:
Table = Table.round(3)
Table.to_csv(r'Results\Table_classification.csv')

In [35]:
test_set_outer.shape

(1361,)

---

### **2.4 Stadistical Evaluation**

In [16]:
from itertools import combinations

def McNemar(models: list, X: np.array, y: np.array, k1: int, rs: int):
    kf1 = StratifiedKFold(k1, shuffle = True, random_state=rs)
    k = 0
    # setting up all the possible combinations between the different models
    matrix = dict.fromkeys(combinations(range(len(models)), 2))
    for train_idx, test_idx in kf1.split(X, y):
        test_size = test_idx.shape[0]
        yABC = np.empty(shape=(len(models), test_size))
        for i, model in enumerate(models):
            model.fit(X[train_idx,:], y[train_idx])
            y_pred = model.predict(X[test_idx, :])
            yABC[i, :] = 1*(y_pred == y[test_idx])
        for j in list(matrix.keys()):
            if k == 0:
                matrix[j] = np.empty(shape=(k1, 4))
            n11 = np.sum(yABC[j[0],:]*yABC[j[1],:])
            n12 = np.sum(yABC[j[0],:]*(1-yABC[j[1],:]))
            n21 = np.sum(yABC[j[1],:]*(1-yABC[j[0],:]))
            n22 = np.sum((1-yABC[0,:])*(1-yABC[1,:]))
            matrix[j][k] = np.array([n11, n12, n21, n22])
        k+=1
    return matrix

In [36]:
random_state = 1
models = [LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state= random_state, C = 0.07),
        DummyClassifier(strategy='most_frequent', random_state=random_state),
        MLPClassifier(solver='adam', activation='logistic', alpha=1e-4, random_state=random_state, max_iter=1000, hidden_layer_sizes=(8, ),
        early_stopping=True, validation_fraction=0.2, warm_start=True, verbose=False, learning_rate ='adaptive', learning_rate_init=0.01)]
k1 = 10
m = McNemar(models, X_stdz[test_set_outer, :], y_[test_set_outer], k1, rs = random_state)

We create the matrix with shape (3: number of models, 10: nº of k-folds, 4: matrix shape)

The matrix is squeezed so we have:
$$\begin{bmatrix}
 n_{11}    & n_{12}  \\
 n_{21}    & n_{22}
\end{bmatrix}$$

Here is $[ n_{11}, n_{12}, n_{21}, n_{22} ]$

$H_0:$ Model A has the same performarnce as model B 

$H_1:$ Model A and model B has different performance

small p_value-> we discard H0 -> Model A and Model B have different performance

Also we set:

**Model 0**: LR

**Model 1**: Baseline

**Model 2**: MLP


### ***P-Values***

In [37]:
combs = list(combinations(range(len(models)), 2))

In [None]:
from scipy.stats import binom
pv_dict = dict.fromkeys(combs)
for j in combs:
    vals = m[j][:, 1:3]
    pv_dict[j] = [binom.cdf(min(vals[i]), n = sum(vals[i]), p = 1/2) for i in range(len(vals))]

In [None]:
McNemar_pv = pd.DataFrame(columns = combs, index = range(10))

In [None]:
i = 0
for n in pv_dict.values():
    print(f'{combs[i]}')
    for j, k in enumerate(n):
        McNemar_pv[combs[i]][j] = "{:.3e}".format(k)
        print("{:.3e}".format(k))
    i+=1

In [86]:
McNemar_pv.to_csv('McNemar_pv.csv')

### Confidence intervals

In [91]:
from scipy.stats import beta

In [94]:
def calcs(mat: np.array):
    """Calculate f y g from a McNemar Matrix
    Args:
        matrix (np.array): McNemar matrix from one K-fold
    Returns:
        _type_: f and g
    """
    n = mat.sum()
    n12 = mat[1]
    n21 = mat[2]
    E_th = (n12 - n21)/n 
    Q = (n**2 * (n+1) * (E_th +1) * (1-E_th)) / (n * (n12 + n21) - (n12 - n21)**2)
    f = (Q-1)*(E_th+1)/2
    g = (Q-1)*(1-E_th)/2
    return f, g

In [104]:
def interval(mat: np.array, alpha: float):
    """McNemar confidence interval

    Args:
        alpha (float): The desired confidence (should be 0.05)  
        f (_type_): output of calcs
        g (_type_): output of calcs 

    Returns:
        _type_: left and right bounds from the interval
    """
    f,g = calcs(mat)
    theta_L = 2*beta.ppf(alpha, f, g) - 1
    theta_R = 2*beta.ppf(1 - alpha/2, f, g) - 1
    return theta_L, theta_R

In [107]:
for i in m:
    print(i)
    for mat in m[i]:
        theta_L, theta_R = interval(mat, 0.05)
        print(f'[{np.round(theta_L, 2)}, {np.round(theta_R, 2)}]')

(0, 1)
[0.58, 0.74]
[0.61, 0.75]
[0.58, 0.74]
[0.6, 0.74]
[0.64, 0.79]
[0.54, 0.69]
[0.57, 0.73]
[0.61, 0.76]
[0.57, 0.73]
[0.58, 0.75]
(0, 2)
[-0.01, 0.06]
[-0.0, 0.07]
[-0.03, 0.02]
[-0.02, 0.03]
[0.01, 0.06]
[-0.01, 0.06]
[-0.02, 0.02]
[-0.02, 0.02]
[-0.06, 0.01]
[-0.01, 0.03]
(1, 2)
[-0.71, -0.55]
[-0.73, -0.57]
[-0.73, -0.57]
[-0.73, -0.58]
[-0.77, -0.62]
[-0.67, -0.51]
[-0.72, -0.55]
[-0.75, -0.59]
[-0.73, -0.57]
[-0.73, -0.56]


### **2.5 Train logistic regression model**

So in the fourth exercise do we have to repeat the parameter selection process or can just go ahead with the best parameter selection for each model?