In [None]:
!pip install stats
!pip install preprocessing
!pip install torchviz

Collecting stats
  Downloading stats-0.1.2a.tar.gz (127 kB)
[?25l[K     |██▋                             | 10 kB 22.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 29.1 MB/s eta 0:00:01[K     |███████▊                        | 30 kB 19.0 MB/s eta 0:00:01[K     |██████████▎                     | 40 kB 12.2 MB/s eta 0:00:01[K     |████████████▉                   | 51 kB 5.6 MB/s eta 0:00:01[K     |███████████████▍                | 61 kB 5.9 MB/s eta 0:00:01[K     |██████████████████              | 71 kB 4.9 MB/s eta 0:00:01[K     |████████████████████▌           | 81 kB 5.5 MB/s eta 0:00:01[K     |███████████████████████         | 92 kB 5.9 MB/s eta 0:00:01[K     |█████████████████████████▊      | 102 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████▎   | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 127 kB 5.2 MB/s 
[?25hBuilding wheels for 

# Intro
* Bajamos dataset
* Analizamos de manera basica el dataset (Cantidad de filas, columnas, tipo del label (regresion, clasificacion binaria, clasificacion multiclase), etc)
* Limpiamos el dataset (remover o hacer algunas operaciones sobre los NaNs)
* Ingenieria de features (correr tests de correlacion, encontrar buenas features para predecir salidas, etc.)
* Normalizar los datos
* Crear un modelo base-line que me permita obtener metricas iniciales de un modelo basico (en general un modelo lineal)
* Crear modelos mas complejos como no lineales con deep-learning

## Load dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV path read below lives under this mount point.
drive.mount('/content/drive')

In [None]:
import numpy as np
import torch
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd

In [None]:
# Location of the wine CSV inside the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Deep Learning/Clase_6/class_7_wine_dataset_v2.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Summary of the freshly loaded frame.
dataset.info()

In [None]:
# Preview the first rows of the raw data.
dataset.head()

Objetivo: crear un modelo para predecir la calidad del vino basándonos en los parámetros medidos


## Data analysis of the output

In [None]:
# Histogram of the target column, followed by its observed range.
quality = dataset["quality"].to_numpy()
fig, ax1 = plt.subplots(nrows=1, ncols=1)
ax1.hist(quality)
lowest, highest = quality.min(), quality.max()
print(f"Min value: {lowest}")
print(f"Max value: {highest}")

* Regresión lineal: no se pone nada al final
* Clasificación binaria: se pone una sigmoid al final
* Clasificación multiclase: se pone un softmax al final

In [None]:
# Start with a binary-classification target.
# New output column: 0 when quality is less than or equal to 6, 1 otherwise
# (a missing quality fails the `<= 6` comparison and therefore maps to 1).
is_at_most_six = dataset['quality'].le(6)
dataset['quality_label'] = is_at_most_six.map({True: 0, False: 1})
print(int((dataset['quality_label'] == 0).sum()))
print(int((dataset['quality_label'] == 1).sum()))

In [None]:
# Re-inspect the frame now that the `quality_label` column has been added.
dataset.info()

## Clean dataset

In [None]:
# Rows containing at least one missing value.
# The previous `== 1` test only matched rows with exactly one NaN and silently
# ignored rows with two or more, under-reporting what the cleaning step removes.
nans = dataset[dataset.isna().any(axis=1)]
print(nans.shape)

In [None]:
# Keep only the rows without any missing value. `dropna()` returns a new frame, so
# `dataset` stays intact and `nans` is no longer overwritten with the clean rows
# (the old chained assignment `ds = nans = ...` clobbered it).
ds = dataset.dropna()
print(ds.shape)

## Feature engineering

In [None]:
# Summary of the cleaned frame before feature engineering.
ds.info()

In [None]:
# Every categorical column has to be turned into something the model can train on.

# Handling of the `type` column: its distinct values become one-hot columns.
ds["type"].unique()

y = pd.get_dummies(ds["type"], prefix="type")
y.info()

In [None]:
# Append the dummy columns to the main dataset, side by side.
ds = pd.concat((ds, y), axis="columns")
ds.info()

In [None]:
import seaborn as sns

# Correlation matrix over every column except the identifier, the raw categorical
# column and the derived binary label.
excluded_columns = ['vendor_id', 'type', 'quality_label']
corr = ds.loc[:, ~ds.columns.isin(excluded_columns)].corr()
fig, ax1 = plt.subplots(1, figsize=(18,10))
sns.set(style="ticks", color_codes=True, font_scale=1.5)
palette = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=palette, annot=True, fmt = ".2f")
sns.set(font_scale=1.5)

In [None]:
# Alcohol distribution for each value of the binary label.
# Uses the cleaned frame `ds` (not the raw `dataset`): every other feature-engineering
# cell works on `ds`, and NaN values passed to boxplot distort or blank out the boxes.
fig, ax1 = plt.subplots(1, figsize=(18,10))
alcohol_by_label = [ds.loc[ds['quality_label'] == value, 'alcohol'] for value in (0, 1)]
_ = ax1.boxplot(alcohol_by_label)
_ = ax1.set_xticklabels(['quality_label = 0', 'quality_label = 1'])
_ = ax1.set_ylabel('alcohol')

In [None]:
# Keep only the columns used to fit the model; `vendor_id` is set aside on its own.
vendor_id = ds['vendor_id']
columns_to_drop = ['Unnamed: 0', 'quality_label', 'quality', 'vendor_id', 'type']
final_data = ds.drop(columns=columns_to_drop)
final_data.info()

In [None]:
# Feature matrix as a float64 array. Forcing the dtype avoids an `object` array when
# the dummy columns are boolean (mixed bool/float columns have no common numeric dtype),
# and `np.asarray` also accepts an ndarray, so re-running this cell no longer fails.
final_data = np.asarray(final_data, dtype=np.float64)
# Binary target aligned row-by-row with `final_data` (both come from `ds`).
label = ds['quality_label'].to_numpy()

## Normalizacion de datos


In [None]:
# Column-wise min-max scaling to [0, 1].
# NOTE(review): the min/max are computed over ALL rows, before the train/test split,
# so test-set statistics leak into the training features. Consider fitting the
# scaling on the training rows only.
features = np.asarray(final_data, dtype=np.float64)
feature_min = np.min(features, axis=0)
feature_range = np.max(features, axis=0) - feature_min
# A constant column has range 0; dividing by 1 instead maps it to 0 rather than NaN.
safe_range = np.where(feature_range == 0, 1.0, feature_range)
normalized_data = (features - feature_min) / safe_range
print(np.max(normalized_data, axis=0))
print(np.min(normalized_data, axis=0))

## Dataset split

In [None]:
# Reproducible 80/20 split of the row indices.
# A dedicated seeded generator makes the split (and every metric computed from it)
# identical across "Restart & Run All" executions.
SPLIT_SEED = 0
TRAIN_FRACTION = 0.8

n = final_data.shape[0]
idx = np.random.default_rng(SPLIT_SEED).permutation(n)
n_train = int(TRAIN_FRACTION * n)
train_idx = idx[:n_train]
test_idx = idx[n_train:]
assert len(train_idx) + len(test_idx) == len(idx)

In [None]:
# `np.asarray` accepts both the pandas Series and an ndarray, so re-running this cell
# does not fail the way `vendor_id.to_numpy()` did on the second execution.
vendor_id = np.asarray(vendor_id)
unique, indices, inversa = np.unique(vendor_id, return_index=True, return_inverse=True)
# Map each distinct vendor id to its position in `unique` (index used by the embedding layer).
# The previous `zip(unique, inversa)` paired the k-th unique id with the inverse index of
# the k-th ROW, which produced an arbitrary (and possibly colliding) mapping.
vendor_id_to_index = {key: position for position, key in enumerate(unique)}
vendor_index = np.array([vendor_id_to_index[vid] for vid in vendor_id])
# By construction the lookup must reproduce numpy's own inverse indices.
assert np.array_equal(vendor_index, np.ravel(inversa))

In [None]:
# Slice the vendor indices, the normalized features and the labels with the same
# row indices so the three arrays stay aligned within each split.
vendor_index_train, X_train, y_train = (
    source[train_idx] for source in (vendor_index, normalized_data, label)
)

vendor_index_test, X_test, y_test = (
    source[test_idx] for source in (vendor_index, normalized_data, label)
)

In [None]:
# Shapes of the train arrays, then the test arrays, one per line.
split_arrays = (
    vendor_index_train, X_train, y_train,
    vendor_index_test, X_test, y_test,
)
print(*(array.shape for array in split_arrays), sep="\n")

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Linear baseline. `sag` is a stochastic solver, so a fixed `random_state` is needed
# for repeatable coefficients (the random forest below is already seeded with 0).
lr_model = LogisticRegression(C=1, solver='sag', max_iter=1000, random_state=0)
lr_model.fit(X_train, y_train)

In [None]:
# ROC AUC of the baseline on the held-out rows, using column 1 of predict_proba
# (the score for label 1).
lr_test_scores = lr_model.predict_proba(X_test)
lr_positive_scores = lr_test_scores[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, lr_positive_scores)
lr_auc = metrics.auc(fpr, tpr)
print(lr_auc)

## Random Forest

In [None]:
# Seeded forest of 1000 trees; tree depth is left at the library default (unbounded).
rf_model = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_model.fit(X_train, y_train)

In [None]:
# ROC AUC of the random forest on the held-out rows, using column 1 of predict_proba
# (the score for label 1).
rf_test_score = rf_model.predict_proba(X_test)
rf_positive_scores = rf_test_score[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_positive_scores)
rf_auc = metrics.auc(fpr, tpr)
print(rf_auc)

## Deep learning

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
class CustomDataset(Dataset):
  """Map-style dataset pairing row `i` of `X` with entry `i` of `Y`."""

  def __init__(self, X, Y):
    super().__init__()
    self.X, self.Y = X, Y

  def __len__(self):
    # One sample per row of the feature matrix.
    n_rows = self.X.shape[0]
    return n_rows

  def __getitem__(self, idx):
    # Returns (feature row, target) for the requested position.
    features = self.X[idx,:]
    target = self.Y[idx]
    return features, target

In [None]:
# Wrap the train and test arrays so a DataLoader can batch them.
training = CustomDataset(X=X_train, Y=y_train)
testing = CustomDataset(X=X_test, Y=y_test)

In [None]:
# Number of samples in each split, one per line.
print(len(training), len(testing), sep="\n")

In [None]:
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
training_dataloader = DataLoader(training, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the collected
# scores aligned with `y_test` and makes evaluation runs repeatable.
test_dataloader = DataLoader(testing, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class NNet(torch.nn.Module):
  """Small fully connected binary classifier: in_features -> 20 -> 5 -> 1.

  The output is a raw logit (no sigmoid is applied), which is what
  `torch.nn.BCEWithLogitsLoss` expects.

  Args:
    in_features: number of input columns. Defaults to 13, the width that was
      previously hard-coded, so `NNet()` builds the same network as before.
  """

  def __init__(self, in_features=13):
    super().__init__()
    self.linear_1 = torch.nn.Linear(in_features=in_features, out_features=20, bias=True)
    self.relu_1 = torch.nn.ReLU()
    self.linear_2 = torch.nn.Linear(in_features=20, out_features=5, bias=True)
    self.relu_2 = torch.nn.ReLU()
    self.linear_3 = torch.nn.Linear(in_features=5, out_features=1, bias=True)

  def forward(self, x):
    """Return one logit per row of `x`; `x` must have `in_features` columns."""
    a1 = self.relu_1(self.linear_1(x))
    a2 = self.relu_2(self.linear_2(a1))
    return self.linear_3(a2)

In [None]:
# Fresh, untrained instance of the network defined above.
nnet = NNet()

In [None]:
# Show the layers registered on the model.
print(nnet)

In [None]:
# Binary cross-entropy on raw logits, summed (not averaged) over each batch.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(params=nnet.parameters(), lr=0.001)
# Displayed as the cell output: whether a CUDA device can be used.
torch.cuda.is_available()

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

nnet.to(device)

for epoch in range(100):

  # ---- training pass ----
  running_loss = 0
  nnet.train()
  for x, y in training_dataloader:
    x = x.to(device).float()
    # BCEWithLogitsLoss needs targets shaped like the (batch, 1) logits.
    y = y.to(device).float().reshape(-1,1)

    optimizer.zero_grad()
    y_hat = nnet(x)
    loss = criterion(y_hat, y)
    loss.backward()
    optimizer.step()

    # The criterion sums over the batch, so this accumulates a per-epoch total.
    running_loss += loss.item()

  # ---- evaluation pass ----
  nnet.eval()
  nnet_test_score = []
  truth = []
  # No gradients are needed for evaluation; this also lets the outputs be
  # converted to numpy directly.
  with torch.no_grad():
    for x, y in test_dataloader:
      x = x.to(device).float()
      y = y.to(device).float().reshape(-1,1)

      # Logits -> probabilities.
      y_hat = torch.sigmoid(nnet(x))

      # Move to the CPU before converting: .numpy() fails on CUDA tensors.
      truth += list(y.cpu().numpy().ravel())
      nnet_test_score += list(y_hat.cpu().numpy().ravel())

  # The AUC is computed from the network's own scores. Previously it was computed
  # from `rf_test_score`, so every epoch printed the random forest's AUC.
  fpr, tpr, thresholds = metrics.roc_curve(truth, nnet_test_score)
  auc = metrics.auc(fpr, tpr)
  print(f"Epoch = {epoch} | loss = {running_loss / len(training)} | auc = {auc}")

## Deep learning con Embeddings

In [None]:
class CustomDatasetWithEmbedding(Dataset):
  """Map-style dataset yielding (feature row, vendor index, target) triples."""

  def __init__(self, X, vendor_idx, Y):
    super().__init__()
    self.X, self.vendor_idx, self.Y = X, vendor_idx, Y

  def __len__(self):
    # One sample per row of the feature matrix.
    n_rows = self.X.shape[0]
    return n_rows

  def __getitem__(self, idx):
    # All three containers are indexed with the same position.
    features = self.X[idx,:]
    vendor = self.vendor_idx[idx]
    target = self.Y[idx]
    return features, vendor, target

In [None]:
# Rebuild both splits, this time carrying the vendor index next to the features.
training = CustomDatasetWithEmbedding(X=X_train, vendor_idx=vendor_index_train, Y=y_train)
testing = CustomDatasetWithEmbedding(X=X_test, vendor_idx=vendor_index_test, Y=y_test)

In [None]:
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
training_dataloader = DataLoader(training, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order makes evaluation repeatable.
test_dataloader = DataLoader(testing, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class NNet(torch.nn.Module):
    """Binary classifier over numeric features plus a learned vendor embedding.

    The vendor index is looked up in an embedding table, concatenated to the
    numeric features, and passed through 200 -> 100 -> 1 linear layers with ReLU
    in between. The output is a raw logit (no sigmoid), as expected by
    `torch.nn.BCEWithLogitsLoss`.

    NOTE: this class reuses the name `NNet` and therefore shadows the earlier
    embedding-free network once this cell has run.

    Args:
        number_of_vendors: number of rows in the embedding table.
        embedding_dim: size of each vendor embedding vector.
        in_features: number of numeric input columns. Defaults to 13, the width
            that was previously hard-coded.
    """

    def __init__(self, number_of_vendors, embedding_dim, in_features=13):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings=number_of_vendors, embedding_dim=embedding_dim)
        self.linear_1 = torch.nn.Linear(in_features=(in_features + embedding_dim), out_features=200, bias=True)
        self.relu_1 = torch.nn.ReLU()
        self.linear_2 = torch.nn.Linear(in_features=200, out_features=100, bias=True)
        self.relu_2 = torch.nn.ReLU()
        self.linear_3 = torch.nn.Linear(in_features=100, out_features=1, bias=True)

    def forward(self, x, vendor_idx):
        """Return one logit per row.

        `x` holds the numeric features and `vendor_idx` the integer vendor index
        of each row; both must be on the same device as the model.
        """
        vendor_emb = self.embedding(vendor_idx)
        # Concatenate along the feature axis so each row carries its vendor vector.
        final_input = torch.cat([x, vendor_emb], dim=1)
        a1 = self.relu_1(self.linear_1(final_input))
        a2 = self.relu_2(self.linear_2(a1))
        return self.linear_3(a2)

In [None]:
# One embedding row per distinct vendor id (`unique` comes from np.unique above), 16 values each.
nnet = NNet(number_of_vendors=len(unique), embedding_dim=16)

In [None]:
# Show the layers registered on the embedding model.
print(nnet)

In [None]:
# Loss function: binary cross-entropy on raw logits, summed over each batch.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
# Optimizer: Adam over all parameters of the embedding model.
optimizer = torch.optim.Adam(params=nnet.parameters(), lr=0.01)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

nnet.to(device)

for epoch in range(20):

    # ---- training pass ----
    running_loss = 0
    nnet.train()
    for x, vendor_idx, y in training_dataloader:
        x = x.to(device).float()
        # The embedding lookup needs integer indices on the same device as the
        # model; leaving them on the CPU crashes as soon as `device` is CUDA.
        vendor_idx = vendor_idx.to(device).long()
        # BCEWithLogitsLoss needs targets shaped like the (batch, 1) logits.
        y = y.to(device).float().reshape(-1,1)

        optimizer.zero_grad()
        y_hat = nnet(x, vendor_idx)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()

        # The criterion sums over the batch, so this accumulates a per-epoch total.
        running_loss += loss.item()

    # ---- evaluation pass ----
    nnet.eval()
    with torch.no_grad():
        nnet_test_scores = []
        truth = []
        for x, vendor_idx, y in test_dataloader:
            x = x.to(device).float()
            vendor_idx = vendor_idx.to(device).long()
            y = y.to(device).float().reshape(-1,1)

            # Logits -> probabilities.
            y_hat = torch.sigmoid(nnet(x, vendor_idx))

            # Move to the CPU before converting: .numpy() fails on CUDA tensors.
            truth += list(y.cpu().numpy().ravel())
            nnet_test_scores += list(y_hat.cpu().numpy().ravel())

        fpr, tpr, thresholds = metrics.roc_curve(truth, nnet_test_scores)
        auc = metrics.auc(fpr, tpr)
        print(f"Epoch = {epoch} | loss = {running_loss / len(training)} | auc = {auc}")