# **🗃️ Data Lab**

Useful datasets:
- [Fruits dataset](https://drive.google.com/file/d/1Jn15Qra1NldKC6ELVTTFWrqZ5OJdK5pL/view?usp=sharing)
- [Car maintenance](https://drive.google.com/file/d/1t4mKvvw6VR6Bx4OwZ8SeMRd18kd7hvz9/view?usp=sharing)
- [Energy consumption](https://drive.google.com/file/d/1iMkGEG4TBUw0FreaT4Gf2gnm3MEjFG1x/view?usp=sharing)
- [Students Performance](https://drive.google.com/file/d/1YwqZvaf0B7gW0cutfgD7berjkBSRVzPk/view?usp=sharing)
- [Heart Disease](https://drive.google.com/file/d/1lQ-3-dmVpJBq0eXcQp3nRgKNnn-TAP_n/view?usp=sharing)
- [Medical insurance](https://drive.google.com/file/d/1n_An4atBisD6FlO8k467Iz2sZjsujVF5/view?usp=sharing)
- [Phone price dataset](https://drive.google.com/file/d/1zh7byFuQo8Wg-Wzpu9TGo2VzhqCR6y7K/view?usp=sharing)

## Generate samples 🎯

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression

# @markdown \

# ============
# Parameters
# ============

n_samples = 300 # @param {type:"integer"}
type_dataset = "linear_reg" # @param ["noisy_circles", "noisy_moons", "blobs", "no_structure", "anisotropic", "varied_var", "linear_reg", "nonlinear_reg"]
noise = 0.19 # @param {type:"slider", min:0, max:0.5, step:0.01}
angle_aniso = 110 # @param {type:"slider", min:0, max:180, step:10}
deg_reg = 3 # @param {type:"slider", min:1, max:10, step:1}
random_state = 1 # @param {type:"integer"}
return_classes = False # @param {type:"boolean"}



# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
#
# Outputs of this cell:
#   X : 2-column array, standardized at the end of the cell.
#       For the regression datasets the second column is the target.
#   y : labels/targets returned by the scikit-learn generator, or None
#       for "no_structure" (which has no labels).
# ============

# Seeded generator, so the uniform jitter / unstructured samples drawn below
# are reproducible for a given `random_state`, like the scikit-learn
# generators (the global `np.random` state was not seeded).
rng = np.random.RandomState(random_state)

# Reset the labels so a value left over from a previous run of the cell
# can never be picked up by the "no_structure" branch.
y = None

if type_dataset == "noisy_circles":
  X, y = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=noise, random_state=random_state)

elif type_dataset == "noisy_moons":
  X, y = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

elif type_dataset == "blobs":
  X, y = datasets.make_blobs(n_samples=n_samples, cluster_std=noise*5, random_state=random_state)
  # Uniform jitter scaled by `noise` and by the smallest coordinate of X
  X += rng.rand(n_samples, 2)*noise*X.min()

elif type_dataset == "no_structure":
  # Uniform samples in the unit square; there are no labels (y stays None)
  X = rng.rand(n_samples, 2)

elif type_dataset == "anisotropic":
  X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # Shear the blobs with a matrix built from tan(angle_aniso)
  t = np.tan(np.radians(angle_aniso))
  transformation = np.array(((1, t), (0, 1))).T
  X = np.dot(X, transformation)
  X += rng.rand(n_samples, 2)*noise*X.min()

elif type_dataset == "varied_var":
  X, y = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)
  X += rng.rand(n_samples, 2)*noise*X.min()

elif type_dataset == "linear_reg":
  X, y = datasets.make_regression(n_samples = n_samples , n_features = 1, noise = noise * 100, random_state = random_state)
  # Append the target as second column so X can be shown as a 2-D scatter
  X = np.c_[X, y]

elif type_dataset == "nonlinear_reg":
  X, y = datasets.make_regression(n_samples = n_samples , n_features = 1, noise = noise * 100, random_state = random_state)
  # Raise the (rescaled) linear target to the power `deg_reg`
  y = (y * 0.1) ** deg_reg
  X = np.c_[X, y]

else:
  # Fail loudly instead of silently reusing an X from a previous run
  raise ValueError("Unknown type_dataset: {0!r}".format(type_dataset))


X = StandardScaler().fit_transform(X)

_, ax = plt.subplots(figsize=(5,4))
if return_classes and y is not None:
  ax.scatter(X[:, 0], X[:, 1], c = y, edgecolors='k', cmap='Paired')
else:
  # Either colouring was not requested or the dataset has no labels
  ax.scatter(X[:, 0], X[:, 1], edgecolors='k')

print ("\nData shape: {0} \n".format(X.shape))

## Load a dataset 📑

In [None]:
# Form parameters of the "Load a dataset" cell. The `# @markdown` / `# @param`
# comments are form annotations (presumably rendered by the notebook UI) and
# the trailing `# @param` text must stay on the same line as its variable.

# @markdown ---

# @markdown \
# @markdown ### 🔼 Upload your file (first)
# @markdown \

# @markdown ---
# @markdown ### Enter path to **.csv** file:
# Path and column separator handed to `pd.read_csv` further down in this cell.
file_path = "" # @param {type:"string"}
delimiter = "," # @param {type:"string"}

# Column names: `var_h` / `var_v` are the horizontal / vertical variables of
# the scatter plot and the two columns of X when `Load_all_data` is False;
# `labels` is the column copied into `y` and used as hue ("" means no labels).
var_h = "" # @param {type:"string"}
var_v = "" # @param {type:"string"}
labels = "" # @param {type:"string"}
# Scaling applied to X after loading ("None" leaves the values untouched).
normalization = "None" # @param ["MinMax [0,1]", "MinMax [-1,1]", "Z-Score", "None"]
# True: X is built from every column of the file instead of var_h / var_v.
Load_all_data = False # @param {type:"boolean"}
# True: rows containing missing values are dropped right after reading.
Remove_missing = True # @param {type:"boolean"}

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# Read the CSV and optionally discard the rows that contain missing values.
data = pd.read_csv(file_path, delimiter=delimiter)
if Remove_missing:
  data = data.dropna()

# Feature matrix: every column of the file, or only the two plotted ones.
if Load_all_data:
  X = np.array(data)
else:
  X = np.column_stack((np.array(data[var_h]), np.array(data[var_v])))

# Labels are optional; an empty column name means "no labels".
if labels == "":
  y = None
else:
  y = np.array(data[labels])

# Optional rescaling of X ("None" or any other value leaves X unchanged).
if normalization == "Z-Score":
  X = StandardScaler().fit_transform(X)
elif normalization == "MinMax [-1,1]":
  X = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)
elif normalization == "MinMax [0,1]":
  X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)

# Copy the (possibly rescaled) values back into the DataFrame so that the
# seaborn plot below shows the same numbers as X.
if Load_all_data:
  for i in range(data.shape[1]):
    data.iloc[:, i] = X[:, i]
else:
  data[var_h] = X[:, 0]
  data[var_v] = X[:, 1]

# Scatter plot of the two selected variables, coloured by label if given.
_, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(
  data=data,
  x=var_h,
  y=var_v,
  hue=None if labels == "" else labels,
  palette='colorblind',
  ax=ax,
)
print("\nData Loaded! ✅")
print(" - Shape: {0}\n".format(X.shape))

# **📋 Model Lab**

## Classification

In [None]:
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Extra scikit-learn helpers needed only by the Multinomial Naive Bayes option
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# @markdown \

# Classification model selection.
# Reads  : X (feature matrix produced by the Data Lab cells).
# Writes : X (standardized), is_regression (False), algo (unfitted estimator).

is_regression = False
model = "SVM" # @param ["LinearRegression", "LogisticRegression", "SVM", "kNN", "Bayes", "Decision Tree"]

# @markdown ---
# @markdown \
# @markdown ### ✏️ Additional Params
# @markdown \

# @markdown \
# @markdown #### ⚙️ SVM
# @markdown \

C = 1 # @param {type:"number"}
gamma = 5 # @param {type:"number"}
degree = 3 # @param {type:"integer"}
kernel = "rbf" # @param ["linear", "poly", "rbf", "sigmoid"]

# @markdown \
# @markdown #### ⚙️ kNN
# @markdown \

num_neighbors = 3 # @param {type:"integer"}

# @markdown \
# @markdown #### ⚙️ Naive Bayes
# @markdown \

dist = "Gaussian" # @param ["Gaussian", "Bernoulli", "Multinomial"]

####### ----------------------------

X = StandardScaler().fit_transform(X)

# Single if/elif chain: exactly one branch builds `algo`, anything else fails
# loudly instead of silently keeping an estimator from a previous run.
if model == "LinearRegression":
  # NOTE(review): this regressor predicts continuous values, while the training
  # cell indexes a colour table with the predictions - confirm this option is
  # meant to be offered for classification.
  algo = linear_model.LinearRegression()

elif model == "LogisticRegression":
  algo = linear_model.LogisticRegression()

elif model == "SVM":
  algo = svm.SVC(kernel=kernel, degree=degree, gamma=gamma, C=C)

elif model == "kNN":
  algo = KNeighborsClassifier(n_neighbors=num_neighbors)

elif model == "Bayes":
  if dist == "Gaussian":
    algo = naive_bayes.GaussianNB()
  elif dist == "Bernoulli":
    algo = naive_bayes.BernoulliNB()
  elif dist == "Multinomial":
    # MultinomialNB rejects negative feature values, and the standardized X
    # above always contains some; rescale to [0, 1] inside the estimator so
    # that fit/predict keep working on the same X as the other models.
    algo = make_pipeline(MinMaxScaler(), naive_bayes.MultinomialNB())
  else:
    raise ValueError("Unknown distribution: {0!r}".format(dist))

elif model == "Decision Tree":
  algo = DecisionTreeClassifier()

else:
  raise ValueError("Unknown model: {0!r}".format(model))



print ("\nModel is ready!⚙️🔧\n")
# print ("----- ")
print (" - Model: {0}".format(model))

if model == "SVM":
  print (" - kernel: {0}".format(kernel))
  if kernel == 'poly':
    # `degree` is only reported for the polynomial kernel
    print (" - degree: {0}".format(degree))
  print (" - gamma: {0}".format(gamma))
  print (" - C: {0}".format(C))

if model == "kNN":
  print (" - Num neighbors: {0}".format(num_neighbors))

if model == "Bayes":
  print (" - Distribution: {0}".format(dist))

## Regression

In [None]:
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn import naive_bayes
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler

# @markdown \

# Regression model selection.
# Reads  : X (feature matrix produced by the Data Lab cells).
# Writes : X (standardized), is_regression (True), algo (unfitted estimator).

is_regression = True
model = "LinearRegression" # @param ["LinearRegression", "SVR", "kNR", "Bayesian Ridge", "Kernel Ridge"]

# @markdown ---
# @markdown \
# @markdown ### ✏️ Additional Params
# @markdown \

# @markdown \
# @markdown #### ⚙️ SVR
# @markdown \

C = 1 # @param {type:"number"}
gamma = 1 # @param {type:"number"}
degree = 3 # @param {type:"integer"}
kernel = "rbf" # @param ["linear", "poly", "rbf", "sigmoid"]

# @markdown \
# @markdown #### ⚙️ kNR
# @markdown \

num_neighbors = 3 # @param {type:"integer"}
weights = "uniform" # @param ["uniform", "distance"]

# @markdown \
# @markdown #### ⚙️ Kernel Ridge
# @markdown \

# Kernel Ridge gets its own variable names: reusing C / gamma / degree /
# kernel here would overwrite the SVR values chosen just above.
# `kr_alpha` is the regularization strength of KernelRidge (which has no `C`
# parameter); its default of 1 matches the library default.
kr_alpha = 1 # @param {type:"number"}
kr_gamma = 1 # @param {type:"number"}
kr_degree = 3 # @param {type:"integer"}
kr_kernel = "rbf" # @param ["linear", "poly", "rbf", "sigmoid"]

####### ----------------------------

X = StandardScaler().fit_transform(X)

if model == "LinearRegression":
  algo = linear_model.LinearRegression()

elif model == "SVR":
  algo = svm.SVR(kernel=kernel, degree=degree, gamma=gamma, C=C)

elif model == "kNR":
  algo = KNeighborsRegressor(n_neighbors=num_neighbors, weights=weights)

elif model == "Bayesian Ridge":
  algo = linear_model.BayesianRidge()

elif model == "Kernel Ridge":
  algo = KernelRidge(alpha=kr_alpha, kernel=kr_kernel, degree=kr_degree, gamma=kr_gamma)

else:
  # Fail loudly instead of silently keeping an estimator from a previous run
  raise ValueError("Unknown model: {0!r}".format(model))



print ("\nModel is ready!⚙️🔧\n")
# print ("----- ")
print (" - Model: {0}".format(model))

if model == "SVR":
  print (" - kernel: {0}".format(kernel))
  if kernel == 'poly':
    # `degree` is only reported for the polynomial kernel
    print (" - degree: {0}".format(degree))
  print (" - gamma: {0}".format(gamma))
  print (" - C: {0}".format(C))

if model == "kNR":
  print (" - Num neighbors: {0}".format(num_neighbors))
  print (" - Weights: {0}".format(weights))

if model == "Kernel Ridge":
  print (" - kernel: {0}".format(kr_kernel))
  if kr_kernel == 'poly':
    print (" - degree: {0}".format(kr_degree))
  print (" - gamma: {0}".format(kr_gamma))
  print (" - alpha: {0}".format(kr_alpha))

# **🤖 Run training!**

In [None]:
from itertools import cycle, islice

# @markdown ### Start now 🦾
# @markdown \

# Fit the estimator chosen in the Model Lab (`algo`) and plot the result.
# Reads  : algo, model, is_regression, X and (classification only) y.
# Writes : pred (predictions on the training data), ax, and X_ / pred_ for
#          regression.
# Regression convention: the last column of X is the target, every other
# column is a feature.

if is_regression:
  # 1-D target: avoids column-vector warnings and keeps `pred` 1-D
  features, target = X[:, :-1], X[:, -1]
  algo.fit(features, target)
  pred = np.ravel(algo.predict(features))
else:
  if y is None:
    raise ValueError("Classification needs labels, but `y` is None: "
                     "load or generate a labelled dataset first.")
  algo.fit(X, y)
  pred = algo.predict(X)

# Reported only once fitting has actually succeeded
print ("\nTraining done! ✅")

_, ax = plt.subplots(figsize=(5,4))
ax.set_title("{0}".format(model))

if is_regression:
  # Order the points along the first feature so the fitted curve is drawn
  # left to right (stable sort keeps tied x values in their original order).
  order = np.argsort(X[:, 0], kind="stable")
  X_, pred_ = X[order, 0], pred[order]
  ax.scatter(X[:, 0], X[:, -1], edgecolors='k')
  ax.plot(X_, pred_, c='r')
else:
  # One colour per class index; the palette is cycled when there are more
  # classes than colours. Assumes the predictions are non-negative integer
  # labels, since they are used directly as indices into `colors`.
  colors = np.array(list(islice(cycle(["#377eb8", "#ff7f00", "#4daf4a", "#f781bf", "#a65628",
                                       "#984ea3", "#999999", "#e41a1c", "#dede00", "#000000"]),
                                int(max(pred) + 1))))
  ax.scatter(X[:, 0], X[:, 1], c=colors[pred], edgecolors='k')

print ("Plots... \n")


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error

# @markdown ### 📊 Compute metrics
# @markdown \
# @markdown #### Classification

# Metrics for the estimator fitted by the training cell (`algo`).
# The classification metrics compare `algo.predict(X)` with the labels `y`;
# the regression metrics compare the predictions made from the feature
# columns `X[:, :-1]` with the target stored in the last column of X.

compute_confusion_matrix = False # @param {type:"boolean"}
compute_metrics_report = False # @param {type:"boolean"}

# @markdown \

# @markdown #### Regression
compute_mae_mse = True # @param {type:"boolean"}


if compute_confusion_matrix or compute_metrics_report:
    # Shared by both classification outputs, so the report no longer depends
    # on the confusion matrix having been requested as well.
    predictions = algo.predict(X)

if compute_confusion_matrix:
    # Row-normalized confusion matrix (each true class sums to 1)
    cm_ = confusion_matrix(y, predictions, normalize='true')

    _, ax = plt.subplots (figsize=(5,3))
    sns.heatmap(cm_, annot=True, cmap='hot', ax= ax)

if compute_metrics_report:
    print(classification_report(y, predictions))

if compute_mae_mse:
    predictions = algo.predict(X[:, :-1])
    # Errors are measured against the target column, not against the features
    mae_ = mean_absolute_error(X[:, -1], predictions)
    mse_ = mean_squared_error (X[:, -1], predictions)
    print("MAE: {0:0.5f}".format(mae_))
    print("MSE: {0:0.5f}".format(mse_))

# **🌳 Code Lab - Going back to the roots**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier as kNN

In [None]:
# Exercise template: replace each `_` placeholder before running the cell.
# The keyword accepted by KNeighborsClassifier is `n_neighbors`
# (`num_neighbors` would raise a TypeError).
knn = kNN(n_neighbors = _ )

# Train kNN
_

In [None]:
# Create predictions
# Exercise template: replace the `_` placeholder with the samples to classify
# (presumably the same feature matrix the classifier was trained on).
predictions = knn.predict( _ )
# Print the shape of the returned prediction array
print (predictions.shape)

In [None]:
# Figure with two side-by-side axes. Note that this line rebinds `_` to the
# figure object, so the `_` placeholders below must be replaced before running.
_, axes = plt.subplots (1,2, figsize=(8,4))

# plot data and labels
# Exercise template: fill in the x values, the y values and the colour values.
# NOTE(review): only axes[1] is drawn here; axes[0] is left empty - presumably
# reserved for a second plot to be added by the reader.
axes[1].scatter( _ , _ , c = _ , edgecolors='k', cmap='Paired')