# Assignment 3 - Option 2
## Ruan Buhr - 26440873

In [56]:
import numpy as np
import tensorflow as tf
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.datasets import mnist, cifar10

from sklearn.datasets import load_iris, load_wine, load_digits, fetch_california_housing, load_diabetes, make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error

import math
import random
from collections import Counter

## Utilities

In [38]:
def set_seed(seed=42):
    """Seed the NumPy, TensorFlow and Python global random generators.

    Args:
        seed: Value handed unchanged to each of the three seeding calls
            (defaults to 42).
    """
    # Same order as before: NumPy, then TensorFlow, then the stdlib `random`.
    for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
        seed_fn(seed)

## Datasets

### Classification: Iris

In [39]:
# Load the Iris bunch and gather its feature matrix plus the class label
# (stored under "target") in a single DataFrame.
iris = load_iris()
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)

In [49]:
# Overview of the Iris frame: first rows, shape, summary statistics and
# per-column missing-value counts, each separated by two blank lines.
print("\n\n\n".join(
    str(part)
    for part in (df_iris.head(), df_iris.shape, df_iris.describe(), df_iris.isna().sum())
))

   sepal length (cm)  sepal width (cm)  ...  petal width (cm)  target
0                5.1               3.5  ...               0.2       0
1                4.9               3.0  ...               0.2       0
2                4.7               3.2  ...               0.2       0
3                4.6               3.1  ...               0.2       0
4                5.0               3.6  ...               0.2       0

[5 rows x 5 columns]


(150, 5)


       sepal length (cm)  sepal width (cm)  ...  petal width (cm)      target
count         150.000000        150.000000  ...        150.000000  150.000000
mean            5.843333          3.057333  ...          1.199333    1.000000
std             0.828066          0.435866  ...          0.762238    0.819232
min             4.300000          2.000000  ...          0.100000    0.000000
25%             5.100000          2.800000  ...          0.300000    0.000000
50%             5.800000          3.000000  ...          1.300000    1.000000

### Classification: MNIST

In [45]:
# Fetch both MNIST splits, flatten every image to one row, and stack the train
# rows on top of the test rows so the full dataset lives in a single X / y pair.
(Xtr, ytr), (Xte, yte) = mnist.load_data()
X = np.concatenate(
    [split.reshape(split.shape[0], -1) for split in (Xtr, Xte)],
    axis=0,
)
y = np.concatenate((ytr, yte))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [46]:
# One column per flattened pixel (px0, px1, ...) followed by the digit label.
pixel_columns = [f"px{idx}" for idx in range(X.shape[1])]
df_mnist = pd.DataFrame(data=X, columns=pixel_columns)
df_mnist["target"] = y

In [50]:
# Overview of the MNIST frame: first rows, shape, summary statistics and
# per-column missing-value counts, each separated by two blank lines.
print("\n\n\n".join(
    str(part)
    for part in (df_mnist.head(), df_mnist.shape, df_mnist.describe(), df_mnist.isna().sum())
))

   px0  px1  px2  px3  px4  px5  ...  px779  px780  px781  px782  px783  target
0    0    0    0    0    0    0  ...      0      0      0      0      0       5
1    0    0    0    0    0    0  ...      0      0      0      0      0       0
2    0    0    0    0    0    0  ...      0      0      0      0      0       4
3    0    0    0    0    0    0  ...      0      0      0      0      0       1
4    0    0    0    0    0    0  ...      0      0      0      0      0       9

[5 rows x 785 columns]


(70000, 785)


           px0      px1      px2  ...    px782    px783        target
count  70000.0  70000.0  70000.0  ...  70000.0  70000.0  70000.000000
mean       0.0      0.0      0.0  ...      0.0      0.0      4.452429
std        0.0      0.0      0.0  ...      0.0      0.0      2.890195
min        0.0      0.0      0.0  ...      0.0      0.0      0.000000
25%        0.0      0.0      0.0  ...      0.0      0.0      2.000000
50%        0.0      0.0      0.0  ...      0.0      0.0    

### Classification: CIFAR-10

In [52]:
# Fetch both CIFAR-10 splits; the label arrays are flattened to 1-D first.
(Xtr, ytr), (Xte, yte) = cifar10.load_data()
ytr = ytr.ravel()
yte = yte.ravel()

# Flatten every image to one row and stack train on top of test.
X = np.concatenate(
    [split.reshape(split.shape[0], -1) for split in (Xtr, Xte)],
    axis=0,
)
y = np.concatenate((ytr, yte))

# One column per flattened value (px0, px1, ...) followed by the class label.
df_cifar = pd.DataFrame(data=X, columns=[f"px{idx}" for idx in range(X.shape[1])])
df_cifar["target"] = y

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 0us/step


In [53]:
# Overview of the CIFAR-10 frame: first rows, shape, summary statistics and
# per-column missing-value counts, each separated by two blank lines.
print("\n\n\n".join(
    str(part)
    for part in (df_cifar.head(), df_cifar.shape, df_cifar.describe(), df_cifar.isna().sum())
))

   px0  px1  px2  px3  px4  px5  ...  px3067  px3068  px3069  px3070  px3071  target
0   59   62   63   43   46   45  ...     118      84     123      92      72       6
1  154  177  187  126  137  136  ...     134     142     143     133     144       9
2  255  255  255  253  253  253  ...      85      83      80      86      84       9
3   28   25   10   37   34   19  ...      56      37      72      65      46       4
4  170  180  198  168  178  196  ...      75      78      73      77      80       1

[5 rows x 3073 columns]


(60000, 3073)


                px0           px1  ...        px3071        target
count  60000.000000  60000.000000  ...  60000.000000  60000.000000
mean     130.681517    136.054483  ...    114.539400      4.500000
std       73.399021     72.871161  ...     66.008946      2.872305
min        0.000000      0.000000  ...      0.000000      0.000000
25%       71.000000     77.000000  ...     63.000000      2.000000
50%      128.000000    135.000000  ...    106

### Regression: Synthetic (make_regression) and California Housing

In [None]:
# Synthetic regression problem generated with a fixed random_state.
X, y = make_regression(n_samples=10000, n_features=10, noise=1.0, random_state=42)

# Features are named f0, f1, ...; the regression target goes under "target".
synthetic_columns = [f"f{idx}" for idx in range(X.shape[1])]
df_syn = pd.DataFrame(data=X, columns=synthetic_columns).assign(target=y)

In [60]:
print(df_syn.head(), df_syn.shape, df_syn.describe(), df_syn.isna().sum(), sep="\n\n\n")

         f0        f1        f2  ...        f8        f9      target
0 -2.946904 -0.429991  0.888988  ...  0.220842  0.539951   71.003189
1  1.154356 -0.946784  0.481741  ...  0.412254 -0.095042  -48.831110
2 -0.834083 -0.527566  0.065442  ...  0.215530  1.059958 -100.904069
3 -0.732383  0.388215  1.512235  ... -1.679208 -2.741489 -164.577952
4 -0.998797 -0.685263  0.650593  ...  0.120906  0.415295  -23.248771

[5 rows x 11 columns]


(5000, 11)


                f0           f1  ...           f9       target
count  5000.000000  5000.000000  ...  5000.000000  5000.000000
mean      0.004768    -0.014755  ...    -0.014441    -3.253507
std       0.993223     0.995310  ...     1.000081   165.005013
min      -3.307900    -3.631539  ...    -3.635200  -563.550670
25%      -0.652975    -0.691543  ...    -0.700048  -112.440317
50%       0.009642     0.001213  ...    -0.019127    -4.092725
75%       0.674679     0.651014  ...     0.672435   109.175343
max       3.377768     4.479084  ...     3.6

In [54]:
# Load the California Housing bunch and gather its features plus the
# regression target (stored under "target") in a single DataFrame.
housing = fetch_california_housing()
df_housing = pd.DataFrame(data=housing.data, columns=housing.feature_names).assign(
    target=housing.target
)

In [55]:
# Overview of the housing frame: first rows, shape, summary statistics and
# per-column missing-value counts, each separated by two blank lines.
print("\n\n\n".join(
    str(part)
    for part in (
        df_housing.head(),
        df_housing.shape,
        df_housing.describe(),
        df_housing.isna().sum(),
    )
))

   MedInc  HouseAge  AveRooms  AveBedrms  ...  AveOccup  Latitude  Longitude  target
0  8.3252      41.0  6.984127   1.023810  ...  2.555556     37.88    -122.23   4.526
1  8.3014      21.0  6.238137   0.971880  ...  2.109842     37.86    -122.22   3.585
2  7.2574      52.0  8.288136   1.073446  ...  2.802260     37.85    -122.24   3.521
3  5.6431      52.0  5.817352   1.073059  ...  2.547945     37.85    -122.25   3.413
4  3.8462      52.0  6.281853   1.081081  ...  2.181467     37.85    -122.25   3.422

[5 rows x 9 columns]


(20640, 9)


             MedInc      HouseAge  ...     Longitude        target
count  20640.000000  20640.000000  ...  20640.000000  20640.000000
mean       3.870671     28.639486  ...   -119.569704      2.068558
std        1.899822     12.585558  ...      2.003532      1.153956
min        0.499900      1.000000  ...   -124.350000      0.149990
25%        2.563400     18.000000  ...   -121.800000      1.196000
50%        3.534800     29.000000  ...   -118.49000

### Regression: Diabetes

In [57]:
# Load the Diabetes bunch and gather its features plus the regression target
# (stored under "target") in a single DataFrame.
diab = load_diabetes()
df_diab = pd.DataFrame(data=diab.data, columns=diab.feature_names).assign(target=diab.target)

In [58]:
# Overview of the diabetes frame: first rows, shape, summary statistics and
# per-column missing-value counts, each separated by two blank lines.
print("\n\n\n".join(
    str(part)
    for part in (df_diab.head(), df_diab.shape, df_diab.describe(), df_diab.isna().sum())
))

        age       sex       bmi        bp  ...        s4        s5        s6  target
0  0.038076  0.050680  0.061696  0.021872  ... -0.002592  0.019907 -0.017646   151.0
1 -0.001882 -0.044642 -0.051474 -0.026328  ... -0.039493 -0.068332 -0.092204    75.0
2  0.085299  0.050680  0.044451 -0.005670  ... -0.002592  0.002861 -0.025930   141.0
3 -0.089063 -0.044642 -0.011595 -0.036656  ...  0.034309  0.022688 -0.009362   206.0
4  0.005383 -0.044642 -0.036385  0.021872  ... -0.002592 -0.031988 -0.046641   135.0

[5 rows x 11 columns]


(442, 11)


                age           sex  ...            s6      target
count  4.420000e+02  4.420000e+02  ...  4.420000e+02  442.000000
mean  -2.511817e-19  1.230790e-17  ...  1.130318e-17  152.133484
std    4.761905e-02  4.761905e-02  ...  4.761905e-02   77.093005
min   -1.072256e-01 -4.464164e-02  ... -1.377672e-01   25.000000
25%   -3.729927e-02 -4.464164e-02  ... -3.317903e-02   87.000000
50%    5.383060e-03 -4.464164e-02  ... -1.077698e-03  140.50000