In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Locate the dataset under the Kaggle input directory.
# os.walk visits directories and files in a filesystem-dependent order, so
# keeping "whichever file was seen last" is not reproducible and can select a
# non-CSV file. Gather every path, sort for determinism, and prefer CSVs.
input_paths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
csv_paths = [path for path in input_paths if path.lower().endswith('.csv')]

# Fall back to any file when no *.csv exists (single-file datasets keep
# working regardless of extension). `f` stays "" when the input directory is
# empty or missing so the name is always defined for the next cell.
candidate_paths = csv_paths or input_paths
f = candidate_paths[0] if candidate_paths else ""

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Input the Data set

In [None]:
# Parse the file located by the discovery cell (path held in `f`) into a
# DataFrame, then echo it so the columns and a sample of rows are visible.
data = pd.read_csv(filepath_or_buffer=f)
print(data)

# Data Cleaning

1. Check null values

In [None]:
# Count the missing entries in every column of the raw frame.
missing_per_column = data.isna().sum()
print(missing_per_column)

2. Fill in the missing values via mean (missing values are numerical)

In [None]:
from sklearn.impute import SimpleImputer

# The three numeric predictors used by every model below.
feature_columns = ['weight', 'age', 'height']

# Replace each NaN with the mean of its own column; the result is a plain
# NumPy array (the imputer does not return a DataFrame here).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(data[feature_columns])
print(X)

3. (Optional) Convert categorical class to numeric - label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Flatten the single-column 'size' frame into a 1-D array of raw class labels.
y = data[['size']].to_numpy().ravel()

# Encode the labels as integers; the fitted encoder exposes the distinct
# original labels through `classes_`, kept in `y_classes` for later cells.
le = LabelEncoder()
le_y = le.fit_transform(y)
y_classes = le.classes_

print(f"Encoded labels: {le_y}")
print(f"Class labels: {y_classes}")

# Deep Learning Classification

In [None]:
# All names this cell needs are imported here so it runs on a fresh kernel:
# - `train_test_split` was previously only imported in a later cell;
# - `keras` was referenced (keras.callbacks / keras.Sequential) without ever
#   being bound -- `from keras.models import ...` does not bind `keras`.
# Everything is taken from the `tensorflow.keras` namespace so that layers,
# model and callbacks all come from the same Keras implementation.
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import SGD

# Hold out a third of the samples; targets are the integer-encoded labels
# (le_y), which is what sparse_categorical_crossentropy expects.
X_train, X_test, y_train, y_test = train_test_split(X, le_y, test_size=0.33, random_state=42)

# Prevent overfitting: stop once the monitored validation metric has not
# improved by at least min_delta for `patience` epochs, and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# Build model: three Dense blocks, each followed by batch norm and dropout,
# ending in a softmax over the label classes.
model = keras.Sequential([
    layers.BatchNormalization(input_shape=[3]),  # Inputs = weight, age, height
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.25),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.25),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.25),

    layers.Dense(len(y_classes), activation='softmax'),  # Multi-class classification
])

# Optimizer + loss
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    # optimizer=SGD(learning_rate=0.005),
    metrics=['accuracy'],
)

# Train model.
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on the same data the reported metrics come from.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=512,
    epochs=50,
    callbacks=[early_stopping],
)

# One row per epoch; plot the training and validation curves side by side.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot(title="Sparse Categorical Cross-entropy")
history_df.loc[:, ['accuracy', 'val_accuracy']].plot(title=('Accuracy'))



In [None]:
# Reduce each metric column as a Series so the result is a plain scalar.
# np.max / np.min applied to a one-column DataFrame returns a Series on some
# pandas versions and a scalar on others, which made the printed text
# version-dependent (a Series repr with dtype footer instead of a number).
best_val_accuracy = history_df['val_accuracy'].max()
best_val_loss = history_df['val_loss'].min()

print("Max validation accuracy: " + str(best_val_accuracy))
print("Min validation loss: " + str(best_val_loss))

## Baseline

1. 3 layers
2. dropout=0.3
3. relu
4. adam
5. batch norm
6. 128 neurons
7. batch_size=512
8. epochs=50

|Accuracy|Validation loss|
|--------|---------------|
|0.518285| 1.121909      |

## Changed

|Parameters|Accuracy|Validation loss|
|----------|--------|---------------|
| elu | 0.51502 | 1.136109 |
| dropout=0.25| 0.518437 | 1.122018 |
| dropout=0.5| 0.517627 | 1.122494|
| 4 layers | 0.517652 | 1.122734|
| 64,32 neurons | 0.519019 | 1.124153|
| 32,16 neurons | 0.516311 | 1.127735|
| 64 neurons| 0.518488 | 1.127987| 
| 128 neurons | 0.515906 | 1.128714 |
| 64,64 | 0.516918 | 1.125243|
| 64,32,16| 0.517172 | 1.126445|
| 64,32 no batch norm | 0.518007 | 1.127214|

# Machine Learning Classification

0. Split into testing and training sets

In [None]:
from sklearn.model_selection import train_test_split

# Re-split for the classical models: same features, but the targets are the
# raw (un-encoded) labels in `y`, and the seed differs from the deep-learning split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

# Echo the four partitions in the order: train features, train labels,
# test features, test labels.
for partition in (X_train, y_train, X_test, y_test):
    print(partition)

1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a liblinear-solver logistic regression on the training split and score
# its predictions on the held-out split.
lr = LogisticRegression(solver="liblinear")
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Logistic Regression: %.2f" % (accuracy_lr,))

2. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings: fit on the training split,
# then measure accuracy on the held-out split.
nb = GaussianNB().fit(X_train, y_train)

y_pred = nb.predict(X_test)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Naive Bayes: %.2f" % (accuracy_nb,))

3. Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent using the
# modified-Huber loss; the seed is fixed so the shuffling is repeatable.
sgd = SGDClassifier(
    loss='modified_huber',
    shuffle=True,
    random_state=0,
).fit(X_train, y_train)

y_pred = sgd.predict(X_test)
accuracy_sgd = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Stochastic Gradient Descent: %.2f" % (accuracy_sgd,))

4. K-Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 15 closest training points.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred)
# The message previously said "Random Forest" (copy-paste slip), which made
# the KNN score indistinguishable from the real random-forest result below.
print("Accuracy for K-Nearest Neighbour: %.2f" % accuracy_knn)

5. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree: depth capped at 5, at least 5 samples per leaf, all
# features considered at each split, fixed seed.
dtree = DecisionTreeClassifier(
    max_depth=5,
    random_state=0,
    max_features=None,
    min_samples_leaf=5,
).fit(X_train, y_train)

y_pred = dtree.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Decision Tree: %.2f" % (accuracy_dt,))

6. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees with out-of-bag scoring enabled, trained on 3 parallel
# jobs; every feature is considered at each split and leaves hold >= 15 samples.
rfm = RandomForestClassifier(
    n_estimators=50,
    oob_score=True,
    n_jobs=3,
    random_state=0,
    max_features=None,
    min_samples_leaf=15,
).fit(X_train, y_train)

y_pred = rfm.predict(X_test)
accuracy_rfm = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Random Forest: %.2f" % (accuracy_rfm,))

7. Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C=1, fixed seed.
svm = SVC(kernel="linear", C=1, random_state=0).fit(X_train, y_train)

y_pred = svm.predict(X_test)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy for Support Vector Machine: %.2f" % (accuracy_svm,))

# Evaluation

In [None]:
import matplotlib.pyplot as plt

# Pair every classical model trained above with its test accuracy. Keeping
# label and value together prevents the two lists drifting apart; the SGD
# classifier was trained and scored earlier but was missing from this chart.
model_accuracies = {
    'LR': accuracy_lr,
    'NB': accuracy_nb,
    'SGD': accuracy_sgd,
    'KNN': accuracy_knn,
    'DT': accuracy_dt,
    'RF': accuracy_rfm,
    'SVM': accuracy_svm,
}
labels = list(model_accuracies)
accuracies = list(model_accuracies.values())

# Bar/tick positions are derived from the number of models rather than
# hard-coded, so adding or removing a model cannot desynchronise the axis.
x = list(range(len(labels)))

fig, ax = plt.subplots()
ax.bar(x=x, height=accuracies)

ax.set_ylabel('Accuracy')
ax.set_title('Classification Accuracy of ML models')
ax.set_xticks(x)
ax.set_xticklabels(labels)

plt.show()