In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the clothes-size CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): the file is named final_test.csv but is used below for both training
# and testing -- confirm this is the intended file.
clothes = pd.read_csv('/kaggle/input/clothessizeprediction/final_test.csv')

In [None]:
# Preview the first five rows to see which columns are present
clothes.head(n=5)

In [None]:
# Let's check the summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns in the dataset
clothes.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the dataset
clothes.info()

In [None]:
# Count the missing values in each column
clothes.isna().sum()

In [None]:
# Since it is a fairly large dataset with 119734 rows, dropping the rows that contain
# null values will have a very small effect on the model.
# Rebind the name instead of using inplace=True: the loaded frame is not mutated,
# and the cell reads as a plain "input -> output" step.
clothes = clothes.dropna()

In [None]:
# Count the rows where age, weight or height is recorded as 0.
# Summing a boolean mask gives the number of matching rows.
age_zero_count = (clothes['age'] == 0).sum()
weight_zero_count = (clothes['weight'] == 0).sum()
height_zero_count = (clothes['height'] == 0).sum()

print(f"There are {age_zero_count} rows with value of age as 0")
print(f"There are {weight_zero_count} rows with values of weight as 0")
print(f"There are {height_zero_count} rows with values of height as 0")

In [None]:
# Remove the rows where age is 0 (18 rows in the recorded run), since a customer
# cannot have an age of 0.
# The result is rebound to `clothes` rather than dropped with inplace=True, so the
# frame produced by the previous step is not mutated.
age_is_0 = clothes[clothes['age']==0]
clothes = clothes.drop(index=age_is_0.index)

In [None]:
# importing the libraries for visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# List the distinct clothing sizes present in the data
# (the count plot in the next cell shows how many customers wear each size)
pd.unique(clothes['size'])

In [None]:
# Plot the number of customers per size, with bars ordered from smallest to largest size.
sns.set_style('darkgrid')
order = ['XXS','S','M','L','XL','XXL','XXXL']
sns.countplot(
    data=clothes,
    x='size',
    order=order,
    palette='Spectral',
)

There are very few customers who wear XXL size clothes.

In [None]:
# Explore how age, weight and height relate to clothing size, one box plot per feature.
# Size vs Age
sns.boxplot(data=clothes, x='size', y='age', order=order)

In [None]:
# Size vs Weight: distribution of customer weight within each size
sns.boxplot(data=clothes, x='size', y='weight', order=order)

In [None]:
# Size vs Height: distribution of customer height within each size
sns.boxplot(data=clothes, x='size', y='height', order=order)

Weight has the most impact on the size of clothes: as weight increases, the size of the clothes increases.

In [None]:
# Let's check the correlation between the numeric columns: weight, age, height.
# 'size' holds string labels, so only numeric columns are passed to corr();
# on pandas >= 2.0 calling corr() on a frame with a string column raises an error.
clothes_corr = clothes.select_dtypes(include='number').corr()
sns.heatmap(clothes_corr,annot=True,cmap='YlOrBr')

In [None]:
# Separate the data into the model inputs X (every column except 'size')
# and the target y (the 'size' column), both as NumPy arrays.
y = clothes['size'].to_numpy()
X = clothes.drop(columns='size').to_numpy()

In [None]:
# Let's check what type of values are present in y (the target array taken from 'size')
y

In [None]:
# y has dtype object (string labels), which an ML model cannot process directly.
# We will one-hot encode it in two steps: sklearn's LabelEncoder maps each label to an
# integer, then keras' to_categorical turns those integers into one-hot vectors
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
# Map each size label to an integer, then expand the integers into one-hot rows.
# The fitted encoder is kept so that class indices can be mapped back to labels.
encoder = LabelEncoder()
y_encoded = to_categorical(encoder.fit_transform(y))
y_encoded

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Scale the features with sklearn's MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on X_train only, never on X_test, to avoid data leakage;
# the same fitted scaler is then applied to both sets.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ANN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Create a feed-forward classifier with 2 hidden layers of 7 nodes each.
# The input width and the number of output classes are read from the prepared arrays
# instead of being hard-coded, so the model stays in sync with the data.
n_features = X_train_scaled.shape[1]
n_classes = y_train.shape[1]

model = Sequential()

# 1st hidden layer
model.add(Dense(7,activation='relu',input_shape=(n_features,)))

# 2nd hidden layer
model.add(Dense(7,activation='relu'))

# output layer: one softmax node per one-hot column of y (one per clothing size)
model.add(Dense(n_classes,activation='softmax'))

# compiling the model
# metrics is passed as a list: Keras 3 rejects a bare string here
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

# model summary
model.summary()

In [None]:
# Early stopping is used to avoid overfitting the model to the training data. It monitors
# the validation loss during training, which lets us choose a large number of epochs
# and stop once the validation loss has not improved for `patience` epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest val_loss;
# without it the model keeps the weights of the last epoch run, `patience` epochs past the best.
early_stop = EarlyStopping(monitor='val_loss',mode='min',verbose = 1, patience=20, restore_best_weights=True)

In [None]:
# Fit the model to the training data.
# Validation data for early stopping is carved out of the training set (the last 20% of
# the rows, which train_test_split has already shuffled) rather than taken from X_test.
# X_test therefore never influences when training stops, and the evaluation on X_test
# further down stays an unbiased estimate.
model.fit(X_train_scaled,y_train,validation_split=0.2,epochs=300,callbacks=[early_stop])

**Model Evaluation**

In [None]:
# Collect the per-epoch training history (one column per recorded metric) into a DataFrame
training_history = model.history.history
metrics = pd.DataFrame(data=training_history)
metrics.head(n=5)

In [None]:
# Training loss vs validation loss per epoch
metrics.plot(y=['loss','val_loss'])

In [None]:
# Training accuracy vs validation accuracy per epoch
metrics.plot(y=['accuracy','val_accuracy'])

In [None]:
# Evaluate the trained model on the scaled test set; returns the loss followed by the compiled metrics
model.evaluate(x=X_test_scaled, y=y_test)

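**Accuracy of the model on the test set is 52.22%** (value from the recorded run; it can vary slightly between runs because the network weights are initialised randomly)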