In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
# Initialise Plotly's notebook rendering before any figure is shown.
init_notebook_mode(connected = True)

import cufflinks as cf

# Use Plotly locally
cf.go_offline()

# About data
1. Age: in years
2. Weight: in kg
3. Height: in cm

## Read the data from the file

In [None]:
# Load the dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/clothessizeprediction/final_test.csv")
df.head()

## See the information about the dtypes
Dtypes tell us what kind of data each column holds. They also help us determine whether each column has the correct dtype.

In [None]:
# Print each column's dtype and non-null count.
df.info()

From the output above we can see that some values in the age and height columns are null.

## Describe the statistics of the column.
The `describe()` function summarizes the statistics of the integer/float columns. Passing `include='all'` would also summarize the object columns; the cell below uses the default and therefore reports the numeric columns only.

In [None]:
# Transposed so that each column's statistics read across a single row.
df.describe().T

From the table we can see that the minimum and maximum values of age and weight could be considered outliers, depending on the other parameters.

# Age

## Displaying unique values

Let us see what are the different unique values present in the age column.

In [None]:
# Distinct values found in the age column.
age_unique = df["age"].unique()
print("Unique values in age column:\n", age_unique)

The array above contains nan, a minimum value of 0 (which would be a child who is not yet 1 year old) and a maximum value of 117 (a very old person). Given the cloth sizes recorded for these rows, such values are unlikely to be correct.

## Plotting to see outliers

In [None]:
# Box plot of age for every clothing size, one colour per size.
fig = px.box(df, x="size", y="age", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Age", font=dict(size=12))),
    title="<b>Outlier detection in case of age vs size</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

In the figure above the outliers are clearly visible, so I use the trimming method to eliminate them. While trimming, care is taken not to remove too much data.

From the plot above it is clear that most of the outliers are below 10 or above 80.
Assuming that people aged 80 and above, or 10 and below, do little online shopping, I removed the rows whose age falls in those ranges.

## Dropping the rows in specific range

In [None]:
# Collect the labels of rows aged 80 and above or 10 and below, then drop
# them from df in place. A missing age fails both comparisons, so rows
# without an age are kept.
index_age = df.loc[df['age'].ge(80) | df['age'].le(10)].index
df.drop(index=index_age, inplace=True)
df['age'].describe()

## Plotting to see result of trimming

In [None]:
# Same box plot as before the trimming, drawn from the reduced frame.
fig = px.box(df, x="size", y="age", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Age", font=dict(size=12))),
    title="<b>Plot after removal of outlier</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

## Analyzing missing values

Let us see if there is any null value in the age column after removing outlier rows.

In [None]:
# Rows in which the age is missing.
age_nan = df[df["age"].isna()]
age_nan

As we can see, null values are present, so they need to be handled properly. To do this I will replace each null value according to the size of the clothes: for example, the average age of size "L" will replace the nan values present in the age column for rows of size "L".

## Calculating the mean

In order to achieve the result I grouped the rows according to the size and calculated the mean of the age.

In [None]:
# Mean age for each size, rounded to a whole number of years.
age_mean = df.groupby("size")["age"].mean().round()
age_mean

## Replacing missing values with the mean

Let us create the function to replace the nan value. In the function three parameters are passed namely dataframe, size and mean age value. 

In [None]:
def missing_age(df, size, mean_age):
    """
    Fill the missing ages of one clothing size with a fixed value.

    Every row whose "size" equals `size` and whose "age" is null gets
    `mean_age` written into its "age" column. The frame is modified in
    place and the same frame is returned.
    """
    needs_fill = df["age"].isnull() & (df["size"] == size)
    df.loc[needs_fill, "age"] = mean_age
    return df

In [None]:
# Fill the missing ages size by size with the per-size means held in
# `age_mean`, rather than hand-copied numbers, so the fill values always
# match the data the means were computed from.
for size_label, size_mean_age in age_mean.items():
    missing_age(df, size_label, size_mean_age)

# Number of ages that are still missing after the fill.
df["age"].isna().sum()

The nan values in the age column are replaced by the mean values. 

## Plotting to see age distribution

After removing the outliers and replacing the nan value with mean value let us see the distribution of age.

In [None]:
# Histogram of age in a single colour. One entry in the colour sequence is
# enough; repeating it len(df) times only builds a large throwaway list.
fig = px.histogram(df, x="age", color_discrete_sequence=["#ff1c49"])

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Age", font=dict(size=12))),
    yaxis=dict(title=dict(text="Age Count", font=dict(size=12))),
    title="<b>Distribution of age </b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

The plot above shows that 29-30 years is the most frequent age and that the distribution is skewed to the right.

# Height

## Unique values
Let us see the list of unique values present in the height column

In [None]:
# Distinct values found in the height column.
height_unique = df["height"].unique()
height_unique

137.16 is the minimum value while 193.04 is the maximum value. nan is also present in the column. Let us see the distribution of height according to the clothing size.

## Plotting to see outliers

In [None]:
# Box plot of height for every clothing size, one colour per size.
fig = px.box(df, x="size", y="height", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Height", font=dict(size=12))),
    title="<b>Outlier detection in case of height vs size</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

After visualizing the plot above we see that the outliers lie below 144 cm and above 180 cm. Let us remove these outliers to make our data cleaner.

## Dropping the rows in specific range

In [None]:
# Collect the labels of rows with a height of 180 and above or 144 and
# below, then drop them from df in place. A missing height fails both
# comparisons, so those rows are kept.
height_index = df.loc[df['height'].ge(180) | df['height'].le(144)].index
df.drop(index=height_index, inplace=True)
df['height'].describe()

## Plotting to see result of trimming

In [None]:
# Same box plot as before the trimming, drawn from the reduced frame.
fig = px.box(df, x="size", y="height", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Height", font=dict(size=12))),
    title="<b>Plot after removal of outlier</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

## Analyzing the missing values

Let us replace the nan values in height column. The same procedure as above is followed to replace the values.

In [None]:
# Rows in which the height is missing.
height_missing = df[df["height"].isna()]
height_missing

## Calculating the mean

In [None]:
# Mean height for each size (left unrounded).
height_mean = df.groupby("size")["height"].mean()
height_mean

## Replacing missing values with the mean

In [None]:
def missing_height(df, size, mean_height):
    """
    Fill the missing heights of one clothing size with a fixed value.

    Rows are selected when their "size" equals `size` and their "height"
    is null; `mean_height` is written into "height" for those rows. The
    frame is updated in place and returned.
    """
    is_target_size = df["size"] == size
    height_is_null = df["height"].isnull()
    df.loc[is_target_size & height_is_null, "height"] = mean_height
    return df

In [None]:
# Fill the missing heights size by size with the per-size means held in
# `height_mean` (rounded to two decimals), rather than hand-copied numbers,
# so the fill values always match the data the means were computed from.
for size_label, size_mean_height in height_mean.round(2).items():
    missing_height(df, size_label, size_mean_height)

# Number of heights that are still missing after the fill.
df["height"].isna().sum()

## Plotting to see height distribution
After removing outliers and replacing nan values let us see the height distribution.

In [None]:
# Histogram of height in a single colour. One entry in the colour sequence
# is enough; repeating it len(df) times only builds a large throwaway list.
fig = px.histogram(df, x="height", color_discrete_sequence=["#d21cff"])

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Height", font=dict(size=12))),
    yaxis=dict(title=dict(text="Height Count", font=dict(size=12))),
    title="<b>Distribution of height </b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

A spike is present at the height of 162 cm. Also some count for height is very low.

# Weight

## Displaying the unique values

In [None]:
# Distinct values found in the weight column.
weight_unique = df["weight"].unique()
weight_unique

The minimum value is 22kg and maximum is 136 kg. There is no nan value present in the weight column.

## Plotting the weight to see outliers

In [None]:
# Box plot of weight for every clothing size, one colour per size.
fig = px.box(df, x="size", y="weight", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Weight", font=dict(size=12))),
    title="<b>Outlier detection in case of weight vs size</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

From the plot above it is clear that the weight distribution of XXXL differs from that of the other sizes. Let us treat weights of 40 kg and below, or 90 kg and above, as outliers.

In [None]:
# Collect the labels of rows weighing 90 and above or 40 and below, then
# drop them from df in place.
weight_index = df.loc[df['weight'].ge(90) | df['weight'].le(40)].index
df.drop(index=weight_index, inplace=True)
df['weight'].describe()

## Plotting to see result of trimming

In [None]:
# Same box plot as before the trimming, drawn from the reduced frame.
fig = px.box(df, x="size", y="weight", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Weight", font=dict(size=12))),
    title="<b>Plot after removal of outlier</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

As there is no nan value in this column let us now see the distribution of weight.

## Weight distribution

In [None]:
# Histogram of weight in a single colour. One entry in the colour sequence
# is enough; repeating it len(df) times only builds a large throwaway list.
fig = px.histogram(df, x="weight", color_discrete_sequence=["#ffd735"])

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Weight", font=dict(size=12))),
    yaxis=dict(title=dict(text="Weight Count", font=dict(size=12))),
    title="<b>Distribution of weight </b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

# Checking Again

In [None]:
# Re-check dtypes and non-null counts on the cleaned frame.
df.info()

In [None]:
# Re-check the summary statistics on the cleaned frame.
df.describe().T

# Histogram showing the variation of size with height and weight

In [None]:
# Histogram over height with weight on the y axis, split by size. When a
# y column is supplied, px.histogram aggregates it per bin (sum by default),
# so each bar is the total weight in that height bin rather than a row count.
fig = px.histogram(df, x="height", y="weight", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Height(cm)", font=dict(size=12))),
    yaxis=dict(title=dict(text="Weight(kg)", font=dict(size=12))),
    title="<b>Variation of size with height and weight</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

The histogram shows height, weight and the required clothing size together. From it we can see a separate curve for each size.

# Sizes
Let us see the most common size purchased by users.

In [None]:
# Row count per clothing size, one colour per size.
fig = px.histogram(df, x="size", color="size")

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Size", font=dict(size=12))),
    yaxis=dict(title=dict(text="Count", font=dict(size=12))),
    title="<b>Most popular size</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

From above we can see that the Medium size is the most popular.

# Scatter matrix to see the relation between features and labels

In [None]:
# Pairwise scatter plots of age, weight and height, coloured by size.
fig = px.scatter_matrix(
    df,
    dimensions=["age", "weight", "height"],
    color="size",
    labels=dict(age="Age", weight="Weight", height="Height"),
)

# Background, title text and title placement.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    title="<b>Scatter Matrix showing relation between the features</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

fig.show()

# Heat map

In [None]:
import plotly.figure_factory as ff

# Correlate the numeric columns only. `size` holds strings, and pandas 2.x
# raises on non-numeric columns in `corr()` instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()

# Annotated heat map of the correlation matrix; the same column labels are
# used on both axes.
fig = ff.create_annotated_heatmap(z=corr.to_numpy(),
                                  x=corr.columns.tolist(),
                                  y=corr.columns.tolist(),
                                  colorscale=px.colors.sequential.Purp,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

# Put the x-axis labels underneath the matrix.
fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='<b>Heatmap</b>',
    title_x=0.5,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

fig.show()

From the heat map above we can see there is hardly any correlation between the features.

# Label Prediction
## Encoding the label

Since our label are object we should encode them to numbers. This can be easily done with the help of label encoder in sklearn.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the size labels into a new "size_num" column. LabelEncoder
# numbers the classes in sorted order, so the codes do not follow the
# garment-size ordering.
labelencoder = LabelEncoder()
df["size_num"] = labelencoder.fit_transform(df["size"])

After label encoding we will also use the OneHotEncoder from sklearn. Label encoding may mislead the model into treating the label numbers as ordered, for example by assuming that label 3 is related to label 4 through 3 < 4. To remove this confusion, one-hot encoding creates a separate column for each label and marks the matching column with a 1.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One indicator column per size value.
enc = OneHotEncoder(handle_unknown='ignore')

# fit_transform returns a sparse matrix by default, hence the .toarray().
df_enc = pd.DataFrame(enc.fit_transform(df[['size']]).toarray())

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(enc, "get_feature_names_out"):
    df_enc.columns = enc.get_feature_names_out(['size'])
else:
    df_enc.columns = enc.get_feature_names(['size'])
print(df_enc.columns)
df_enc.head()

From the data frame df_enc we can see column is formed for each size.

We will use both types of encoded column and see how our model behaves with each of them.

### Convert the data into array

In [None]:
# Raw array of the whole frame, kept under its original name.
data = df.values

# Select the features by dropping the two label columns by name instead of
# slicing by position, so a reordered or extended frame cannot silently leak
# a label into X. Column order of the remaining features is unchanged.
X = df.drop(columns=["size", "size_num"]).to_numpy()

# Integer-coded target and its one-hot counterpart.
y_label = df["size_num"].to_numpy()
y_ohe = df_enc.values

In [None]:
# Cast the features to integers. NOTE(review): this discards the fractional
# part of non-integer values (e.g. heights such as 137.16) — confirm intended.
X = X.astype("int")
X.dtype

In [None]:
# Cast the label-encoded target to integers.
y_label = y_label.astype("int")
y_label.dtype

In [None]:
# Cast the one-hot target matrix to integers.
y_ohe = y_ohe.astype("int")
y_ohe.dtype

# Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Split 80/20. A fixed random_state makes the split reproducible, and using
# the same seed for both calls puts the same rows into train and test for the
# label-encoded and the one-hot targets, so the two models are compared on
# identical data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_label, test_size=0.2, random_state=1
)
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(
    X, y_ohe, test_size=0.2, random_state=1
)


In [None]:
# Report the array shapes of the label-encoded train/test split.
print(f'''The shape of label encoded : 
train data is:{X_train.shape}, {y_train.shape} 
test data is: {X_test.shape}, {y_test.shape}.''')

In [None]:
# Report the array shapes of the one-hot-encoded train/test split.
print(f'''The shape of OneHotEncoded:
train data is: {X_train_ohe.shape}, {y_train_ohe.shape}
test data is: {X_test_ohe.shape}, {y_test_ohe.shape}.''')

# XGBOOST

We will use XGBoost classification algorithm and see the result for both type of encoded data.

## For label encoded values

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Hyper-parameters for the multi-class model on the label-encoded target.
# `num_class` is left out: the scikit-learn wrapper derives the number of
# classes from the labels passed to `fit`.
pars_l = {
        'colsample_bytree': 0.8,
        'learning_rate': 0.08,
        'max_depth': 10,
        'subsample': 1,
        'objective':'multi:softprob',
        'eval_metric':'mlogloss',
        'min_child_weight':3,
        'gamma':0.25,
        'n_estimators':5
    }


# Create the XGBoost instance with the hyper-parameters above. The dictionary
# must be unpacked: passed as a single `pars=` keyword it is not recognised,
# and the model trains with library defaults instead.
xgb_estimator = xgb.XGBClassifier(**pars_l)

# fit the model
xgb_estimator.fit(X_train, y_train)

In [None]:
# evaluate on test data
y_pred = xgb_estimator.predict(X_test)
print('Accuracy on test data: {:.1f}%'.format(accuracy_score(y_test, y_pred)*100))

In [None]:
# The histogram of scores compared to true labels
fig = px.histogram(
    x=y_pred, color=y_test, 
    labels=dict(color='True Labels', x='Predicted Label')
)


fig.update_layout(
    plot_bgcolor = "#ECECEC",
    autosize = True,
    xaxis = dict(
        title_text = "Predicted Labels",
        titlefont = dict(size = 12)
    ),
    yaxis = dict(
        title_text = "Count",
        titlefont = dict(size = 12)
    ),
    title = "<b>Comaprison of accuracy using Label Encoded Values</b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20
)


fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")


fig.show()

## For OneHotEncoded Values

In [None]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Hyper-parameters for the per-column binary models. MultiOutputClassifier
# fits one binary classifier per one-hot column, so the multi-class settings
# (`num_class`, `mlogloss`) do not apply here; the binary `logloss` metric is
# used with the `binary:logistic` objective.
pars_ohe = {
        'colsample_bytree': 0.8,
        'learning_rate': 0.08,
        'max_depth': 10,
        'subsample': 1,
        'objective':'binary:logistic',
        'eval_metric':'logloss',
        'min_child_weight':3,
        'gamma':0.25,
        'n_estimators':5
    }


# Create the XGBoost instance with the hyper-parameters above. The dictionary
# must be unpacked: passed as a single `pars=` keyword it is not recognised,
# and the model trains with library defaults instead.
xgb_estimator = xgb.XGBClassifier(**pars_ohe)

# create MultiOutputClassifier instance with XGBoost model inside
multilabel_model = MultiOutputClassifier(xgb_estimator)

# fit the model
multilabel_model.fit(X_train_ohe, y_train_ohe)

In [None]:
# evaluate on test data
y_pred_ohe = multilabel_model.predict(X_test_ohe)
print('Accuracy on test data: {:.1f}%'.format(accuracy_score(y_test_ohe,y_pred_ohe)*100))

For our dataset the label-encoded model scores higher than the one-hot-encoded model.

# Outlier correction using sklearn

## Read the data

In [None]:
# Read the CSV again into a separate frame for this section.
df_out = pd.read_csv("/kaggle/input/clothessizeprediction/final_test.csv")

### Fill nan values

In [None]:
# Per-size mean of every other column, rounded to whole numbers.
df_null = df_out.groupby("size").mean().round()
df_null

In [None]:
def missing_value(df, size, value, mean_value):
    """
    Fill the missing entries of one column for one clothing size.

    `value` is the name of the column to fill. Every row whose "size"
    equals `size` and whose `value` column is null receives `mean_value`.
    The frame is changed in place and handed back to the caller.
    """
    column_is_null = df[value].isnull()
    df.loc[column_is_null & (df["size"] == size), value] = mean_value
    return df

In [None]:
# Fill the missing ages and heights size by size with the rounded per-size
# means held in `df_null`, rather than hand-copied numbers, so the fill
# values always match the data the means were computed from.
for column_name in ("age", "height"):
    for size_label, size_mean in df_null[column_name].items():
        missing_value(df_out, size_label, column_name, size_mean)

# Number of values still missing in each of the two columns after the fill.
df_out[["age", "height"]].isna().sum()

## Check the dataframe again

In [None]:
# Check dtypes and non-null counts after filling the missing values.
df_out.info()

## Label Encode data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the size labels into a new "size_num" column.
labelencoder = LabelEncoder()
df_out["size_num"] = labelencoder.fit_transform(df_out["size"])

In [None]:
# Preview the frame with the added "size_num" column.
df_out.head()

## Data

### Convert data into array

In [None]:
# Slice by position: every column except the last two becomes a feature and
# the last column ("size_num", appended above) becomes the target.
# assumes "size" is the second-to-last column — TODO confirm column order
df_out_value = df_out.values
X, y = df_out_value[:, :-2], df_out_value[:, -1]

### Check the astype. 
It should be int type.

In [None]:
# Cast features and target to integers. NOTE(review): this discards the
# fractional part of any non-integer feature value — confirm intended.
X = X.astype("int")
y = y.astype("int")

### Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=1)

### Check the shape

In [None]:
# Training-set shapes before outlier removal.
X_train.shape, y_train.shape

## IsolationForest Outlier detection

In [None]:
from sklearn.ensemble import IsolationForest

# Identify outliers in the training dataset. contamination=0.1 flags roughly
# 10% of the training rows; random_state fixes the forest so the same rows
# are flagged on every run (same seed as the train/test split above).
iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(X_train)

# fit_predict labels outliers as -1; keep every other row.
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

Around 9580 rows are removed from the training set (with `contamination=0.1`, roughly 10% of the training rows are flagged as outliers).

## XGBoost Classifier

In [None]:
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Candidate hyper-parameters. NOTE(review): this dictionary is not passed to
# the estimator below, which therefore trains with library defaults.
pars_l = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.08,
    'max_depth': 10,
    'subsample': 1,
    'objective': 'multi:softprob',
    'num_class': 7,
    'eval_metric': 'mlogloss',
    'min_child_weight': 3,
    'gamma': 0.25,
    'n_estimators': 5,
}

# XGBoost classifier with default hyper-parameters, trained on the rows that
# survived the outlier filter.
xgb_estimator = xgb.XGBClassifier()
xgb_estimator.fit(X_train, y_train)

In [None]:
# evaluate on test data
from sklearn.metrics import accuracy_score

# Predict on the untouched test split and report the share of exact matches.
y_pred = xgb_estimator.predict(X_test)
isolation_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy on test data: {:.1f}%'.format(isolation_accuracy*100))

There is a slight improvement in accuracy, but the model still does not give good accuracy.

In [None]:
# The histogram of scores compared to true labels
# Histogram of the predicted labels, coloured by the true label.
fig = px.histogram(
    x=y_pred, color=y_test,
    labels=dict(color='True Labels', x='Predicted Label')
)

# The axis title font is set through the nested `title.font` property;
# the older `titlefont` shortcut is deprecated and no longer accepted by
# recent Plotly releases.
fig.update_layout(
    plot_bgcolor="#ECECEC",
    autosize=True,
    xaxis=dict(title=dict(text="Predicted Labels", font=dict(size=12))),
    yaxis=dict(title=dict(text="Count", font=dict(size=12))),
    title="<b>Comaprison of accuracy using Isolation Forest</b>",
    title_font_size=16,
    title_font_color="black",
    title_pad_t=5,
    title_pad_l=20,
)

# Draw a solid black axis line on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor="black")

fig.show()

The results above show that the model does not yet perform well; more work, such as hyperparameter tuning, is needed to improve it.
If you find this work useful, please upvote.