In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

About Features

- weight (kg)
- age
- height (cm)
- size : Target to predict

In [None]:
# Load the clothes-size CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../input/clothessizeprediction/final_test.csv')

In [None]:
# (rows, columns) of the data as loaded, before any filtering.
df.shape

In [None]:
# Preview the first rows to see the columns and typical values.
df.head()

In [None]:
# Summary statistics, used below to spot implausible minimum values.
df.describe()

The minimum values of 'weight' and 'height' are 22 and 137, while that of 'age' is **0**.

According to the Normal growth table from [Children's Wisconsin](https://childrenswi.org/medical-care/adolescent-health-and-medicine/issues-and-concerns/adolescent-growth-and-development/normal-growth), I'll drop the rows with 'age' under 8.

In [None]:
# Keep only rows with age >= 8. Rows whose age is missing are removed as well,
# because a comparison against NaN evaluates to False.
# .copy() makes the result an independent frame, so later modifications of
# `df` never act on a slice of the originally loaded data (the situation that
# triggers pandas' SettingWithCopyWarning).
df = df[df['age'] >= 8].copy()

In [None]:
# Re-check the summary statistics after the age filter.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

There are some null values in the 'height' column. I'll drop the rows that contain null values.

In [None]:
# Drop every row that still contains a missing value and renumber the index
# from 0 (the old index is discarded). Reassigning instead of using
# inplace=True avoids mutating a filtered frame in place.
df = df.dropna().reset_index(drop=True)

In [None]:
# Missing values per column after the rows with nulls were dropped.
df.isna().sum()

In [None]:
# (rows, columns) remaining after the age filter and the null drop.
df.shape

# EDA

In [None]:
# Shared look for every figure below: one background colour and one
# categorical palette.
background_color = '#F8EDF4'
color_palette = [
    '#F78904',
    '#00C73C',
    '#D2125E',
    '#693AF9',
    '#B20600',
    '#007CDE',
    '#994936',
    '#886A00',
    '#39BBC2',
]

## Countplot of Target (Size)

In [None]:
# Two-panel figure: a text-only title panel on the left and the countplot of
# the target ('size') on the right.
fig = plt.figure(figsize=(15, 6))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 2)
gs.update(hspace=0.2, wspace=0.1)

axes = [fig.add_subplot(gs[0, col]) for col in range(2)]
ax0, ax1 = axes

# Title panel: centred text only, so tick labels, tick marks and the bottom
# spine are hidden.
ax0.text(0.5, 0.5, 'Countplot of Size\n_______________',
         ha='center', va='center',
         fontsize=22, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# Countplot, bars ordered from the most to the least frequent size.
sizes_by_frequency = df['size'].value_counts().index
sns.countplot(x='size', data=df, fill=True, ax=ax1,
              palette=color_palette, order=sizes_by_frequency)
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
ax1.set_xlabel('')
ax1.set_ylabel('')

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Number of rows for each size label.
df['size'].value_counts()

## Distribution of Weight

In [None]:
# Weight overview in three rows. Each row pairs a text-only title panel (left)
# with its plot (right): boxplot, overall KDE, and KDE split by size.
fig = plt.figure(figsize=(18, 16))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 2)
gs.update(hspace=0.35, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(2)]
ax0, ax1, ax2, ax3, ax4, ax5 = axes
title_axes = [ax0, ax2, ax4]
plot_axes = [ax1, ax3, ax5]

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette makes the size-to-colour
# assignment explicit.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# Title panels: centred text only, so ticks and the bottom spine are hidden.
titles = [
    'Boxplot of Weight\n____________________',
    'Distribution of Weight\n____________________',
    'Distribution of Weight by Size\n____________________',
]
for ax, title in zip(title_axes, titles):
    ax.text(0.5, 0.5, title, ha='center', va='center',
            fontsize=18, fontweight='bold', fontfamily='serif')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.spines['bottom'].set_visible(False)

# Plots
sns.boxplot(x='weight', data=df, ax=ax1, color=color_palette[0])
sns.kdeplot(x='weight', data=df, fill=True, ax=ax3, color=color_palette[0])
sns.kdeplot(x='weight', hue='size', hue_order=size_order, data=df, fill=True,
            ax=ax5, palette=size_colors)

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

# The boxplot is horizontal, so its grid lines follow the x axis; the KDE
# panels get grid lines on the y axis.
for ax in plot_axes:
    ax.set_xlabel('')
    ax.set_ylabel('')
    grid_axis = 'x' if ax is ax1 else 'y'
    ax.grid(color='#000000', linestyle=':', axis=grid_axis, zorder=0, dashes=(1, 5))

Contrary to what I expected, plotting all 7 distributions together wasn't a good idea: they are hard to tell apart.

So I'll split the whole data into two parts:
- df_L for 'L', 'XL', 'XXL', 'XXXL'
- df_S for 'XXS', 'S', 'M'

In [None]:
# Split the rows into the larger sizes (df_L) and the smaller sizes (df_S).
large_sizes = ['L', 'XL', 'XXL', 'XXXL']
small_sizes = ['XXS', 'S', 'M']

df_L = df.loc[df['size'].isin(large_sizes)]
df_S = df.loc[df['size'].isin(small_sizes)]

In [None]:
# Per-size weight densities, drawn in two groups so the curves stay readable.
# Each row pairs a text-only title panel (left) with its KDE plot (right).
fig = plt.figure(figsize=(18, 11))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(2, 2)
gs.update(hspace=0.2, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(2) for col in range(2)]
ax0, ax1, ax2, ax3 = axes

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette keeps a size's colour
# independent of which subset is being plotted.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# (title panel, plot panel, data subset, sizes in that subset, title text)
rows = [
    (ax0, ax1, df_L, size_order[3:],
     'Distribution of Weight by Size 1\nL, XL, XXL, XXXL\n____________________'),
    (ax2, ax3, df_S, size_order[:3],
     'Distribution of Weight by Size 2\nXXS, S, M\n____________________'),
]
for title_ax, plot_ax, subset, sizes, title in rows:
    # Title panel: centred text only, so ticks and the bottom spine are hidden.
    title_ax.text(0.5, 0.5, title, ha='center', va='center',
                  fontsize=18, fontweight='bold', fontfamily='serif')
    title_ax.set_xticklabels([])
    title_ax.set_yticklabels([])
    title_ax.tick_params(left=False, bottom=False)
    title_ax.spines['bottom'].set_visible(False)

    # KDE of the feature, one curve per size in this group.
    sns.kdeplot(x='weight', hue='size', hue_order=sizes, data=subset, fill=True,
                ax=plot_ax, palette=size_colors)
    plot_ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    plot_ax.set_xlabel('')
    plot_ax.set_ylabel('')

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

## Distribution of Age

In [None]:
# Age overview in three rows. Each row pairs a text-only title panel (left)
# with its plot (right): boxplot, overall KDE, and KDE split by size.
fig = plt.figure(figsize=(18, 16))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 2)
gs.update(hspace=0.35, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(2)]
ax0, ax1, ax2, ax3, ax4, ax5 = axes
title_axes = [ax0, ax2, ax4]
plot_axes = [ax1, ax3, ax5]

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette makes the size-to-colour
# assignment explicit.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# Title panels: centred text only, so ticks and the bottom spine are hidden.
titles = [
    'Boxplot of Age\n____________________',
    'Distribution of Age\n____________________',
    'Distribution of Age by Size\n____________________',
]
for ax, title in zip(title_axes, titles):
    ax.text(0.5, 0.5, title, ha='center', va='center',
            fontsize=18, fontweight='bold', fontfamily='serif')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.spines['bottom'].set_visible(False)

# Plots
sns.boxplot(x='age', data=df, ax=ax1, color=color_palette[1])
sns.kdeplot(x='age', data=df, fill=True, ax=ax3, color=color_palette[1])
sns.kdeplot(x='age', hue='size', hue_order=size_order, data=df, fill=True,
            ax=ax5, palette=size_colors)

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

# The boxplot is horizontal, so its grid lines follow the x axis; the KDE
# panels get grid lines on the y axis.
for ax in plot_axes:
    ax.set_xlabel('')
    ax.set_ylabel('')
    grid_axis = 'x' if ax is ax1 else 'y'
    ax.grid(color='#000000', linestyle=':', axis=grid_axis, zorder=0, dashes=(1, 5))

In [None]:
# Per-size age densities, drawn in two groups so the curves stay readable.
# Each row pairs a text-only title panel (left) with its KDE plot (right).
fig = plt.figure(figsize=(18, 11))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(2, 2)
gs.update(hspace=0.2, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(2) for col in range(2)]
ax0, ax1, ax2, ax3 = axes

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette keeps a size's colour
# independent of which subset is being plotted.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# (title panel, plot panel, data subset, sizes in that subset, title text)
rows = [
    (ax0, ax1, df_L, size_order[3:],
     'Distribution of Age by Size 1\nL, XL, XXL, XXXL\n____________________'),
    (ax2, ax3, df_S, size_order[:3],
     'Distribution of Age by Size 2\nXXS, S, M\n____________________'),
]
for title_ax, plot_ax, subset, sizes, title in rows:
    # Title panel: centred text only, so ticks and the bottom spine are hidden.
    title_ax.text(0.5, 0.5, title, ha='center', va='center',
                  fontsize=18, fontweight='bold', fontfamily='serif')
    title_ax.set_xticklabels([])
    title_ax.set_yticklabels([])
    title_ax.tick_params(left=False, bottom=False)
    title_ax.spines['bottom'].set_visible(False)

    # KDE of the feature, one curve per size in this group.
    sns.kdeplot(x='age', hue='size', hue_order=sizes, data=subset, fill=True,
                ax=plot_ax, palette=size_colors)
    plot_ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    plot_ax.set_xlabel('')
    plot_ax.set_ylabel('')

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

## Distribution of Height

In [None]:
# Height overview in three rows. Each row pairs a text-only title panel (left)
# with its plot (right): boxplot, overall KDE, and KDE split by size.
fig = plt.figure(figsize=(18, 16))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 2)
gs.update(hspace=0.35, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(2)]
ax0, ax1, ax2, ax3, ax4, ax5 = axes
title_axes = [ax0, ax2, ax4]
plot_axes = [ax1, ax3, ax5]

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette makes the size-to-colour
# assignment explicit.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# Title panels: centred text only, so ticks and the bottom spine are hidden.
titles = [
    'Boxplot of Height\n____________________',
    'Distribution of Height\n____________________',
    'Distribution of Height by Size\n____________________',
]
for ax, title in zip(title_axes, titles):
    ax.text(0.5, 0.5, title, ha='center', va='center',
            fontsize=18, fontweight='bold', fontfamily='serif')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.spines['bottom'].set_visible(False)

# Plots
sns.boxplot(x='height', data=df, ax=ax1, color=color_palette[2])
sns.kdeplot(x='height', data=df, fill=True, ax=ax3, color=color_palette[2])
sns.kdeplot(x='height', hue='size', hue_order=size_order, data=df, fill=True,
            ax=ax5, palette=size_colors)

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

# The boxplot is horizontal, so its grid lines follow the x axis; the KDE
# panels get grid lines on the y axis.
for ax in plot_axes:
    ax.set_xlabel('')
    ax.set_ylabel('')
    grid_axis = 'x' if ax is ax1 else 'y'
    ax.grid(color='#000000', linestyle=':', axis=grid_axis, zorder=0, dashes=(1, 5))

In [None]:
# Per-size height densities, drawn in two groups so the curves stay readable.
# Each row pairs a text-only title panel (left) with its KDE plot (right).
fig = plt.figure(figsize=(18, 11))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(2, 2)
gs.update(hspace=0.2, wspace=0.15)

axes = [fig.add_subplot(gs[row, col]) for row in range(2) for col in range(2)]
ax0, ax1, ax2, ax3 = axes

# Sizes from smallest to largest, each tied to one fixed palette colour.
# hue_order pins the legend order (seaborn otherwise orders string levels by
# first appearance in the data), and the dict palette keeps a size's colour
# independent of which subset is being plotted.
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
size_colors = dict(zip(size_order, color_palette))

# (title panel, plot panel, data subset, sizes in that subset, title text)
rows = [
    (ax0, ax1, df_L, size_order[3:],
     'Distribution of Height by Size 1\nL, XL, XXL, XXXL\n____________________'),
    (ax2, ax3, df_S, size_order[:3],
     'Distribution of Height by Size 2\nXXS, S, M\n____________________'),
]
for title_ax, plot_ax, subset, sizes, title in rows:
    # Title panel: centred text only, so ticks and the bottom spine are hidden.
    title_ax.text(0.5, 0.5, title, ha='center', va='center',
                  fontsize=18, fontweight='bold', fontfamily='serif')
    title_ax.set_xticklabels([])
    title_ax.set_yticklabels([])
    title_ax.tick_params(left=False, bottom=False)
    title_ax.spines['bottom'].set_visible(False)

    # KDE of the feature, one curve per size in this group.
    sns.kdeplot(x='height', hue='size', hue_order=sizes, data=subset, fill=True,
                ax=plot_ax, palette=size_colors)
    plot_ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    plot_ax.set_xlabel('')
    plot_ax.set_ylabel('')

# Shared styling: background colour, and no top/right/left spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

## Correlation Matrix

In [None]:
# Label Encoding
# Sizes are mapped to 1..7 from smallest to largest so the numeric codes keep
# the ordinal size order.
size_codes = {'XXS': 1, 'S': 2, 'M': 3,
              'L': 4, 'XL': 5, 'XXL': 6, 'XXXL': 7}

# Skip the mapping when 'size' is already numeric, so re-running this cell
# does not turn the existing codes into NaN.
if not pd.api.types.is_numeric_dtype(df['size']):
    encoded = df['size'].map(size_codes)
    # Fail loudly on labels that are not in size_codes instead of silently
    # producing NaN targets for the models below.
    unmapped = df.loc[encoded.isna(), 'size'].unique()
    if len(unmapped) > 0:
        raise ValueError('Unexpected size labels: {}'.format(list(unmapped)))
    df['size'] = encoded.astype(int)

Instead of using sklearn's LabelEncoder, I mapped the sizes manually so that the codes follow the order of the sizes. (LabelEncoder would assign the codes in alphabetical order of the labels.)

In [None]:
# Pairwise correlations between the columns ('size' holds numeric codes here).
df.corr()

In [None]:
# Heatmap of the correlation matrix, showing only the cells below the diagonal.
corr_matrix = df.corr()

# Non-zero mask entries are hidden by seaborn: np.triu keeps the diagonal and
# everything above it, so those cells are blanked out.
mask = np.triu(np.ones_like(corr_matrix))

f, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.text(1.2, -0.1, 'Correlation Matrix',
        fontsize=18, fontweight='bold', fontfamily='serif')
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu',
            square=True, mask=mask, linewidth=0.7, ax=ax)

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Parallel lists filled by the model cells below: model name and its accuracy.
models, scores = [], []

In [None]:
# Features are every column except the target 'size'.
X = df.drop('size', axis=1)
y = df['size']

# Split first, then fit the scaler on the training rows only. Fitting it on
# the full data lets the test rows influence the mean/std used for scaling
# (data leakage), which makes the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# The test rows are only transformed, using the statistics learned from train.
X_test = scaler.transform(X_test)

In [None]:
# Logistic regression baseline. The 'saga' solver shuffles the data, so
# random_state is fixed (same seed as the train/test split) to make the score
# reproducible across runs.
model = LogisticRegression(solver='saga', random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score_logreg = accuracy_score(y_test, y_pred)

print('Accuracy Score of Logistic Regression :', score_logreg)

# Record the result for the final comparison table.
models.append('Logistic Regression')
scores.append(score_logreg)

In [None]:
# Random forest with trees limited to depth 10. The forest is built from
# random bootstrap samples and feature subsets, so random_state is fixed
# (same seed as the train/test split) to make the score reproducible.
model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score_rf = accuracy_score(y_test, y_pred)

print('Accuracy Score of Random Forest Classifier :', score_rf)

# Record the result for the final comparison table.
models.append('Random Forest')
scores.append(score_rf)

In [None]:
# Single decision tree with default settings. scikit-learn permutes the
# features at each split, so random_state is fixed (same seed as the
# train/test split) to make the score reproducible.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score_dt = accuracy_score(y_test, y_pred)

print('Accuracy Score of DecisionTree Classifier :', score_dt)

# Record the result for the final comparison table.
models.append('Decision Tree')
scores.append(score_dt)

In [None]:
# Find n_neighbors for best score
# The value is chosen on a validation split carved out of the training data.
# Picking the k with the best *test* accuracy and then reporting that same
# accuracy would give an optimistically biased score.
k_values = range(1, 11)
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0)

accuracy = []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_fit, y_fit)
    accuracy.append(accuracy_score(y_val, model.predict(X_val)))

# First k that reaches the highest validation accuracy.
best_k = k_values[accuracy.index(max(accuracy))]

# Refit with the chosen k on the full training set and score once on the
# held-out test set.
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score_knn = accuracy_score(y_test, y_pred)
print('Accuracy Score of K-Nearest Neighbors Classifier :', score_knn)

# Validation accuracy for each candidate k.
plt.figure(figsize=(7, 4))
plt.plot(k_values, accuracy, linestyle='dashed', marker='o', color='blue',
         markersize=7, markerfacecolor='red')
plt.xlabel('n_neighbors')
plt.ylabel('Validation accuracy')
plt.show()

# Record the result for the final comparison table.
models.append('K-Nearest Neighbors')
scores.append(score_knn)

# Result

In [None]:
# Collect the recorded scores into one table, best model first.
df_result = (
    pd.DataFrame({'Model': models, 'Score': scores})
    .sort_values(by='Score', ascending=False)
)
df_result

In [None]:
# Two-panel summary: a text-only title panel on the left and a horizontal bar
# chart of the model scores on the right.
fig = plt.figure(figsize=(15, 4))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.2)

axes = [fig.add_subplot(gs[0, col]) for col in range(2)]
ax0, ax1 = axes

# Title panel: centred text only, so tick labels, tick marks and the left
# spine are hidden.
ax0.text(0.5, 0.5, 'Score of Models\n ___________',
         ha='center', va='center',
         fontsize=18, fontfamily='serif', fontweight='bold')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['left'].set_visible(False)

# Score chart: one bar per model, in the row order of df_result.
ax1.grid(color='#000000', linestyle=':', axis='x', zorder=0, dashes=(1, 5))
sns.barplot(x='Score', y='Model', data=df_result, palette=color_palette, ax=ax1)
ax1.set_xlabel('')
ax1.set_ylabel('')

# Shared styling: background colour, and no top/right/bottom spines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'bottom'):
        ax.spines[side].set_visible(False)

### Thank you!
### Please Upvote if you like my notebook 👍