In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Read the raw dataset from the CSV at the relative path below into `df`.
df = pd.read_csv("../input/clothessizeprediction/final_test.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

# Check Missing Values

In [None]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

# Data Description

In [None]:
# Summary statistics from DataFrame.describe(), shown as the cell output.
df.describe()

# Data Structures

In [None]:
# Print the frame's structure: columns, non-null counts and dtypes.
df.info()

# Handling Missing Value

In this case, we fill the missing values in `age` and `height` with each column's `median`.

In [None]:
# Impute missing values in each listed column with that column's own median.
for column in ('age', 'height'):
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Re-count missing values per column after the median fill above.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle('Predictor')

# One histogram per predictor, drawn left to right on its own axis and
# titled with the column name.
for ax, column in zip(axes, ['weight', 'age', 'height']):
    sns.histplot(df[column], ax=ax)
    ax.set_title(column)

Check for outliers using boxplots.

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle('Predictor')

# One boxplot per predictor, grouped by clothes size, each on its own axis
# and titled with the column name.
for ax, column in zip(axes, ['weight', 'age', 'height']):
    sns.boxplot(x='size', y=column, data=df, ax=ax)
    ax.set_title(column)

There are many outliers in every predictor, so we need to remove them.

# Distribution of the Target

In [None]:
# Number of rows for each distinct value of `size`.
df['size'].value_counts()

In [None]:
# Bar chart of the row count for each `size` value.
sns.countplot(x = 'size', data = df)

# Remove Outlier

In [None]:
# Remove outliers: drop every row in which any of age / height / weight has
# an absolute z-score above 3.

from scipy import stats

z = np.abs(stats.zscore(df[['age','height','weight']]))

# Filter with a boolean row mask rather than df.drop(np.where(...)[0]):
# np.where yields row *positions*, while df.drop interprets its argument as
# index *labels*, so the two only agree on a default RangeIndex.
# np.asarray makes this work whether zscore returns an ndarray or a DataFrame.
is_outlier = (np.asarray(z) > 3).any(axis=1)

# reset_index(drop=True) renumbers the rows without keeping the old index
# as an extra column.
df = df.loc[~is_outlier].reset_index(drop=True)
df

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle('Predictor')

# Redraw the per-size boxplots for each predictor on the current `df`,
# one axis per column, titled with the column name.
for ax, column in zip(axes, ['weight', 'age', 'height']):
    sns.boxplot(x='size', y=column, data=df, ax=ax)
    ax.set_title(column)

Distribution of the target after outlier removal

In [None]:
# Number of rows for each distinct value of `size` in the current `df`.
df['size'].value_counts()

In [None]:
# Bar chart of the row count for each `size` value in the current `df`.
sns.countplot(x = 'size', data = df)

# Feature Engineering

In [None]:
# Body-mass index is weight divided by squared height (kg / m^2). The
# previous expression, height / weight, is not BMI despite the column name.
# NOTE(review): assumes `weight` is in kilograms and `height` in centimetres
# (hence the / 100) — confirm against the dataset description.
df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
df["weight-squared"] = df["weight"] ** 2

# Keep only the model columns, with the target `size` last because later
# cells split features from target by position. `.copy()` gives an
# independent frame so later column assignments do not warn about writing
# to a slice.
df = df[['weight', 'age', 'height', 'bmi', 'weight-squared', 'size']].copy()
df

# Encode the Target

In [None]:
# Ordinal-encode the clothes size, from smallest to largest.
# Codes start at 0 (not 1): recent xgboost releases require class labels to
# be exactly 0..n_classes-1 and raise "Invalid classes inferred from unique
# values of `y`" otherwise. The other classifiers accept any label set.
# Labels missing from the mapping become NaN.
df['size'] = df['size'].map({"XXS": 0,
                             "S": 1,
                             "M" : 2,
                             "L" : 3,
                             "XL" : 4,
                             "XXL" : 5,
                             "XXXL" : 6})
df.head()

# Check Correlation Plot

In [None]:
# Heatmap of the pairwise column correlations from df.corr(); annot=True
# writes each coefficient inside its cell.
sns.heatmap(df.corr(), annot=True)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratify on the target so both
# splits keep the same class proportions; without it the rarer sizes can be
# under-represented in the test set.
train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['size'])

In [None]:
# Class counts of the target in the training split.
train['size'].value_counts()

In [None]:
# Class counts of the target in the test split.
test['size'].value_counts()

In [None]:
# Split each frame by position: the last column is the target, everything
# before it is a feature. Both are converted to NumPy arrays.
X_train, y_train = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()
X_test, y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()

# Balancing Data

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split only, so the test set stays untouched.
# A fixed random_state makes the synthetic samples, and every score computed
# from them, reproducible across runs.
oversample = SMOTE(random_state=0)
X_train,y_train = oversample.fit_resample(X_train,y_train)

In [None]:
from collections import Counter
# Tally how many training samples carry each label in the current y_train.
Counter(y_train)

# Standardizing

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the transformed training feature array.
X_train

# Modeling

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

In [None]:
# Candidate classifiers keyed by display name.
# Every estimator with a stochastic component gets random_state=0 (the seed
# already used for SVC and the train/test split) so accuracies are
# reproducible across runs.
models = {
    'LogisticRegression': LogisticRegression(solver='newton-cg'),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(C=.5, gamma=0.1, kernel='rbf', random_state=0),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=0),
    'XGBClassifier': xgb.XGBClassifier(random_state=0),
}

# `key` and `value` are kept because later cells refer to them (the final
# bar plot uses `key`); dicts preserve insertion order, so both lists line
# up with `models`.
key = list(models)
value = list(models.values())
print(models)

In [None]:
# Fit each candidate on the training data and record its accuracy on the
# test split. `predicted` collects the accuracies in the iteration order of
# `models`.
predicted = []
for name, model in models.items():
    # fit() returns the fitted estimator, so predict() can be chained.
    predict = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, predict)
    print(name, acc)
    predicted.append(acc)

In [None]:
# Horizontal bar chart of test accuracy per model. Uses the explicit axes
# interface and adds a title and axis labels so the figure can be read on
# its own.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=predicted, y=key, ax=ax)
ax.set(title='Test accuracy by model', xlabel='Accuracy', ylabel='Model');