In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor


# Loading Dataset

In [None]:
# Read the raw CSV from the ../input directory into df_raw and preview its first rows.
df_raw = pd.read_csv("../input/clothessizeprediction/final_test.csv")
df_raw.head()

# Understanding and Visualizing Data

In [None]:
# Summary statistics of the raw frame.
df_raw.describe()

In [None]:
# Number of missing values in each column.
df_raw.isna().sum()

In [None]:
# Fill the gaps in the raw frame: age with its median, height with its mean.
age_median = df_raw['age'].median()
height_mean = df_raw['height'].mean()
df_raw['age'] = df_raw['age'].fillna(age_median)
df_raw['height'] = df_raw['height'].fillna(height_mean)

In [None]:
# Number of rows for each size label.
df_raw['size'].value_counts()

In [None]:
# Histogram of age; the trailing semicolon suppresses the textual return value.
plt.hist(df_raw["age"], color="b", alpha=0.5, rwidth=.9);

In [None]:
# Histogram of height; the trailing semicolon suppresses the textual return value.
plt.hist(df_raw["height"], color="g", alpha=0.5,rwidth=.9);

In [None]:
# `size` is categorical, so draw exactly one bar per label from its counts.
# A histogram with the default 10 equal-width bins over 7 category positions
# leaves the bars misaligned with their tick labels.
size_counts = df_raw['size'].value_counts()
# Trailing semicolon suppresses the textual return value, like the other plot cells.
plt.bar(size_counts.index.astype(str), size_counts.values, color='r', alpha=0.4, width=.9);

# Outlier Removal

In [None]:
# Standardize age/height/weight separately inside every size group.
# `sizes[k]` is the label whose z-scored rows are stored in `dfs[k]`.
sizes = list(df_raw['size'].unique())
dfs = []
for label in sizes:
    group = df_raw.loc[df_raw['size'] == label, ['age', 'height', 'weight']]
    # z-score relative to this group's own mean and standard deviation
    dfs.append((group - group.mean()) / group.std())

In [None]:
# Keep only z-scores strictly between -3 and 3; anything else becomes NaN.
# Rows are not dropped here, only the offending cells are blanked.
for frame in dfs:
    for column in ('age', 'height', 'weight'):
        values = frame[column]
        frame[column] = values.where((values > -3) & (values < 3))

In [None]:
# Re-attach the size label to each standardized group, then stack the groups
# into a single frame and preview it.
for frame, label in zip(dfs, sizes):
    frame['size'] = label
df = pd.concat(dfs)
df.head()

In [None]:
# Ordinal encoding of the size labels.
size_codes = {'XXS': 1, 'S': 2, "M" : 3, "L" : 4, "XL" : 5, "XXL" : 6, "XXXL" : 7}

# Skip the work when the column already holds only encoded values, so that
# re-running this cell does not map the integers to NaN.
if not df['size'].isin(list(size_codes.values())).all():
    encoded_size = df['size'].map(size_codes)
    # Series.map yields NaN for anything missing from the dict; fail loudly
    # rather than pass NaN targets on to the models.
    unknown_labels = df.loc[encoded_size.isna(), 'size'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            f"Cannot encode size values {list(unknown_labels)}; "
            f"expected one of {list(size_codes)}"
        )
    df['size'] = encoded_size

In [None]:
# Summary statistics of the standardized, size-encoded frame.
df.describe()

In [None]:
# Fill the cells blanked by the outlier mask: mean for height and weight,
# median for age.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form acts on a temporary Series, which triggers a
# warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df["height"] = df["height"].fillna(df["height"].mean())
df["age"] = df["age"].fillna(df["age"].median())
df["weight"] = df["weight"].fillna(df["weight"].mean())

# Feature Engineering

In [None]:
# Two derived columns built from the columns already in df.
# NOTE(review): "bmi" is height divided by weight, and both columns hold
# per-size z-scores at this point, so this is not the conventional
# weight / height**2 body-mass index. Confirm the intended feature.
height_col = df["height"]
weight_col = df["weight"]
df["bmi"] = height_col.div(weight_col)
df["weight_sq"] = weight_col.mul(weight_col)

In [None]:
# Pairwise correlations of every column in df, drawn as an annotated heatmap.
corr_matrix = df.corr()
ax = sns.heatmap(corr_matrix, annot=True)

In [None]:
# Optional feature ablation (disabled): uncomment a line to drop that column
# from df before the train/test split.
#df.drop("age", axis=1, inplace=True)
#df.drop("height", axis=1, inplace=True)
#df.drop("bmi", axis=1, inplace=True)
#df.drop("weight_sq", axis=1, inplace=True)

# Splitting into training and test set

In [None]:
# Hold out a reproducible test set.
# TEST_FRACTION: a 0.001 hold-out keeps only about one row per thousand, so
# every score computed on it is dominated by sampling noise.
# SPLIT_SEED: without a fixed seed each run draws a different split and the
# reported scores cannot be reproduced.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

features = df.drop('size', axis=1)
target = df['size']
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [None]:
# Optional standardization (disabled): when enabled, the scaler is fitted on
# X_train only and the same transform is then applied to X_test.
# transfer = StandardScaler()
# X_train = transfer.fit_transform(X_train)
# X_test = transfer.transform(X_test)

# Training Model

In [None]:
# Fit a linear regression on the ordinal size codes.
lr = LinearRegression()
lr.fit(X_train,Y_train)
# lr.score() would return R^2, which is not comparable with the accuracy
# values of the other models. Instead, turn the continuous prediction into a
# size code (round to nearest, clamp to the codes seen in training) and
# measure accuracy like the other models.
lr_pred = np.clip(np.rint(lr.predict(X_test)), Y_train.min(), Y_train.max()).astype(int)
lr_score = accuracy_score(Y_test, lr_pred)

In [None]:
# Fit a gradient-boosted regressor on the ordinal size codes.
xgb = XGBRegressor()
xgb.fit(X_train, Y_train)
y_pred = xgb.predict(X_test)
# Convert the continuous outputs to size codes: round to the nearest integer,
# then clamp to the codes seen in training so that a prediction such as 7.6
# cannot become a code that does not exist.
predictions = np.clip(np.rint(y_pred), Y_train.min(), Y_train.max()).astype(int)
xgb_score = accuracy_score(Y_test, predictions)

In [None]:
# 7-nearest-neighbours classifier using Manhattan distance, with neighbours
# weighted by distance; fit() returns the estimator itself.
knn = KNeighborsClassifier(
    weights='distance',
    metric='manhattan',
    n_neighbors=7,
).fit(X_train, Y_train)
knn_score = knn.score(X_test, Y_test)

In [None]:
# Decision-tree classifier on the size codes.
# A fixed random_state makes the fitted tree, and therefore dt_score,
# repeatable across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,Y_train)
dt_score = dt.score(X_test,Y_test)

In [None]:
# Pair each model name with its score, then split the pairs into the two
# parallel lists (`labels`, `scores`) and print the scores.
labels, scores = map(list, zip(
    ("LinearRegression", lr_score),
    ("XGB", xgb_score),
    ("KNN", knn_score),
    ("DecisionTree", dt_score),
))
print(scores)

In [None]:
# Bar chart with one bar per model.
# x and y are passed by keyword: seaborn only accepts `data` positionally
# from version 0.12 on, so the positional form raises a TypeError there.
sns.barplot(x=labels, y=scores, orient="v");
plt.ylabel("Accuracy")
plt.xticks(rotation=40)
plt.title("Model");