In [None]:
import os

# Enumerate every file found under the Kaggle input directory and echo its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [None]:
df = pd.read_csv('../input/clothessizeprediction/final_test.csv')
df.shape

In [None]:
df.isna().sum()

# **Dropping NaN values**

In [None]:
df.dropna(how='any', inplace=True)
df.shape

In [None]:
df.isna().sum()

In [None]:
# Pairwise relationships between the columns of df, coloured by the `size` label.
raw_grid = sns.pairplot(df, hue='size', height=7)
plt.show()


In [None]:
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was removed in later releases, where plt.style.use('seaborn')
# raises. Pick whichever of the two names this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Bar chart of how many rows carry each size label.
sns.countplot(x=df['size'])
plt.show()

In [None]:
df.head()

In [None]:
df['size'].value_counts()

# **Outlier Removal Using Z-score**

In [None]:
# Standardise age/height/weight separately within each size class. The z-score
# frames are collected in `dfs`; the matching label sits at the same position in `sizes`.
feature_cols = ['age', 'height', 'weight']
dfs, sizes = [], []
for size_type in df['size'].unique():
    print('size type:', size_type)
    sizes.append(size_type)
    class_rows = df.loc[df['size'] == size_type, feature_cols]
    dfs.append((class_rows - class_rows.mean()) / class_rows.std())

# **Removing Outliers**

In [None]:
for i in range(len(dfs)):
    print(sizes[i])
    dfs[i]['age'] = dfs[i]['age'][(dfs[i]['age']>-3) & (dfs[i]['age']<3)]
    dfs[i]['height'] = dfs[i]['height'][(dfs[i]['height']>-3) & (dfs[i]['height']<3)]
    dfs[i]['weight'] = dfs[i]['weight'][(dfs[i]['weight']>-3) & (dfs[i]['weight']<3)]

In [None]:
for i in range(len(sizes)):
    dfs[i]['size'] = sizes[i]

In [None]:
new_df = pd.concat(dfs)

In [None]:
new_df.head()

# **No Outliers**

In [None]:
new_df['age'][new_df['age']<-3]

In [None]:
new_df['height'][new_df['height']<-3]

In [None]:
new_df['weight'][new_df['weight']<-3]

In [None]:
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was removed in later releases, where plt.style.use('seaborn')
# raises. Pick whichever of the two names this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Rows per size label in new_df.
sns.countplot(x=new_df['size'])
plt.show()

In [None]:
sns.pairplot(data=new_df, hue='size', height=7)
plt.show()

In [None]:
new_df['size'].value_counts()

# **Removing the XXL size because it has too few samples**

In [None]:
new_df2 = new_df[new_df['size'] != 'XXL'].copy()

In [None]:
new_df2.dropna(how='any', inplace=True)

In [None]:
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was removed in later releases, where plt.style.use('seaborn')
# raises. Pick whichever of the two names this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Rows per size label in new_df2.
sns.countplot(x=new_df2['size'])
plt.show()

In [None]:
sns.pairplot(data=new_df2, hue='size', height=7)
plt.show()

In [None]:
new_df2['size'].unique()

In [None]:
size_code = {
    'XL':0,
    'L':1,
    'M':2,
    'S':3,
    'XXS':4,
    'XXXL':5
}
new_df2['size'].replace(size_code, inplace=True)

In [None]:
new_df2['size'].value_counts()

# **Splitting and training**

In [None]:
x, y = new_df2.drop('size', axis=1), new_df2['size']

In [None]:
x.shape,y.shape

In [None]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the split
# (and every score computed from it) repeatable across kernel restarts;
# stratifying on y keeps the class proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
x_train.shape, x_test.shape

In [None]:
y_train.shape, y_test.shape

In [None]:
sns.countplot(x=y_train)
plt.show()

In [None]:
sns.countplot(x=y_test)
plt.show()

# **KNN Model**

In [None]:
# k-nearest-neighbours classifier: 7 neighbours, Manhattan metric, distance-weighted votes.
knn_settings = dict(n_neighbors=7, metric='manhattan', weights='distance')
KNN_model2 = KNeighborsClassifier(**knn_settings)
KNN_model2.fit(x_train, y_train)

In [None]:
KNN_model2.score(x_test, y_test)

In [None]:
KNN_model2.score(x_train, y_train)

In [None]:
y_pred = KNN_model2.predict(x_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
y_pred_train = KNN_model2.predict(x_train)

In [None]:
print(classification_report(y_train, y_pred_train))

# **94% accuracy**

# **Saving model as file.**

In [None]:
from joblib import dump
dump(KNN_model2, 'Cloth-size-predictor')

In [None]:
!ls