# Clothes-Size-Prediction
## 1. Problem Definition

Context

Predicting the right size is the best way to avoid mistakes when buying clothes from online shops. You can check your predictions using this dataset.

Context

Predicting the right size is the best way to avoid mistakes when buying clothes from online shops. You can check your predictions using this dataset.

Inspiration


The most important motivation behind this was to get the size predicted to avoid any problems while buying clothes.
Content

##  2. Data

Data available on Kaggle.  https://www.kaggle.com/tourist55/clothessizeprediction


## 3. Evaluation
> If we can reach 80% (?) accuracy at predicting the size of a person during the proof of concept, we'll pursue the project.

## 4. Features

The dataset consists of 4 columns.
The input parameters are Weight, Age, and Height; the output parameter is Size.

* Weight (in kgs)
* Age
* Height (in cm)
* Size (Target)


# Preparing the tools


In [None]:

# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# we want our plots to appear inside the notebook
%matplotlib inline 

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix


plt.style.use('ggplot')

# Load data

In [None]:
# Read the clothes-size CSV into a DataFrame and preview its first rows
data_path = '../input/clothessizeprediction/final_test.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Count the missing (NaN) entries in every column
df.isna().sum()

### replace NaN with mean

In [None]:
# Replace missing 'age' and 'height' values with the mean of their column.
# The result is assigned back to df rather than calling
# fillna(inplace=True) on the df[...] selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and no longer updates df once
# Copy-on-Write is enabled.
for column in ('age', 'height'):
    df[column] = df[column].fillna(df[column].mean())

# Re-count the missing values per column to confirm the imputation
df.isna().sum()

In [None]:
# Number of samples for each distinct value of the 'size' target
df['size'].value_counts()

In [None]:
# Box plot of the 'age' column (trailing ';' suppresses the notebook's text output)
sns.boxplot(data=df['age']);

In [None]:
# Bar chart of how many samples fall into each size class
size_counts = df['size'].value_counts()
size_counts.plot.bar()
plt.xlabel('Sizes')
plt.ylabel('Amount')
plt.title('Samples');

In [None]:
# Scatter plot of weight against height, drawn as one series (and one
# legend entry) per distinct value of 'size'.
fig, ax = plt.subplots(figsize=(15, 8))
for size_label in df['size'].unique():
    # Rows that belong to the current size class
    size_rows = df[df['size'] == size_label]
    ax.scatter(size_rows['weight'], size_rows['height'], label=size_label)
# Label through the Axes object created above instead of pyplot's implicit
# "current axes" state, so everything targets the same figure explicitly.
ax.set_xlabel('weight')
ax.set_ylabel('height')
ax.set_title('Weight vs Height By Size')
ax.legend();

In [None]:
# Histogram of the 'age' column
df['age'].plot.hist();

In [None]:
# Print a concise summary of the DataFrame
df.info()

In [None]:
# Descriptive statistics of the DataFrame's columns
df.describe()

In [None]:
# Pairwise correlation between the numeric columns.
# df still contains the 'size' target (presumably text labels - confirm
# against the CSV). Since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises on them, so select the numeric
# columns explicitly; this works on older pandas releases as well.
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix

In [None]:
# Annotated heat map of the correlation matrix
heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "linewidths": 0.5,
    "cmap": "YlGnBu",
}
fig, ax = plt.subplots(figsize=(8, 5))
ax = sns.heatmap(corr_matrix, **heatmap_options);

# 5. Modelling

In [None]:
# Target is the 'size' column; every other column is used as a feature
y = df['size']
X = df.drop(columns='size')

In [None]:
# Baseline classifiers to compare, keyed by the label used when reporting
log_reg_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier()
rf_clf = RandomForestClassifier()

models = {
    'LogisticRegresion': log_reg_clf,
    'KNN': knn_clf,
    'RandomForestClassifier': rf_clf,
}

In [None]:
# Seed NumPy's global RNG so the random split is repeatable between runs
np.random.seed(42)

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The previous train_size=0.2 did the opposite: the models were fitted on
# only 20% of the data and scored on the other 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit each baseline model on the training split and record its score on
# the test split, keyed by the model's label in `models`.
baseline_models_score = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    baseline_models_score[name] = score
    print(f'Model {name} with score: {score}')

In [None]:
# Display the dict of baseline scores collected per model
baseline_models_score

In [None]:
# Bar chart comparing the baseline scores, with y ticks spaced 0.1 apart
model_compare = pd.DataFrame(baseline_models_score, index=["accuracy"])
model_compare.transpose().plot(kind="bar");
plt.yticks(np.arange(0,1.1,0.1));

# Tuning models

In [None]:
# Hyperparameter search spaces used by the tuning cells

# LogisticRegression: 20 log-spaced values of C, liblinear solver only
log_reg_grid = {
    "C": np.logspace(-4, 4, num=20),
    "solver": ["liblinear"],
}

# RandomForestClassifier: tree count, depth limit and split/leaf minimums
rf_grid = {
    "n_estimators": np.arange(10, 200, 50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [2, 4],
}

# 1. LogisticRegression
## RandomizedSearchCV

In [None]:

# Randomized hyperparameter search for LogisticRegression over log_reg_grid
# (20 sampled candidates, 5-fold cross-validation)
np.random.seed(42)

rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)
rs_log_reg.fit(X_train, y_train)

# Score of the fitted search on the test split
rs_log_reg.score(X_test, y_test)

## GridSearchCV

In [None]:
# Exhaustive grid search for LogisticRegression over log_reg_grid (5-fold CV)
np.random.seed(42)
grid_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)
grid_log_reg.fit(X_train, y_train)

In [None]:
# Score of the grid-searched LogisticRegression on the test split
grid_log_reg.score(X_test, y_test)

# KNN tuning

In [None]:
# Randomized search over the odd neighbour counts 1, 3, ..., 19 for KNN
neighbors = range(1, 21, 2)
params = dict(n_neighbors=neighbors)
rs_knn = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=params,
    verbose=True,
)
rs_knn.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the KNN search
rs_knn.best_params_

In [None]:
# Score of the tuned KNN search on the test split
rs_knn.score(X_test, y_test)

## RANDOM FOREST CLASSIFIER
## RandomizedSearchCV

In [None]:
# Randomized hyperparameter search for RandomForestClassifier over rf_grid
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    verbose=True,
)
rs_rf.fit(X_train, y_train)

# Score of the fitted search on the test split
rs_rf.score(X_test, y_test)

In [None]:
# Parameter setting selected by the randomized random-forest search
rs_rf.best_params_

## GridSearchCV

In [None]:
# Exhaustive grid search for RandomForestClassifier over rf_grid
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_grid,
    verbose=True,
)
gs_rf.fit(X_train, y_train)

# Score of the fitted search on the test split
gs_rf.score(X_test, y_test)

In [None]:
# Parameter setting selected by the random-forest grid search
gs_rf.best_params_

### USING GS_RF FOR METRICS

In [None]:
# Predictions of the grid-searched random forest for the test split
y_preds = gs_rf.predict(X_test)

In [None]:
# Confusion matrix of the true test labels against the predictions
confusion_matrix(y_test, y_preds)

In [None]:
# Plot the confusion matrix of gs_rf on the test split.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement and takes
# the same (estimator, X, y, ax=...) arguments.
# NOTE(review): the notebook's import cell still imports plot_roc_curve and
# plot_confusion_matrix, which only exist in scikit-learn < 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots(figsize=(15, 8))
ConfusionMatrixDisplay.from_estimator(gs_rf, X_test, y_test, ax=ax)

In [None]:
# Build the text classification report for the test predictions, then print it
report = classification_report(y_test, y_preds, zero_division=False)
print(report)