### Importing the libraries

In [1]:
import pandas as pd
import numpy as np

### Importing the dataset

In [2]:
# Location of the source CSV.
# NOTE(review): this looks like a Google Drive mount inside Colab — confirm the
# path before running the notebook in any other environment.
DATASET_CSV = "/content/drive/My Drive/Colab Notebooks/Clothes Size Prediction/final_test.csv"
dataset = pd.read_csv(DATASET_CSV)

In [3]:
# Plain-text preview of the first five rows.
print(dataset.head(n=5))

   weight   age  height size
0      62  28.0  172.72   XL
1      59  36.0  167.64    L
2      61  34.0  165.10    M
3      65  27.0  175.26    L
4      62  45.0  172.72    M


In [4]:
# Features are every column except the last; the target is the last column.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

#### Check whether the dataset contains any NaN values

In [5]:
# True if at least one cell anywhere in the frame is missing.
dataset.isna().to_numpy().any()

True

#### Count the NaN values across the entire dataset

In [6]:
# Per-column missing counts, then summed into a single total.
dataset.isna().sum(axis=0).sum()

587

In [7]:
# One boolean per column: does that column contain any missing value?
dataset.isnull().any(axis=0)

weight    False
age        True
height     True
size      False
dtype: bool

#### Taking care of missing data

In [8]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 of X (age and height) are the ones reported as containing NaN.
# Each gap is filled with the mean of its column.
# NOTE(review): the imputer is fit on all rows, before the train/test split
# further down, so the fill values also reflect rows that end up in the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

#### Encoding the dependent variable

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the size labels as integer class codes.
# The encoder is always fit on the raw label column taken from `dataset` rather
# than on the current value of `y`: `y` is overwritten with the integer codes
# below, so fitting on `y` would, on a second run of this cell, learn those
# integers as the classes and `le.inverse_transform` would stop returning the
# original size names.
le = LabelEncoder()
y = le.fit_transform(dataset.iloc[:, -1].values)

In [10]:
# Show the distinct encoded class labels present in y.
print(set(y))

{0, 1, 2, 3, 4, 5, 6}


#### Splitting the dataset into training and test set

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

#### Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# Scaling statistics are learned from the training rows only and then applied
# to both partitions.
# NOTE(review): X_train / X_test are overwritten with their scaled versions, so
# running this cell a second time would re-fit `sc` on already-scaled data.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Training the model on the training set

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 10 trees, entropy split criterion, fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
# Kept as the final expression so the fitted estimator is echoed as cell output.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

#### Evaluating accuracy on the test set

In [14]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows with the baseline forest; the resulting accuracy
# is the cell output.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5069179588541521

In [15]:
# Candidate values for the random-forest hyperparameter search.
n_estimators = [10, 100, 1000]  # values tried for the `n_estimators` parameter
max_features = ['sqrt', 'log2']  # values tried for the `max_features` parameter

In [17]:
# Search space for the grid search, keyed by estimator parameter name.
parameters = {"n_estimators": n_estimators, "max_features": max_features}

In [18]:
from sklearn.model_selection import GridSearchCV

# Search every combination in `parameters`, ranking candidates by accuracy.
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Report the winning score and the parameter combination that produced it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 50.56 %
Best Parameters: {'max_features': 'sqrt', 'n_estimators': 1000}


### Predicting 

In [19]:
# Predict encoded size classes for the held-out rows.
# NOTE(review): this calls `classifier`, not `grid_search`; confirm whether the
# tuned parameters reported by the search were meant to be used here.
y_pred = classifier.predict(X_test)

In [20]:
# One hand-entered sample, in the dataset's column order: weight, age, height.
# It is passed through the same scaler as the training data before prediction.
sample_wah = [[58, 24, 172]]
pred = classifier.predict(sc.transform(sample_wah))

In [21]:
# Map the encoded prediction back to its original size label and show it.
predictions_test = le.inverse_transform(pred)
print(predictions_test)

['XXS']


In [22]:
# Side-by-side view: column 0 is the predicted class, column 1 the true class.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [1 1]
 ...
 [5 5]
 [2 2]
 [0 1]]
