## Libraries

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, classification_report

## Settings

In [None]:
pd.set_option('display.max_columns', None)
SEED = 12345

## Importing Data

In [None]:
train_df = pd.read_csv('../input/mobile-price-classification/train.csv')
test_df = pd.read_csv('../input/mobile-price-classification/test.csv')

In [None]:
train_df.shape, train_df.columns

In [None]:
test_df.shape, test_df.columns

In [None]:
train_df.dtypes

In [None]:
train_df.sample(5)

In [None]:
# Predictor columns fed to the models, in the order used to build X.
FEATURES = [
    'battery_power',
    'blue',
    'clock_speed',
    'dual_sim',
    'fc',
    'four_g',
    'int_memory',
    'm_dep',
    'mobile_wt',
    'n_cores',
    'pc',
    'px_height',
    'px_width',
    'ram',
    'sc_h',
    'sc_w',
    'talk_time',
    'three_g',
    'touch_screen',
    'wifi',
]

# Predictors handled as discrete categories (label-encoded further down).
CAT_FEATURES = [
    'n_cores',
    'blue',
    'dual_sim',
    'four_g',
    'three_g',
    'touch_screen',
    'wifi',
]

# Everything that is not categorical is treated as continuous; FEATURES order is kept.
CONT_FEATURES = list(filter(lambda name: name not in CAT_FEATURES, FEATURES))

# Target column(s).
DEP_VARS = ['price_range']

In [None]:
train_df['n_cores'].unique()

In [None]:
train_df['n_cores'].nunique()

## Data preprocessing

In [None]:
train_df[CAT_FEATURES].head()

In [None]:
# Fit one LabelEncoder per categorical column on the training data and keep
# the fitted encoders by column name so the same mapping can be reused later;
# then overwrite each training column with its integer codes.
cat_encoding_dict = {name: LabelEncoder().fit(train_df[name]) for name in CAT_FEATURES}
for name, fitted_encoder in cat_encoding_dict.items():
    train_df[name] = fitted_encoder.transform(train_df[name])

In [None]:
# Encode the test-set categorical columns with the encoders that were fitted
# on the training data (no re-fitting on test data).
for feature_name in CAT_FEATURES:
    fitted_encoder = cat_encoding_dict[feature_name]
    test_df[feature_name] = fitted_encoder.transform(test_df[feature_name])

In [None]:
cat_encoding_dict['n_cores'].classes_

In [None]:
train_df[CAT_FEATURES].head()

In [None]:
train_df[CONT_FEATURES].head()

In [None]:
# Standardise the continuous columns: learn the scaling statistics from the
# training data, then replace the training columns with their scaled values.
standard_enc = StandardScaler().fit(train_df[CONT_FEATURES])
train_df[CONT_FEATURES] = standard_enc.transform(train_df[CONT_FEATURES])

In [None]:
train_df[CONT_FEATURES].head()

In [None]:
standard_enc.mean_, standard_enc.scale_, standard_enc.var_

In [None]:
# Continuous scaling for test data: reuse the scaler fitted on the training
# columns (transform only, no re-fit) so both sets share the same statistics.
test_df[CONT_FEATURES] = standard_enc.transform(test_df[CONT_FEATURES])

In [None]:
train_df.head()

In [None]:
train_df[DEP_VARS].value_counts() # balanced and properly label encoded

### train validation split

In [None]:
# Hold out 20% of the labelled data for validation. Stratifying on the target
# keeps the price_range class proportions identical in both splits, so the
# per-class validation metrics are not distorted by an unlucky random draw.
features_matrix = train_df[FEATURES].values
target_vector = train_df[DEP_VARS[0]].values
X_train, X_val, y_train, y_val = train_test_split(features_matrix,
                                                  target_vector,
                                                  test_size=0.2,
                                                  stratify=target_vector,
                                                  random_state=SEED)

In [None]:
{
    "X_train":X_train.shape,
    "X_val":X_val.shape,
    "y_train":y_train.shape,
    "y_val":y_val.shape
}

### Logistic Regression

In [None]:
# Baseline model: one-vs-rest logistic regression (liblinear solver) fitted on
# the training split. Its per-class coefficients are inspected in the next cell.
lm = LogisticRegression(multi_class='ovr', solver='liblinear')
lm.fit(X_train, y_train)

In [None]:
lm.coef_

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, lm.predict(X_train)))

In [None]:
plot_confusion_matrix(lm, X_train, y_train)

#### Validation metrics

In [None]:
print(classification_report(y_val, lm.predict(X_val)))

In [None]:
plot_confusion_matrix(lm, X_val, y_val)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Single decision tree with default hyper-parameters. random_state is pinned to
# SEED so the fitted tree (and the metrics reported below) are reproducible
# across runs, in line with the other seeded models in this notebook.
DTC = DecisionTreeClassifier(random_state=SEED)
DTC.fit(X_train, y_train)

In [None]:
{col:round(fi,3) for col, fi in zip(FEATURES, DTC.feature_importances_)}

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, DTC.predict(X_train)))

In [None]:
plot_confusion_matrix(DTC, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(DTC, X_val, y_val)

In [None]:
print(classification_report(y_val, DTC.predict(X_val)))

### Random Forest Classifier - Using Defaults

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
# Random forest with default hyper-parameters. Only random_state is set, so
# that bootstrap sampling and feature selection are reproducible across runs,
# in line with the other seeded forests in this notebook.
rf_clf = RandomForestClassifier(random_state=SEED)
rf_clf.fit(X_train, y_train)

In [None]:
{col:round(fi,3) for col, fi in zip(FEATURES, rf_clf.feature_importances_)}

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, rf_clf.predict(X_train)))

In [None]:
plot_confusion_matrix(rf_clf, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(rf_clf, X_val, y_val)

In [None]:
print(classification_report(y_val, rf_clf.predict(X_val)))

In [None]:
len(rf_clf.estimators_)

In [None]:
rf_clf.estimators_[0].feature_importances_

### Random Forest Classifier - Not using defaults

In [None]:
# Hand-picked (not searched) hyper-parameters for the next forest.
random_params = dict(
    max_depth=30,
    min_samples_split=17,
    max_samples=0.8,
    min_samples_leaf=50,
    criterion='gini',
    n_estimators=150,
)

In [None]:
# Forest built from the hand-picked values in random_params.
# n_jobs=-1 uses every available core instead of assuming a 16-core machine,
# matching the other forests in this notebook.
rf_clf = RandomForestClassifier(oob_score=True,
                                n_jobs=-1,
                                random_state=SEED,
                                verbose=0,
                                class_weight="balanced",
                                max_features=None,
                                **random_params)
rf_clf.fit(X_train, y_train)

In [None]:
{col:round(fi,3) for col, fi in zip(FEATURES, rf_clf.feature_importances_)}

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, rf_clf.predict(X_train)))

In [None]:
plot_confusion_matrix(rf_clf, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(rf_clf, X_val, y_val)

In [None]:
print(classification_report(y_val, rf_clf.predict(X_val)))

### Random Forest Classifier - Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid: every combination of the values below is evaluated
# (3 * 3 * 3 * 3 * 2 * 2 = 324 candidate settings).
space = dict(
    max_depth=range(2, 9, 3),            # 2, 5, 8
    min_samples_split=range(2, 9, 3),    # 2, 5, 8
    max_samples=np.linspace(0.5, 0.7, 3),
    min_samples_leaf=range(2, 9, 3),     # 2, 5, 8
    criterion=['gini', 'entropy'],
    n_estimators=[10, 15],
)

In [None]:
# Base forest whose remaining hyper-parameters are filled in by the grid search.
base_forest_kwargs = dict(
    oob_score=False,
    n_jobs=-1,
    random_state=SEED,
    verbose=0,
    class_weight="balanced",
    max_features=None,
)
rf_clf = RandomForestClassifier(**base_forest_kwargs)

# Score every grid point with 5-fold CV on macro-averaged F1.
# Scoring names: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
clf = GridSearchCV(estimator=rf_clf, param_grid=space, scoring='f1_macro', cv=5)

clf.fit(X_train, y_train)

In [None]:
clf.cv_results_.keys()

In [None]:
clf.best_score_, clf.best_params_

In [None]:
len(clf.cv_results_['mean_test_score'])*5 # Number of forest that were created

### Refit with best params

In [None]:
# Rebuild the forest with the winning grid-search parameters (plus the same
# fixed settings used during the search) and train it on the training split.
tuned_rf = RandomForestClassifier(**clf.best_params_,
                                  max_features=None,
                                  class_weight="balanced",
                                  verbose=0,
                                  random_state=SEED,
                                  n_jobs=-1,
                                  oob_score=False)
tuned_rf.fit(X_train, y_train)

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, tuned_rf.predict(X_train)))

In [None]:
plot_confusion_matrix(tuned_rf, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(tuned_rf, X_val, y_val)

In [None]:
print(classification_report(y_val, tuned_rf.predict(X_val)))

### Detour: Distributions

In [None]:
# https://docs.scipy.org/doc/scipy/reference/stats.html
from scipy.stats import randint, poisson, uniform, norm, halfnorm
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
randint_sample = randint(0,10)
plt.hist(randint_sample.rvs(5000))

In [None]:
uniform_sample = uniform(0,1)
plt.hist(uniform_sample.rvs(5000))

In [None]:
poisson_sample = poisson(10)
plt.hist(poisson_sample.rvs(1000), bins=range(0,20))

In [None]:
norm_sample = norm(10, 0.1)
plt.hist(norm_sample.rvs(1000))

In [None]:
halfnorm_sample = halfnorm(10, 0.1)
plt.hist(halfnorm_sample.rvs(1000))

### Random Forest Classifier - Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
uniform(0.5,0.3).rvs(10)

In [None]:
# Distributions sampled by the randomized search.
# randint(low, high) draws integers from low up to high - 1;
# uniform(loc, scale) draws floats from [loc, loc + scale], i.e. 0.5 to 0.8 here.
space = dict(
    max_depth=randint(low=2, high=9),
    min_samples_split=randint(low=2, high=9),
    max_samples=uniform(loc=0.5, scale=0.3),
    min_samples_leaf=randint(low=2, high=9),
    criterion=['gini', 'entropy'],
    n_estimators=randint(low=10, high=15),
)

In [None]:
# Base forest whose remaining hyper-parameters are sampled by the search.
rf_clf = RandomForestClassifier(oob_score=False,
                                n_jobs=-1,
                                random_state=SEED,
                                verbose=0,
                                class_weight="balanced",
                                max_features=None)

# n_iter=20 sampled candidates x cv=5 folds = 100 model fits.
# random_state pins the candidates drawn from the distributions in `space`,
# so the search returns the same best_params_ on every run.
clf = RandomizedSearchCV(estimator=rf_clf,
                         param_distributions=space,
                         cv=5,
                         scoring='f1_macro',
                         n_iter=20,
                         random_state=SEED)

clf.fit(X_train, y_train)

In [None]:
clf.cv_results_.keys()

In [None]:
clf.best_score_, clf.best_params_

### Refit with best params

In [None]:
# Rebuild the forest with the winning randomized-search parameters (plus the
# same fixed settings used during the search) and train it on the training split.
tuned_rf = RandomForestClassifier(**clf.best_params_,
                                  max_features=None,
                                  class_weight="balanced",
                                  verbose=0,
                                  random_state=SEED,
                                  n_jobs=-1,
                                  oob_score=False)
tuned_rf.fit(X_train, y_train)

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, tuned_rf.predict(X_train)))

In [None]:
plot_confusion_matrix(tuned_rf, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(tuned_rf, X_val, y_val)

In [None]:
print(classification_report(y_val, tuned_rf.predict(X_val)))

# Hyperopt

Read more about Hyperopt [here](https://towardsdatascience.com/hyperparameter-optimization-in-python-part-2-hyperopt-5f661db91324), [here](https://github.com/hyperopt/hyperopt/wiki/FMin),
[here](https://towardsdatascience.com/an-introductory-example-of-bayesian-optimization-in-python-with-hyperopt-aae40fff4ff0) and [here](https://maelfabien.github.io/machinelearning/HyperOpt/#hyperopt)

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.model_selection import KFold

from hyperopt import STATUS_OK
from hyperopt import Trials

In [None]:
NUM_FOLDS = 5
NUM_EVALS = 20

Defining search spaces is where hyperopt shines. There are plenty of sampling options to choose from:
* **Categorical parameters** - use **hp.choice**
* **Integer parameters** - you can use **hp.randint, hp.quniform, hp.qloguniform or hp.qlognormal**, which gives you a lot of options to model your integer hyperparameter space
* **Float parameters** - similarly to integer parameters, you get to choose what works for your problem with **hp.normal, hp.uniform, hp.lognormal and hp.loguniform**

In [None]:
bayes_trials = Trials()

In [None]:
# Hyperopt search space for the random forest. Each entry's label (first
# argument) matches its dict key; the sampled values are cast to their final
# Python types inside objective() before the estimator is built.
space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_samples_split' : hp.quniform('min_samples_split', 2, 50, 1),
    'n_estimators': hp.quniform('n_estimators', 20, 50, 1),
    # The winning entry is later decoded as an index into this list when
    # optimal_params is built, so keep the two lists in the same order.
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_samples' : hp.uniform('max_samples', 0.5, 1),
    'min_samples_leaf' : hp.randint('min_samples_leaf', 2, 9)
}

In [None]:
def objective(params):
    """Score one hyperopt sample with cross-validated macro-F1.

    A random forest is built from the sampled hyper-parameters and evaluated
    with NUM_FOLDS-fold stratified cross-validation on the training split
    (X_train / y_train) only, so the validation split stays unseen during
    tuning and its metrics remain an honest hold-out estimate.

    Parameters
    ----------
    params : dict
        One sample from the search space with the keys 'max_depth',
        'min_samples_split', 'n_estimators', 'criterion', 'max_samples'
        and 'min_samples_leaf'.

    Returns
    -------
    dict
        'loss' is the negated mean F1 score (hyperopt minimises the loss),
        'params' holds the type-cast hyper-parameters that were evaluated,
        and 'status' is STATUS_OK.
    """
    # Cast every sampled value to the type the estimator expects.
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split' : int(params['min_samples_split']),
        'n_estimators' : int(params['n_estimators']),
        'criterion' : params['criterion'],
        'max_samples' : float(params['max_samples']),
        'min_samples_leaf': int(params['min_samples_leaf'])
    }

    # Same fixed settings as the final model (class_weight="balanced") so the
    # search optimises the configuration that is actually trained afterwards.
    # The OOB score is never read during cross-validation, so it is disabled.
    forest = RandomForestClassifier(oob_score=False,
                                    n_jobs=-1,
                                    random_state=SEED,
                                    verbose=0,
                                    class_weight="balanced",
                                    **params)

    # Stratified folds keep the class proportions equal in every fold.
    folds = StratifiedKFold(n_splits=NUM_FOLDS,
                            shuffle=True,
                            random_state=SEED)

    score = cross_val_score(estimator=forest,
                            X=X_train,
                            y=y_train,
                            scoring='f1_macro',
                            cv=folds).mean()

    print("F1 Score {:.3f} params {}".format(score, params))
    return {"loss":-1.0*score, "params":params, "status":STATUS_OK}

In [None]:
# Run the TPE search: objective() is evaluated until NUM_EVALS trials exist,
# and every trial is recorded in bayes_trials. `best` holds the raw winning
# sample, which is converted into estimator arguments in the next cell.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=NUM_EVALS, trials=bayes_trials)
print(best)

In [None]:
# Turn the raw fmin result into RandomForestClassifier keyword arguments.
# 'criterion' comes back as a position in the hp.choice option list, so it is
# looked up here; the numeric entries are cast to the types sklearn expects.
criterion_options = ['gini', 'entropy']
optimal_params = dict(
    max_depth=int(best['max_depth']),
    min_samples_split=int(best['min_samples_split']),
    n_estimators=int(best['n_estimators']),
    criterion=criterion_options[best['criterion']],
    max_samples=float(best['max_samples']),
    min_samples_leaf=int(best['min_samples_leaf']),
)

print(optimal_params)

In [None]:
bayes_trials.trials

In [None]:
# Continue the same search: bayes_trials already holds the earlier trials, so
# this call resumes from them and keeps evaluating until 100 trials exist.
best2 = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100, trials=bayes_trials)
print(best2)

# Refresh optimal_params from the extended search; otherwise the final model
# below would still be built from the result of the first, shorter run and
# the extra evaluations would be thrown away.
optimal_params = {
    'max_depth': int(best2['max_depth']),
    'min_samples_split': int(best2['min_samples_split']),
    'n_estimators': int(best2['n_estimators']),
    'criterion': ['gini', 'entropy'][best2['criterion']],
    'max_samples' : float(best2['max_samples']),
    'min_samples_leaf': int(best2['min_samples_leaf'])
}
print(optimal_params)

In [None]:
# Final forest: fixed settings plus the hyper-parameters found by hyperopt,
# trained on the training split.
tuned_rf = RandomForestClassifier(oob_score=True,
                                  n_jobs=-1,
                                  random_state=SEED,
                                  verbose=1,
                                  class_weight="balanced",
                                  **optimal_params)

tuned_rf.fit(X=X_train, y=y_train)

In [None]:
preds_train = tuned_rf.predict(X_train)
preds_val = tuned_rf.predict(X_val)

### Validation

#### Training Metrics

In [None]:
print(classification_report(y_train, tuned_rf.predict(X_train)))

In [None]:
plot_confusion_matrix(tuned_rf, X_train, y_train)

#### Validation metrics

In [None]:
plot_confusion_matrix(tuned_rf, X_val, y_val)

In [None]:
print(classification_report(y_val, tuned_rf.predict(X_val)))