In [178]:
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,cross_val_score,GridSearchCV
from numpy import mean, std
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBClassifier,XGBRegressor,XGBRFClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report,confusion_matrix

In [278]:
#Loading the data into pandas DataFrame
df = pd.read_csv('training_data/car.data', names=['buying', 'maint', 'doors', 'persons', 'boot', 'safety', 'class'])
#Leave out persons column|
df=df[['maint','doors','boot','safety','class','buying']]

In [279]:
#Check the distribution of target column
df['buying'].value_counts(normalize=True)

vhigh    0.25
high     0.25
med      0.25
low      0.25
Name: buying, dtype: float64

#### Target column distribution is quite balanced

#### Changing categorical range to numerical range (quantifying good, very good etc)

In [280]:
#Checking unique values
df['maint'].unique()
df['doors'].unique()
df['safety'].unique()
df['class'].unique()
df['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [281]:
def quantify(data):
    if data in ["low","unacc","small"]:
        return 0
    elif data in ["med","acc"]:
        return 1
    elif data in ["high", "good", "big"]:
        return 2
    elif data in ["vhigh","vgood"]:
        return 3
    else:
        return data

In [282]:
#Transforming data to numbers
df_q = df.copy()
df_q['maint'] = df_q['maint'].apply(quantify)
df_q['doors'] = df_q['doors'].apply(lambda x: int(5) if x == "5more" else int(x))
df_q['boot'] = df_q['boot'].apply(quantify)
df_q['safety'] = df_q['safety'].apply(quantify)
df_q['class'] = df_q['class'].apply(quantify)
df_q['buying'] = df_q['buying'].apply(quantify)


In [283]:
df_q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   maint   1728 non-null   int64
 1   doors   1728 non-null   int64
 2   boot    1728 non-null   int64
 3   safety  1728 non-null   int64
 4   class   1728 non-null   int64
 5   buying  1728 non-null   int64
dtypes: int64(6)
memory usage: 81.1 KB


#### Training

In [284]:
# Create X and y variables
X = df_q.drop(columns = ['buying'])
y = df_q['buying']

# Split the dataset by 0.75 and 0.25
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify = y, train_size=0.9)

# Check the shape of both train and val datasets
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1555, 5) (173, 5) (1555,) (173,)


### Random Forest
Random forest is a supervised learning algorithm that can be used for classification. As the name suggests, this algorithm creates multiple decision trees on randomly selected samples and get prediction from each tree. Each tree will have equal votes and tree with the most votes is chosen as the final result.

In [271]:
rf_pipe = Pipeline([
        ('rf', RandomForestClassifier())
    ])
# Set up hyperparameters tuning
rf_params = {
    'rf__n_estimators':range(50, 200, 50),
    'rf__max_depth':range(30, 40),
    'rf__min_samples_leaf':range(1, 40, 10),
    'rf__random_state':[123]                       # [123]
}

In [272]:
gs = GridSearchCV(rf_pipe, param_grid = rf_params, cv = 5, scoring = 'f1_micro', verbose = 5, n_jobs = -1)

# Fit the model
gs.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=5, estimator=Pipeline(steps=[('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rf__max_depth': range(30, 40),
                         'rf__min_samples_leaf': range(1, 40, 10),
                         'rf__n_estimators': range(50, 200, 50),
                         'rf__random_state': [123]},
             scoring='f1_micro', verbose=5)

In [273]:
# Get the best parameters
best_params = gs.best_params_

# Get the best model
best_model = gs.best_estimator_

# # Predict validation set
pred = best_model.predict(X_val)

print(gs.best_score_)

0.2784565916398714


### Gradient Boost
Gradient Boost is another boosting ensemble model that takes an iterative approach to combining weak learners to create a strong learner by focusing on mistakes of prior iterations. For Gradient Boost, all the models are weighed equally and their predictive capacity is restricted with learning rate to increase accuracy.



In [298]:
gb_pipe = Pipeline([
        ('scale', StandardScaler()),
        ('gb', GradientBoostingClassifier())
    ])
# Set up hyperparameters tuning
gb_params = {
    'gb__learning_rate':  [0.1, 0.25, 0.5],            
    'gb__n_estimators': [100, 150, 200, 300],
    'gb__max_depth': [3, 5, 7],
    'gb__random_state': [123],
}


In [299]:
gb = GridSearchCV(gb_pipe, param_grid = gb_params, cv = 5, scoring = 'f1_micro', verbose = 1, n_jobs = -1)

# Fit the model
gb.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('gb', GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'gb__learning_rate': [0.1, 0.25, 0.5],
                         'gb__max_depth': [3, 5, 7],
                         'gb__n_estimators': [100, 150, 200, 300],
                         'gb__random_state': [123]},
             scoring='f1_micro', verbose=1)

In [301]:
# Get the best parameters
best_params = gb.best_params_

# Get the best model
best_model = gb.best_estimator_

# # Predict validation set
pred = best_model.predict(X_val)

print(gb.best_score_)
print(best_params)

0.2347266881028939
{'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 100, 'gb__random_state': 123}


### XGBoost
XGBoost, also known as eXtreme Gradient Boosting, is an implementation of gradient boosted decision trees designed for speed and performance. Similar to Gradient Boosting, it is an ensemble tree method that applies the principle of boosting weak learners using the gradient descent architecture.

In [308]:
xgb_pipe = Pipeline([
        ('scale', StandardScaler()),
        ('xgb', XGBClassifier(objective = 'multi:softmax',
                              scale_pos_weight = 1,
                              seed = 123,
                              booster = 'gbtree',
                              eval_metric = None, 
                              use_label_encoder = False,
                              n_jobs = -1))
    ])
xgb_params = {
    'xgb__n_estimators': [200,225,250,300],
    'xgb__max_depth': [1,3,5,7],
}

In [309]:
xgb = GridSearchCV(xgb_pipe, param_grid = xgb_params, cv = 5, scoring = 'f1_micro', verbose = 1, n_jobs = -1)

# Fit the model
xgb.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster='gbtree',
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                          

In [310]:
# Get the best parameters
best_params = xgb.best_params_

# Get the best model
best_model = xgb.best_estimator_

# # Predict validation set
pred = best_model.predict(X_val)

print(xgb.best_score_)
print(best_params)

0.33054662379421224
{'xgb__max_depth': 1, 'xgb__n_estimators': 250}


In [315]:
test_df = pd.read_csv('testing_data/test.csv')
test_df

Unnamed: 0,maint,doors,boot,safety,class
0,2,4,2,2,2


In [319]:
pred = best_model.predict(test_df)
pred

array([0], dtype=int64)