In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from hyperopt import tpe,hp,Trials
from hyperopt.fmin import fmin

# Directory the processed CSV is read from (relative path, resolved against
# the kernel's working directory).
path_import_and_export = "../../../../Thesis_data/processed_data/"

# Seed handed to train_test_split, RandomForestClassifier and
# RandomizedSearchCV further down.
random_state = 101

In [None]:
# Column -> dtype mapping passed to pd.read_csv.
# "int64" is the plain NumPy integer dtype; "Int64" (capital I) is pandas'
# nullable integer extension dtype.
dtypes = dict(
    MONTH="int64",
    DAY_OF_MONTH="int64",
    DAY_OF_WEEK="int64",
    OP_UNIQUE_CARRIER="object",
    TAIL_NUM="object",
    ORIGIN_AIRPORT_ID="int64",
    ORIGIN="object",
    ORIGIN_CITY_NAME="object",
    DEST="object",
    CRS_DEP_TIME="Int64",
    DEP_DEL15="Int64",  # used as the prediction target later in the notebook
    DISTANCE_GROUP="int64",
    MANUFACTURE_YEAR="Int64",
    NUMBER_OF_SEATS="Int64",
    AWND="float64",
    PRCP="float64",
    SNOW="float64",
    SNWD="float64",
    TMAX="float64",
    MEDIAN_AGE="float64",
    TOT_POP="Int64",
    AVG_HOUSEHOLD_SIZE="float64",
)

In [None]:
# Read the cleaned on-time reporting export, forcing the column dtypes
# declared in `dtypes`.
ontime_reporting = pd.read_csv(
    path_import_and_export + "ontime_reporting_clean_export.csv",
    dtype=dtypes,
)

# DEP_DEL15 is the target; every other column is kept as a feature.
y = ontime_reporting["DEP_DEL15"]
X = ontime_reporting.drop(columns=["DEP_DEL15"])

# Hold out 20% of the rows for testing; the shared seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Univariate filter: score each feature against the target with the
# chi-squared statistic (SelectKBest keeps the k=4 highest-scoring ones).
#
# chi2 needs numeric input, but when the notebook is run top to bottom this
# cell executes before the label-encoding cell, so the split still holds raw
# string columns. Encode them on a private copy: the cell then works in either
# execution order and never mutates X_train. If the columns are already
# encoded, select_dtypes finds nothing and the loop is a no-op.
# NOTE(review): chi2 also rejects negative feature values; confirm that none
# of the numeric columns (e.g. temperatures) can go below zero.
X_chi2 = X_train.copy()
for col in X_chi2.select_dtypes(["object"]).columns:
    X_chi2[col] = LabelEncoder().fit_transform(X_chi2[col])

test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X_chi2, y_train)

np.set_printoptions(precision=3)

# Label the scores with the columns that were actually fitted. The previous
# `ontime_reporting.columns[:-1]` is only correct when DEP_DEL15 happens to be
# the last column of the CSV; otherwise every score after the target's
# position is attributed to the wrong feature.
sel_filter = pd.DataFrame([list(fit.scores_)], columns=X_chi2.columns)
sel_filter.head()

In [4]:
# Integer-encode the string-valued feature columns.
#
# Each column's encoder is fitted on the training split only, and the resulting
# label -> code mapping is reused for the test split. Fitting a second,
# independent LabelEncoder on X_test (as before) can assign different integers
# to the same category in the two splits, which silently corrupts every
# prediction made on X_test.

# Work on explicit copies: the frames returned by train_test_split are slices
# of X, and assigning into them triggers SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

catColumns = X_train.select_dtypes(['object']).columns

le = LabelEncoder()
label_encoders = {}  # column name -> fitted encoder, kept for inverse lookups

for col in catColumns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

    # Reuse the training mapping for the test split. Categories that never
    # appeared in training have no code; map them to -1 instead of raising,
    # which is what le.transform would do.
    codes = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(codes).fillna(-1).astype("int64")

In [None]:
# Candidate values for the RandomizedSearchCV run at the end of the notebook.
#
# RandomizedSearchCV samples from plain lists (or from objects exposing an
# `rvs` method). The hyperopt `hp.*` expressions used here before are neither,
# so the search could not be fitted. The values below cover the same ranges:
#   * max_depth: the integers 5..10 (hp.quniform(5, 10, 1) produced 5.0..10.0)
#   * max_features: 'auto' is dropped because for a classifier it is a
#     deprecated alias of 'sqrt' and is rejected by newer scikit-learn releases
#   * n_estimators: unchanged
random_grid_rf = {
    "max_depth": list(range(5, 11)),
    "max_features": ["sqrt", "log2", None],
    "n_estimators": [10, 50, 300, 750, 1200, 1300, 1500],
}

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

# hyperopt search space consumed by `objective`.
# The order of each hp.choice list is significant: fmin reports choices by
# their index in the list.
# NOTE(review): 'auto' in max_features is a deprecated alias of 'sqrt' for
# classifiers and is rejected by newer scikit-learn releases; confirm the
# installed version before running.
space = dict(
    criterion=hp.choice('criterion', ['entropy', 'gini']),
    # quniform yields floats in steps of 10; `objective` casts them with int()
    max_depth=hp.quniform('max_depth', 10, 1200, 10),
    max_features=hp.choice('max_features', ['auto', 'sqrt','log2', None]),
    # floats are passed straight to the forest's min_samples_* parameters
    min_samples_leaf=hp.uniform('min_samples_leaf', 0, 0.5),
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    n_estimators=hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500]),
)

def objective(space):
    """Score one hyperparameter sample for hyperopt's ``fmin``.

    Builds a RandomForestClassifier from the sampled values and evaluates it
    with 5-fold cross-validation on the notebook-level ``X_train``/``y_train``.

    Parameters
    ----------
    space : dict
        One sample from the search space, with the keys ``criterion``,
        ``max_depth``, ``max_features``, ``min_samples_leaf``,
        ``min_samples_split`` and ``n_estimators``.

    Returns
    -------
    dict
        ``{'loss': -mean_cv_score, 'status': STATUS_OK}``.
    """
    model = RandomForestClassifier(
        criterion=space['criterion'],
        # the space samples max_depth as a float; the forest needs an int
        max_depth=int(space['max_depth']),
        max_features=space['max_features'],
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators'],
        # Seed the forest like every other estimator in this notebook, so the
        # loss depends only on the sampled hyperparameters and not on the
        # forest's internal randomness.
        random_state=random_state,
    )

    # With no `scoring` argument, cross_val_score uses the estimator's own
    # score method, which is mean accuracy for a classifier.
    accuracy = cross_val_score(model, X_train, y_train, cv=5).mean()

    # fmin minimizes the loss, so negate the accuracy in order to maximize it.
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# `trials` collects every evaluation made during the search so the history
# can be inspected afterwards.
trials = Trials()

# Minimize `objective` over `space` with the TPE algorithm, 80 evaluations.
best = fmin(objective, space, algo=tpe.suggest, max_evals=80, trials=trials)

# Show the best parameter set found (hp.choice entries appear as list indices).
best

In [5]:
# Baseline: a forest with default hyperparameters (only the seed is set),
# trained on the training split. fit() returns the estimator itself.
rf = RandomForestClassifier(random_state=random_state).fit(X_train, y_train)

# Predictions for the held-out split.
y_pred = rf.predict(X_test)

#scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="roc_auc")

In [None]:
# Randomized hyperparameter search around the baseline forest: 10 sampled
# candidates, each scored with 2-fold cross-validation, using all CPU cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid_rf,
    n_iter=10,
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=random_state,
)

In [None]:
# Run the randomized hyperparameter search on the training split.
rf_random.fit(X_train, y_train)