In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import itertools
import time

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score, KFold, ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, \
roc_curve, auc, precision_score, recall_score, confusion_matrix, f1_score, precision_recall_curve, \
make_scorer
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from patsy import dmatrix
from pyearth import Earth

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
nan_cols = test.isna().sum()
test1 = test.drop(nan_cols[nan_cols > 269].index.tolist(), axis = 1)
train1 = train.drop(nan_cols[nan_cols > 269].index.tolist(), axis = 1)

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)

# fit and transform the data
test_imputed = pd.DataFrame(imputer.fit_transform(test1), columns=test1.columns)
train_imputed = pd.DataFrame(imputer.fit_transform(train1), columns=train1.columns)

In [25]:
X = train_imputed.drop(['id','y'], axis = 1)
y = train_imputed.y
X_test = test_imputed.drop('id', axis = 1)

In [26]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

### Decision Tree

In [27]:
model = DecisionTreeRegressor(random_state=45)
model.fit(X_scaled,y)

In [28]:
import warnings 
warnings.filterwarnings("ignore")
print("Maximum tree depth:", model.get_depth())
print("Maximum leaves:", model.get_n_leaves())

Maximum tree depth: 39
Maximum leaves: 2962


#### Tuning Decision Tree

In [29]:
cv = KFold(n_splits=5,shuffle=True,random_state=1)
parameters = {'max_depth': range(2, 40, 3), 
              'max_leaf_nodes': range(2, 2962, 200),
              'min_samples_leaf': range(1,9)}
model = GridSearchCV(DecisionTreeRegressor(random_state=45), parameters, n_jobs=-1,verbose=1,cv=cv,
                    scoring=['neg_mean_absolute_error','r2'], 
                     refit = 'neg_mean_absolute_error')

In [31]:
model.fit(X_scaled, y)
print(model.best_score_, model.best_params_)

Fitting 5 folds for each of 1560 candidates, totalling 7800 fits


KeyboardInterrupt: 

In [None]:
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_max_depth, -cv_results.mean_test_neg_mean_absolute_error, 'o')
axes[0].set_ylim([65200, 66000])
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold MAE')
axes[1].plot(cv_results.param_max_leaf_nodes, -cv_results.mean_test_neg_mean_absolute_error, 'o')
axes[1].set_ylim([65200, 66000])
axes[1].set_xlabel('Leaves')
axes[1].set_ylabel('K-fold MAE')
axes[2].plot(cv_results.param_min_samples_leaf, -cv_results.mean_test_neg_mean_absolute_error, 'o')
axes[2].set_ylim([65200, 66000])
axes[2].set_xlabel('Leaves')
axes[2].set_ylabel('K-fold MAE');