In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
from sklearn.model_selection import KFold, cross_val_score
sns.set_style("darkgrid")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler,MinMaxScaler
#Import data
import pickle5 as pickle

In [2]:
file_train_X = open("../data/X_train_03_preprocess","rb")
file_test_X = open("../data/X_test_03_preprocess","rb")
file_train_y = open("../data/y_regression_train_03_preprocess","rb")


X = pickle.load(file_train_X)
y = pickle.load(file_train_y)
X_test = pickle.load(file_test_X)


X.isnull().sum().sum() #No missing values
X.shape, y.shape

((1460, 71), (1460,))

In [3]:
y = np.log1p(y)
y.head()

Id
1    12.247699
2    12.109016
3    12.317171
4    11.849405
5    12.429220
Name: SalePrice, dtype: float64

In [4]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X[c] = X[c].astype('category')

In [5]:
for c in X_test.columns:
    col_type = X_test[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X_test[c] = X_test[c].astype('category')

In [6]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [100]:
grid_parameters = {}
pipes = {}
grids = {}
regressors = ['mlp','lgbm']

grid_parameters['mlp'] = {
    'MLPRegressor__max_iter': [100,200],
    'MLPRegressor__activation' : ['relu'],
    'MLPRegressor__hidden_layer_sizes':[(10,10),(50,100,50)],}

grid_parameters['lgbm'] = {
    'lgbm__learning_rate' : [0.07],
    'lgbm__min_data_in_leaf' : list(range(1,35,5)),
    'lgbm__n_estimators' : list(range(10,500,50)),
    'lgbm__max_depth' : list(range(5,11,1)),}
    
    
pipes['mlp'] = Pipeline([('scaler',  MinMaxScaler()),
                        ('MLPRegressor', MLPRegressor(random_state = 303))])


pipes['lgbm'] = Pipeline([('scaler', RobustScaler()),('lgbm', LGBMRegressor(boosting_type='gbdt', objective='regression', 
                                                                         metric='neg_root_mean_squared_error'))])



mlp
lgbm


In [95]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import numpy as np

cv = KFold(n_splits=5, shuffle=True, random_state=303)

grids['lgbm'] = GridSearchCV (estimator = pipes['lgbm'],
                               param_grid = grid_parameters['lgbm'],
                               cv = cv,verbose=0,
                               n_jobs=-1)
grids['lgbm'].fit(X, y)

print('Best score')
print(grids['lgbm'].best_score_)
print('Best parameters')
print(grids['lgbm'].best_params_) 


Best score
0.8921470060614564
Best parameters
{'lgbm__learning_rate': 0.07, 'lgbm__max_depth': 5, 'lgbm__min_data_in_leaf': 6, 'lgbm__n_estimators': 310}


In [None]:
Best score
-0.13073824038468593
Best parameters
{'lgbm__learning_rate': 0.07, 'lgbm__max_depth': 5, 'lgbm__min_data_in_leaf': 6, 'lgbm__n_estimators': 310}

In [29]:
from sklearn.metrics import mean_squared_error

cv = KFold(n_splits=5, shuffle=True, random_state=303)

def fit_score(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_score(model, scoring, X=X):
    
    rmse = np.sqrt(-cross_val_score(model, X, y,scoring = scoring, cv=cv))
    
    return (rmse)

In [33]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

regressor  = grid.best_estimator_.steps[-1][-1]

In [32]:
print(fit_score(y, regressor.predict(X)))

1.0827899624039286


In [26]:
#neg_root_mean_squared_error
#neg_mean_squared_error
#neg_mean_absolute_error
#r2
#try grid
score = cv_score(regressor,"neg_mean_squared_error")
print("CV Light GBM: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

CV Light GBM: 0.1313 (0.0047)



In [99]:
results = grids['mlp'].predict(X_test)

submission = pd.read_csv("../data_regression/sample_submission.csv")
submission.iloc[:,1] = pd.Series(np.expm1( grids['mlp'].predict(X_test)))


submission['SalePrice'].sum()
#submission.to_csv("../data_regression/submission.csv", index=False)

KeyError: 'mlp'

In [50]:
best_1 = pd.read_csv("../data_regression/best_1.csv")
best_1['SalePrice'].sum()

260544858.29269767

In [18]:
#file_model_lgbm = open("../data/05_modelling_GBM_regression", "wb")
#pickle.dump(grid,file_model_lgbm)
#file_model_lgbm.close()`

In [21]:
#regressor  = grid.best_estimator_.steps[-1][-1]`

In [94]:
#from sklearn.metrics import balanced_accuracy_score,accuracy_score

#accuracy_score(cv_true_y, cv_predicted_y), balanced_accuracy_score(cv_true_y, cv_predicted_y)

(0.928082191780822, 0.9271952953018765)

In [102]:
from sklearn import set_config
set_config(display='diagram')
pipeline

In [None]:
#print("Accuracy: %0.2f (+/- %0.2f)" % (cv_results.mean(), cv_results.std() * 2))