In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
import itertools as it
import time as time
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV
from pyearth import Earth
from lightgbm import LGBMRegressor


In [None]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
from sklearn.impute import KNNImputer

# A single 5-nearest-neighbour imputer object is reused for both frames; every
# fit_transform call refits it, so after this cell it holds the fit on `train`.
# NOTE(review): test and train are therefore imputed from their own rows only,
# and every column present (including 'id', and 'y' for train) takes part in
# the neighbour search - confirm that this is intended.
imputer = KNNImputer(n_neighbors=5)
test_imputed = pd.DataFrame(data=imputer.fit_transform(test), columns=test.columns)
train_imputed = pd.DataFrame(data=imputer.fit_transform(train), columns=train.columns)

# Absolute pairwise correlations of the imputed training columns.
corr_matrix = train_imputed.corr().abs()

# Columns whose correlation with the target is undefined (NaN) are removed from
# both frames (presumably constant columns - verify against the data).
corr_y = train_imputed.corrwith(train_imputed['y'])
undefined_corr_cols = corr_y.loc[corr_y.isna()].index.tolist()
train_clean = train_imputed.drop(columns=undefined_corr_cols)
test_clean = test_imputed.drop(columns=undefined_corr_cols)

In [None]:
# Target vector and predictor matrices; the 'id' column is excluded from the
# predictors of both the training and the test frame.
y = train_clean['y']
X = train_clean.drop(columns=['id', 'y'])
X_test = test_clean.drop(columns='id')

In [None]:
# Standardise using statistics learned from the training predictors only, then
# apply the same transformation to the test predictors.
scaler = StandardScaler()
scaler.fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

In [None]:
# Preliminary CatBoost model on all scaled predictors with a log-transformed
# target; its feature importances drive the feature selection further down.
catboost = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    reg_lambda=0,
    subsample=0.5,
)
catboost.fit(X_scaled, np.log(y))

### Feature Selection: CatBoost

In [None]:
# Pair every predictor name with its CatBoost importance (the model was fitted
# on X_scaled, whose columns follow X.columns) and rank from most to least
# important.
predictor = pd.Series(X.columns, name='predictor')
rel_importance = pd.Series(catboost.feature_importances_, name='importance')
importance = (
    predictor.to_frame()
    .join(rel_importance)
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Keep only predictors to which CatBoost assigned a strictly positive importance.
catboost_features = importance.loc[importance['importance'] > 0, 'predictor'].tolist()

In [None]:
# Restrict the (unscaled) training and test frames to the CatBoost-selected columns.
X_catboost = X.loc[:, catboost_features]
test_catboost = test_clean.loc[:, X_catboost.columns]

In [None]:
# 80/20 hold-out split of the CatBoost-selected predictors.
# The hold-out parts are named *_val instead of *_test so that `X_test` (the
# test-set predictors used by the scaling cell) is not overwritten; otherwise
# re-running the scaling cell after this one would scale the wrong frame.
X_train, X_val, y_train, y_val = train_test_split(
    X_catboost, y, test_size=0.2, random_state=1
)

In [None]:
# Randomised hyper-parameter search for CatBoost on the selected predictors,
# scored by 5-fold cross-validated R-squared (the regressor's default score).
start_time = time.time()

# NOTE(review): CatBoost documents num_leaves (max_leaves) as a Lossguide
# grow-policy option - confirm that these candidates are accepted as configured.
param_grid = {'max_depth': [4, 6, 8],
              'num_leaves': [20, 31, 40],
              'learning_rate': [0.01, 0.05, 0.1],
              'reg_lambda': [0, 10, 100],
              'n_estimators': [100, 500, 1000],
              'subsample': [0.5, 0.75, 1.0]}

cv = KFold(n_splits=5, shuffle=True, random_state=1)
optimal_params = RandomizedSearchCV(estimator=CatBoostRegressor(random_state=1, verbose=False),
                                    param_distributions=param_grid, n_iter=200,
                                    verbose=1, random_state=1,
                                    n_jobs=-1,
                                    cv=cv)

# Tune on log(y): the final CatBoost model and the LightGBM search are both
# fitted on the log-transformed target, so the search must optimise the same
# objective rather than the raw target.
optimal_params.fit(X_catboost, np.log(y))
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")

In [None]:
# CatBoost model with hard-coded hyper-parameters, fitted on the CatBoost-selected
# predictors against the log-transformed target.
catboost_final = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    reg_lambda=0,
    subsample=0.5,
)
catboost_final = catboost_final.fit(X_catboost, np.log(y))


### Feature Selection: MARS

In [None]:
# Additive (max_degree=1) MARS model on the CatBoost-selected predictors with a
# log-transformed target; RSS-based feature importances are requested so the
# model can be used for a second round of feature selection.
mars = Earth(
    max_terms=500,
    max_degree=1,
    feature_importance_type='rss',
)
mars.fit(X_catboost, np.log(y))

In [None]:
# Pair every predictor with its MARS importance and rank from most to least
# important. `mars` was fitted on X_catboost, so the importances follow
# X_catboost.columns (not X.columns, which has a different order and length);
# labelling them with X.columns would attach importances to the wrong predictors.
predictor = pd.Series(X_catboost.columns, name='predictor')
rel_importance = pd.Series(mars.feature_importances_, name='importance')
importance = pd.concat([predictor, rel_importance], axis=1).sort_values(by='importance', ascending=False)

In [None]:
# Keep only predictors with a strictly positive MARS importance.
mars_pred = importance.loc[importance['importance'] > 0, 'predictor'].tolist()

In [None]:
# Restrict the (unscaled) training and test frames to the MARS-selected columns.
X_mars = X.loc[:, mars_pred]
test_mars = test_clean.loc[:, X_mars.columns]

In [None]:
# Additive MARS model refitted on the MARS-selected predictors (log target).
mars_final = Earth(
    max_terms=500,
    max_degree=1,
)
mars_final.fit(X_mars, np.log(y))

In [None]:
# 80/20 split of the MARS-selected predictors (seeded for repeatability).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_mars,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Mean difference between the observed target and the back-transformed MARS
# predictions on the X_test1 rows. mars_final was fitted on all of X_mars, so
# these rows were part of its training data.
intercept = (y_test1 - np.exp(mars_final.predict(X_test1))).mean()
intercept

3.1413378601503497

### Hyperparameter Tuning: LightGBM (on MARS-selected features)

In [None]:
# K-fold cross validation to find optimal parameters for the LightGBM regressor.
# The search runs on the MARS-selected predictors against the log-transformed
# target and is scored by 5-fold cross-validated R-squared.
start_time = time.time()
param_grid = {'max_depth': [4, 6, 8],
              'num_leaves': [20, 31, 40],
              'learning_rate': [0.01, 0.05, 0.1],
              'reg_lambda': [0, 10, 100],
              'n_estimators': [100, 500, 1000],
              'reg_alpha': [0, 10, 100],
              'subsample': [0.5, 0.75, 1.0],
              'colsample_bytree': [0.5, 0.75, 1.0]}

cv = KFold(n_splits=5, shuffle=True, random_state=1)
# random_state fixes which 200 candidates are sampled; without it every run
# draws a different subset, so the reported optimum could not be reproduced
# (the CatBoost search in this notebook is seeded the same way).
optimal_params = RandomizedSearchCV(estimator=LGBMRegressor(random_state=1),
                                    param_distributions=param_grid, n_iter=200,
                                    verbose=1, random_state=1,
                                    n_jobs=-1,
                                    cv=cv)
optimal_params.fit(X_mars, np.log(y))
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Optimal parameter values = {'subsample': 0.75, 'reg_lambda': 0, 'reg_alpha': 0, 'num_leaves': 31, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.5}
Optimal cross validation R-squared =  0.2765937476512421
Time taken =  1  minutes


In [None]:
# LightGBM model with hard-coded hyper-parameters, fitted on the MARS-selected
# predictors against the log-transformed target.
lgbm_mars = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    num_leaves=31,
    reg_alpha=0,
    reg_lambda=0,
    subsample=0.75,
    colsample_bytree=0.5,
)
lgbm_mars = lgbm_mars.fit(X_mars, np.log(y))

In [None]:
# Stack CatBoost, LightGBM and MARS with an ElasticNetCV meta-learner, using
# seeded 5-fold CV to build the meta-learner's training predictions.
# NOTE(review): StackingRegressor refits clones of the base estimators on the X
# given to fit(), so all three learners are retrained here on every column of
# X_scaled rather than on X_catboost / X_mars - confirm this is intended.
en = StackingRegressor(
    estimators=[
        ('catboost_final', catboost_final),
        ('lgbm', lgbm_mars),
        ('mars_final', mars_final),
    ],
    final_estimator=ElasticNetCV(),
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
)
en.fit(X_scaled, np.log(y))

In [None]:
# Residuals of the stacked model on the log scale, computed on the same rows
# the model was fitted on.
res = np.log(y).sub(en.predict(X_scaled))

In [None]:
# Second stacked ensemble with the same base learners and meta-learner, fitted
# to the log-scale residuals `res` of the first ensemble.
en_res = StackingRegressor(
    estimators=[
        ('catboost_final', catboost_final),
        ('lgbm', lgbm_mars),
        ('mars_final', mars_final),
    ],
    final_estimator=ElasticNetCV(),
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
)
en_res.fit(X_scaled, res)

In [None]:
# Final prediction: add the residual model's output to the main model's output
# on the log scale, back-transform with exp, and multiply by 1.27.
# NOTE(review): the origin of the 1.27 multiplier is not shown in this notebook.
pred = pd.DataFrame(
    {
        'id': test['id'],
        'y': np.exp(en.predict(X_test_scaled) + en_res.predict(X_test_scaled)) * 1.27,
    }
).set_index(['id'])
pred.to_csv('Witarsa_Ashley.csv')