In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import itertools as it
from sklearn.preprocessing import StandardScaler, normalize
import warnings

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer, SimpleImputer
from scipy.stats import spearmanr

from sklearn.linear_model import LinearRegression,LogisticRegression, Lasso, LassoCV, RidgeCV, ElasticNetCV
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor, GradientBoostingRegressor, BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from catboost import CatBoostRegressor
from pyearth import Earth

In [93]:
train = pd.read_csv('/content/train.csv')
train.shape

(5380, 767)

In [94]:
test = pd.read_csv('/content/test.csv')
test.shape

(4403, 766)

### Missing Values

In [95]:
num_missing = train.isna().sum().sum()
print(num_missing)

num_cols_missing = (train.isna().any()).sum()
print(num_cols_missing)

num_rows_missing = (train.isna().any(axis=1)).sum()
print(num_rows_missing)

29911
473
2523


In [96]:
y_train = train['y']
X_train = train.drop(['y', 'id'], axis = 1)

In [97]:
col_id = test['id']
X_test = test.drop(['id'], axis = 1)

In [98]:
imputer = KNNImputer(n_neighbors = 5)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns = X_test.columns)

## Feature Selection

In [99]:
selection0 = VarianceThreshold(threshold=0)
selection0.fit(X_train)
X_train = selection0.transform(X_train)
X_train.shape

(5380, 754)

In [100]:
X_test = selection0.transform(X_test)
X_test.shape

(4403, 754)

In [101]:
selection_low = VarianceThreshold(threshold=0.01)
selection_low.fit(X_train)
X_train = pd.DataFrame(selection_low.transform(X_train))
X_train.shape

(5380, 707)

In [102]:
X_test = pd.DataFrame(selection_low.transform(X_test))
X_test.shape

(4403, 707)

In [129]:
model_MARS_FI = Earth(max_terms=500, max_degree=3, feature_importance_type='rss')
model_MARS_FI.fit(X_train, y_train)

In [130]:
predictors = pd.Series(X_train.columns, name='predictor')
importances = pd.Series(model_MARS_FI.feature_importances_, name='importance')
MARS_FI = pd.concat([predictors, importances], axis=1).sort_values(by='importance', ascending=False)
MARS_FI[MARS_FI['importance']>0]

Unnamed: 0,predictor,importance
523,523,58810.5
34,34,58810.5
287,287,0.8951502
190,190,0.8951502
489,489,1.164153e-10
674,674,1.164153e-10


In [104]:
predictors = pd.Series(X_train.columns, name='predictor')
importances = pd.Series(model_MARS_FI.feature_importances_, name='importance')
MARS_FI = pd.concat([predictors, importances], axis=1).sort_values(by='importance', ascending=False)
MARS_FI[MARS_FI['importance']>0]

In [105]:
MARS_FI[MARS_FI['importance']>0]

Unnamed: 0,predictor,importance
92,92,110720.24204
108,108,44753.724717
523,523,33746.759733
332,332,33291.599983
236,236,23771.768063
98,98,19362.061771
552,552,14020.262608
41,41,9640.28547
558,558,8403.037025
135,135,7496.63249


In [133]:
X_train1 = X_train[[92, 108, 523, 332, 236, 98, 552, 41, 558, 135, 346, 529, 3, 639, 345, 274, 694, 13, 393, 211, 444, 190, 452]]
X_test1 = X_test[[92, 108, 523, 332, 236, 98, 552, 41, 558, 135, 346, 529, 3, 639, 345, 274, 694, 13, 393, 211, 444, 190, 452]]

## Modeling

In [134]:
model_CAT = CatBoostRegressor().fit(X_train1, y_train)

Learning rate set to 0.053413
0:	learn: 11.8727552	total: 2.31ms	remaining: 2.31s
1:	learn: 11.7264202	total: 4.32ms	remaining: 2.16s
2:	learn: 11.5978286	total: 6.13ms	remaining: 2.04s
3:	learn: 11.4702482	total: 7.99ms	remaining: 1.99s
4:	learn: 11.3532935	total: 9.85ms	remaining: 1.96s
5:	learn: 11.2487123	total: 11.7ms	remaining: 1.93s
6:	learn: 11.1305878	total: 13.5ms	remaining: 1.92s
7:	learn: 11.0310448	total: 15.5ms	remaining: 1.92s
8:	learn: 10.9303405	total: 17.4ms	remaining: 1.91s
9:	learn: 10.8455268	total: 19.2ms	remaining: 1.9s
10:	learn: 10.7573971	total: 21.1ms	remaining: 1.9s
11:	learn: 10.6762380	total: 23ms	remaining: 1.89s
12:	learn: 10.5930329	total: 24.8ms	remaining: 1.88s
13:	learn: 10.5174240	total: 26.7ms	remaining: 1.88s
14:	learn: 10.4424323	total: 28.5ms	remaining: 1.87s
15:	learn: 10.3706144	total: 30.4ms	remaining: 1.87s
16:	learn: 10.2971672	total: 32.1ms	remaining: 1.85s
17:	learn: 10.2318395	total: 33.8ms	remaining: 1.84s
18:	learn: 10.1772090	total: 3

In [135]:
model_ENET = ElasticNetCV(alphas=[0, 0.25, 0.5, 0.75, 1]).fit(X_train1, y_train)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [136]:
model_ensemble= StackingRegressor(estimators = [('CAT', model_CAT),
                                                ('ENET', model_ENET)],
                      final_estimator = LassoCV(),
                      cv = KFold(n_splits = 5, shuffle = True, random_state=1))
model_ensemble.fit(X_train1, y_train)

Learning rate set to 0.053413
0:	learn: 11.8727552	total: 2.49ms	remaining: 2.49s
1:	learn: 11.7264202	total: 4.58ms	remaining: 2.29s
2:	learn: 11.5978286	total: 6.67ms	remaining: 2.22s
3:	learn: 11.4702482	total: 8.82ms	remaining: 2.19s
4:	learn: 11.3532935	total: 10.7ms	remaining: 2.13s
5:	learn: 11.2487123	total: 12.4ms	remaining: 2.05s
6:	learn: 11.1305878	total: 14.1ms	remaining: 2.01s
7:	learn: 11.0310448	total: 15.8ms	remaining: 1.95s
8:	learn: 10.9303405	total: 17.4ms	remaining: 1.92s
9:	learn: 10.8455268	total: 19.1ms	remaining: 1.89s
10:	learn: 10.7573971	total: 20.9ms	remaining: 1.88s
11:	learn: 10.6762380	total: 22.5ms	remaining: 1.85s
12:	learn: 10.5930329	total: 24.2ms	remaining: 1.84s
13:	learn: 10.5174240	total: 26ms	remaining: 1.83s
14:	learn: 10.4424323	total: 27.9ms	remaining: 1.83s
15:	learn: 10.3706144	total: 29.9ms	remaining: 1.84s
16:	learn: 10.2971672	total: 31.7ms	remaining: 1.83s
17:	learn: 10.2318395	total: 33.6ms	remaining: 1.83s
18:	learn: 10.1772090	total:

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4:	learn: 11.0968201	total: 9.86ms	remaining: 1.96s
5:	learn: 11.0017150	total: 12ms	remaining: 1.99s
6:	learn: 10.8962024	total: 14.1ms	remaining: 2s
7:	learn: 10.7913442	total: 16ms	remaining: 1.98s
8:	learn: 10.6917801	total: 17.8ms	remaining: 1.96s
9:	learn: 10.5885273	total: 19.7ms	remaining: 1.95s
10:	learn: 10.5037805	total: 21.5ms	remaining: 1.93s
11:	learn: 10.4341050	total: 23.2ms	remaining: 1.91s
12:	learn: 10.3628800	total: 25.1ms	remaining: 1.91s
13:	learn: 10.2960226	total: 26.6ms	remaining: 1.88s
14:	learn: 10.2235506	total: 28.2ms	remaining: 1.85s
15:	learn: 10.1566080	total: 30ms	remaining: 1.84s
16:	learn: 10.0969688	total: 31.9ms	remaining: 1.84s
17:	learn: 10.0326423	total: 33.4ms	remaining: 1.82s
18:	learn: 9.9734878	total: 35.1ms	remaining: 1.81s
19:	learn: 9.9177865	total: 36.7ms	remaining: 1.8s
20:	learn: 9.8672936	total: 38.7ms	remaining: 1.8s
21:	learn: 9.8178579	total: 40.4ms	remaining: 1.79s
22

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinat

In [137]:
model_ensemble.final_estimator_.coef_

array([0.82738383, 0.26690669])

## Prediction

In [138]:
pred = model_ensemble.predict(X_test1)

pred_v50 = pd.DataFrame(index=range(X_test1.shape[0]), columns=range(2))
pred_v50.columns = ['id', 'y']
pred_v50['id'] = col_id
pred_v50['y'] = pred
pred_v50

Unnamed: 0,id,y
0,5380,5.979268
1,5381,7.710547
2,5382,5.236843
3,5383,6.199842
4,5384,5.158988
...,...,...
4398,9778,5.119328
4399,9779,10.463923
4400,9780,10.267421
4401,9781,14.638503


In [139]:
pred_v50.to_csv('pred_v50.csv', index=True)