### Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

#### Testing Dataset

In [2]:
test_data = pd.read_csv("TEST.csv")

#### Training Dataset

In [3]:
data = pd.read_csv("TRAIN.csv")
data.shape

(8000, 5)

Out of the 5 columns, the first four are our features and the last column is our target column/label.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      8000 non-null   float64
 1   V       8000 non-null   float64
 2   AP      8000 non-null   float64
 3   RH      8000 non-null   float64
 4   PE      8000 non-null   float64
dtypes: float64(5)
memory usage: 312.6 KB


So, there are no columns with missing values.

In [12]:
data.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,8000.0,8000.0,8000.0,8000.0,8000.0
mean,19.677299,54.302629,1013.235534,73.260478,454.310364
std,7.448395,12.687358,5.920858,14.588452,17.056199
min,2.34,25.36,993.11,25.56,420.26
25%,13.51,41.74,1009.07,63.34,439.7275
50%,20.32,52.08,1012.95,74.88,451.495
75%,25.77,66.54,1017.19,84.7,468.4
max,37.11,81.56,1033.3,100.16,495.76


In [5]:
import pywedge as pw
# Hand the train and test frames to pywedge, naming 'PE' as the target column.
ppd = pw.Pre_process_data(
    data,
    test_data,
    y='PE',
    c=None,
    type="Regression",
)

In [6]:
# NOTE: dataframe_clean() is interactive. Per the recorded output it prompts for a
# categorical-encoding choice and a scaler choice, so the returned frames depend
# on what is typed at run time.
new_X, new_y, new_test = ppd.dataframe_clean()

Reading the datasets...
******************************************

Train Dataframe summary...
******************************************

Your selected train dataframe has 5 columns and 8000 Rows.
There are 0 columns that have missing values.
Empty DataFrame
Columns: [Zero Values, Missing Values, % of Total Values, Total Zero Missing Values, % Total Zero Missing Values, Data Type]
Index: []
******************************************
Starting data cleaning...
******************************************
Do you want to use get_dummies or catcodes to convert categorical to numerical? 
	press 1 for catcodes - Quick info link - https://bit.ly/3lruqtf 
	press 2 for getdummies - Quick info link - https://bit.ly/3d76p7A 
1
Comleted categorical column transformation
******************************************
Do you want to standardize the data? 
	press 1 for Standard Scalar - Quick info link - https://bit.ly/2GPyG6w 
	press 2 for Robust Scalar - Quick info link - https://bit.ly/3jFNCD5 
	press 3

In [7]:
# Alias the cleaned pywedge outputs under the names used by the modelling cells.
X_train, y_train, X_test = new_X, new_y, new_test

In [8]:
blm = pw.baseline_model(X_train,y_train)

In [9]:
# Interactive: prompts for a hold-out test size (0.2 was entered in the recorded
# run) and then prints a metrics line for each baseline model.
blm.Regression_summary()

Starting regression summary...
TOP 10 FEATURE IMPORTANCE TABLE
AT    0.540130
V     0.227487
RH    0.157158
AP    0.075224
dtype: float64
Please enter test size: (for eg. please enter 0.20 for 20% test size): 
	0.2
--------------------------LINEAR MODELS---------------------------------
Linear Reg     | exp_var = 0.927 | mae = 3.626 | rmse = 1.904 | r2 = 0.927 | Train time = 1.391s | Pred. time = 0.000s
KNN            | exp_var = 0.943 | mae = 2.979 | rmse = 1.726 | r2 = 0.942 | Train time = 0.031s | Pred. time = 0.016s
LinearSVR      | exp_var = 0.914 | mae = 5.371 | rmse = 2.317 | r2 = 0.844 | Train time = 0.405s | Pred. time = 0.000s
Lasso          | exp_var = 0.927 | mae = 3.632 | rmse = 1.906 | r2 = 0.927 | Train time = 0.094s | Pred. time = 0.000s
Ridge          | exp_var = 0.927 | mae = 3.626 | rmse = 1.904 | r2 = 0.927 | Train time = 0.126s | Pred. time = 0.000s
------------------------NON LINEAR MODELS----------------------------------
---------------------THIS MIGHT TAKE A WH

In the Dockship challenge, RMSE is the metric used for rankings. From the results above, we can see that the RMSE values for the XGBoost Regressor and the CatBoost Regressor are the lowest. Hence, we will use these two algorithms to train our model.
We can also tune the hyperparameters of these algorithms to get more refined results.

### Using XGBoost Regressor

In [47]:
from xgboost import XGBRegressor

In [500]:
# Hold out 20% of the training rows for validation, with a fixed seed.
X_t, X_v, Y_t, Y_v = train_test_split(
    X_train.values,
    y_train.values,
    test_size=0.2,
    random_state=42,
)

In [49]:
# XGBoost regressor: small learning rate paired with a large number of rounds,
# plus row and column subsampling.
xgr = XGBRegressor(
    learning_rate=0.01,
    n_estimators=10600,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=0.1,
    scale_pos_weight=1,
    random_state=1,
)

In [50]:
xg_model = xgr.fit(X_t,Y_t)

In [51]:
XG_predict = xg_model.predict(X_v)

In [52]:
# Validation RMSE for XGBoost. Arguments follow sklearn's documented order,
# mean_squared_error(y_true, y_pred); the value is unchanged because MSE is symmetric.
print(np.sqrt(mean_squared_error(Y_v, XG_predict)))

3.048431025264155


In [53]:
prediction = xg_model.predict(X_test.values)

In [54]:
pred_df  = pd.DataFrame(prediction,columns=['Value'])

In [116]:
pred_df.to_csv("XGBPredictions.csv",index=True,index_label='Index')

#### Random Forest

In [506]:
from sklearn.ensemble import RandomForestRegressor

In [507]:
# Random forest configuration; one keyword per line so each setting is easy to tune.
rf = RandomForestRegressor(
    n_estimators=450,
    max_depth=23,
    max_samples=5000,
    max_features=2,
    min_samples_split=4,
    max_leaf_nodes=1370,
    min_samples_leaf=1,
    random_state=42,
)

In [508]:
rf_model = rf.fit(X_t,Y_t)

In [509]:
rf_predict = rf_model.predict(X_v)

In [510]:
# Validation RMSE for the random forest. Arguments follow sklearn's documented order,
# mean_squared_error(y_true, y_pred); the value is unchanged because MSE is symmetric.
print(np.sqrt(mean_squared_error(Y_v, rf_predict)))

3.4281066997082186


In [60]:
RF_predict = rf_model.predict(X_test.values)

In [61]:
rfpred_df  = pd.DataFrame(RF_predict,columns=['Value'])

In [119]:
rfpred_df.to_csv("RFPredictions.csv",index=True,index_label='Index')

### Stacked Regressor

In [62]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

In [237]:
# 10 shuffled folds with a fixed seed; cv_rmse passes this object as `cv`.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

def rmse(y, y_pred):
    """Return the root-mean-squared error between targets ``y`` and predictions ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=None, y=None):
    """Return the cross-validated RMSE of ``model``, one value per fold of ``kfolds``.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X : array-like, optional
        Feature matrix; defaults to the current value of ``X_t``.
    y : array-like, optional
        Targets; defaults to the current value of ``Y_t``.

    Returns
    -------
    Array of per-fold RMSE values.
    """
    # Resolve the defaults at call time. A signature default of ``X=X_t`` is frozen
    # when the function is defined and can fall out of step with ``Y_t`` if the
    # train/validation split is re-run afterwards.
    if X is None:
        X = X_t
    if y is None:
        y = Y_t
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds)
    # The scorer returns the negated MSE, so flip the sign before taking the root.
    # The local is not called ``rmse`` so it does not shadow the helper of that name.
    return np.sqrt(-neg_mse)

In [238]:
# First stack: RF and XGB as base regressors, RF as the meta-regressor.
stack_gen = StackingCVRegressor(
    regressors=(rf, xgr),
    meta_regressor=rf,
    store_train_meta_features=True,
    use_features_in_secondary=True,
    random_state=42,
)

In [362]:
# Second stack: the first stack itself plus RF and XGB as base regressors,
# again with RF as the meta-regressor.
stack_gen_1 = StackingCVRegressor(
    regressors=(stack_gen, rf, xgr),
    meta_regressor=rf,
    store_train_meta_features=True,
    use_features_in_secondary=True,
    random_state=42,
)

In [307]:
# Print mean (std) of the cross-validated RMSE for each base model, in turn.
for _model, _template in (
    (xgr, "xgboost: {:.4f} ({:.4f})\n"),
    (rf, "random forest: {:.4f} ({:.4f})\n"),
):
    score = cv_rmse(_model)
    print(_template.format(score.mean(), score.std()))

KeyboardInterrupt: 

In [363]:
print('stack_gen')
stack_model = stack_gen.fit(X_t,Y_t)

stack_gen


In [364]:
print('stack_gen_1')
stack_model_1 = stack_gen_1.fit(X_t,Y_t)

stack_gen_1


In [497]:
def blend_models(X):
    """Return a weighted blend of the fitted models' predictions for ``X``.

    Weights: 0.25 for ``xg_model``, 0.6 for ``stack_model_1`` and 0.15 for
    ``stack_model``.
    """
    xgb_term = 0.25 * xg_model.predict(X)
    stack_1_term = 0.6 * stack_model_1.predict(X)
    stack_term = 0.15 * stack_model.predict(X)
    # Summed left to right, in the same order as the three terms above.
    return xgb_term + stack_1_term + stack_term

In [498]:
blend_v = blend_models(X_v)

In [499]:
# blend_v was predicted on the held-out validation split (X_v), not on the
# competition test set, so the printed label says "validation".
print('RMSE score on validation data:')
print(rmse(Y_v, blend_v))

RMSE score on test data:
2.984267274674939


In [493]:
# Blend predictions for the test features and write them out with an 'Index' column.
blendpredict = blend_models(X_test.values)
blendpred_df = pd.DataFrame(data=blendpredict, columns=['Value'])
blendpred_df.to_csv(
    "BP.csv",
    index=True,
    index_label='Index',
)

### LightGBM Regressor

In [543]:
from lightgbm import LGBMRegressor

In [554]:
# LightGBM regressor (learning_rate=0.2) fitted on the training split;
# the return value of fit() is kept as `lgbm_model`.
lgbm = LGBMRegressor(learning_rate=0.2)
lgbm_model = lgbm.fit(X_t,Y_t)

In [547]:
# IPython-only help lookup left over from interactive exploration (not plain Python).
lgbm_model?

In [555]:
LG_predict = lgbm_model.predict(X_v)

In [556]:
# Validation RMSE for LightGBM. Arguments follow sklearn's documented order,
# mean_squared_error(y_true, y_pred); the value is unchanged because MSE is symmetric.
print(np.sqrt(mean_squared_error(Y_v, LG_predict)))

3.3691407234401733
