# Hey Everyone !

### In this notebook we'll go through the approach I used for the 30 Days of ML challenge!

### We'll go through EDA, Feature Engineering and model building 
## Let's get started !

In [None]:
import numpy as np
import pandas as pd

In [None]:
## Read the competition's train and test tables from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/30-days-of-ml/train.csv',
        '../input/30-days-of-ml/test.csv',
    )
)

In [None]:
# Summary statistics of the training table.
train_data.describe()

In [None]:
# Column names, dtypes and non-null counts of the training table.
train_data.info()

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Number of missing values in each test column.
test_data.isnull().sum()

In [None]:
from sklearn.model_selection import KFold

# Tag every training row with the id (0-4) of the fold in which it is held out
# for validation. Rows start at -1 so an unassigned row would be easy to spot.
train_data['fold'] = -1
splitter = KFold(n_splits=5, shuffle=True, random_state=42)
# split() yields positional row indices; they are used as .loc labels here, which
# matches only because train_data still has the default 0..n-1 index from read_csv.
for fold_id, (_, held_out_rows) in enumerate(splitter.split(train_data)):
    train_data.loc[held_out_rows, 'fold'] = fold_id

In [None]:
# Number of rows assigned to each fold id.
train_data.fold.value_counts()

# EDA

### For this problem we don't have much to explore, but let's see what we're dealing with


In [None]:
# Register pandas' converters with matplotlib, then pull in the plotting libraries used below.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

### Plotting continuous features

In [None]:
# Scatter every continuous feature (cont0..cont13) against the target, one panel
# per feature. Panels are filled in row-major order and each carries its feature
# name, so a panel can be identified without reading the code.
cont_cols = ['cont' + str(i) for i in range(14)]

figure, axis = plt.subplots(5, 3, figsize=(20, 25))
panels = axis.ravel()

for ax, col in zip(panels, cont_cols):
    ax.scatter(x=train_data[col], y=train_data['target'], s=.5)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('target')

# The 5x3 grid has 15 slots but there are only 14 features: hide the spare
# axes instead of leaving an empty frame in the figure.
for ax in panels[len(cont_cols):]:
    ax.set_visible(False)

figure.tight_layout()


In [None]:
# Distribution of the regression target in the training data.
sns.histplot(data=train_data,x='target')

In [None]:
# Print how often each level occurs for every categorical column cat0..cat9.
for cat_col in (f'cat{idx}' for idx in range(10)):
    level_counts = train_data[cat_col].value_counts()
    print(level_counts)

### Plotting categorical features

In [None]:
# Scatter the target against every categorical feature (cat0..cat9), one panel
# per feature, each labelled with its column name.
cat_cols = ['cat' + str(i) for i in range(10)]

figure, axis = plt.subplots(3, 4, figsize=(12, 9))
panels = axis.ravel()

for ax, col in zip(panels, cat_cols):
    ax.scatter(x=train_data[col], y=train_data['target'], s=5)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('target')

# The 3x4 grid has 12 slots but there are only 10 features: hide the two spare
# axes instead of leaving empty frames in the figure.
for ax in panels[len(cat_cols):]:
    ax.set_visible(False)

figure.tight_layout()


### It's nice that we don't need to deal with null values!


# Feature Engineering

In [None]:
# Disabled experiment: drop cont8, cont9 and cont10 from both frames.
# NOTE(review): traindf/testdf are only created in the following cell, so these
# lines would have to run after that cell if they are ever re-enabled.
# traindf=traindf.drop(columns=['cont8','cont9','cont10'])
# testdf=testdf.drop(columns=['cont8','cont9','cont10'])

In [None]:
# Work on copies so the raw train_data / test_data frames stay untouched.
traindf = train_data.copy()
testdf = test_data.copy()

# Ordinal-encode cat0..cat9: each level is replaced by its position in the
# sorted list of levels seen in the TRAINING data, and the same codes are
# applied to the test data.
for i in range(10):
    col = 'cat' + str(i)
    # Dict lookup instead of list.index(): constant-time per row rather than a
    # linear search through the level list for every single value.
    codes = {level: code for code, level in enumerate(sorted(train_data[col].unique()))}

    # A level that only exists in the test set has no code. Fail loudly (same
    # exception type list.index() raised) but say which column and levels.
    unseen = set(testdf[col].unique()) - set(codes)
    if unseen:
        raise ValueError(
            "Column " + col + " has test categories not present in train: " + repr(sorted(unseen))
        )

    traindf[col] = traindf[col].map(codes)
    testdf[col] = testdf[col].map(codes)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# encoded training frame.
corr_matrix = traindf.corr()
heat_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='rocket',
    linewidths=0.2,
    annot_kws={'size': 3},
)
# Resize after drawing, as the heatmap was created on the current default figure.
fig = heat_ax.get_figure()
fig.set_size_inches(14, 10)
for set_tick_style in (plt.xticks, plt.yticks):
    set_tick_style(fontsize=14)
plt.show()

# Model Training

### I have already run three rounds of hyper-parameter tuning on XGBoost, which took a couple of hours on CPU!

In [None]:
# cat_cols=[column for column in traindf.columns if 'cat' in column]
# cont_cols=[column for column in traindf.columns if 'cont' in column]
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from xgboost import XGBRegressor

# Model inputs are all columns except the label, the row identifier and the
# cross-validation fold tag; column order follows traindf.
non_feature_cols = ('target', 'id', 'fold')
useful_cols = [name for name in traindf.columns if name not in non_feature_cols]

X = traindf[useful_cols]
y = traindf['target']

In [None]:
# Three XGBoost hyper-parameter sets, one per model in the blend. Each is meant to
# be unpacked into the regressor constructor, e.g. XGBRegressor(**params1).
params1 = dict(
    n_estimators=10000,
    max_depth=2,
    learning_rate=0.074,
    gamma=0.4,
    booster='gbtree',
    min_child_weight=1,
    subsample=0.7912492436244456,
    colsample_bytree=0.1613480080803224,
    reg_alpha=12.65778876193281,
    reg_lambda=50.25603582806218,
)

params2 = dict(
    n_estimators=10000,
    max_depth=2,
    learning_rate=0.35,
    booster='gbtree',
    subsample=0.93,
    colsample_bytree=0.85,
    reg_alpha=35,
    reg_lambda=35,
    n_jobs=-1,
)

params3 = dict(
    n_estimators=10000,
    max_depth=3,
    learning_rate=0.035,
    min_child_weight=6,
    subsample=0.92,
    colsample_bytree=0.11,
    reg_alpha=1.22,
    reg_lambda=36,
)

# Disabled single-split fit, kept for reference. X_train / y_train / X_val / y_val
# are not defined at this point in the notebook.
# model1 = XGBRegressor(**params1)
# model2 = XGBRegressor(**params2)
# model1.fit(X_train,y_train, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=1000)
# model2.fit(X_train,y_train, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=1000)

In [None]:
## Loading the saved blends
# Reads two precomputed prediction tables from a separate Kaggle dataset.
# Presumably they were written by the commented-out training loop below in an
# earlier session (the file names match its to_csv calls) — TODO confirm.

blend=pd.read_csv('../input/30-days-blend/blend (1).csv')
blend_test=pd.read_csv('../input/30-days-blend/blend_test (1).csv')

# --- Disabled: training loop that builds `blend` and `blend_test` ---
# For each of the 5 folds it fits three XGBRegressor models (params1..params3)
# with early stopping on the held-out fold, then ADDS each model's predictions
# into the pred1/pred2/pred3 columns.
# - blend_test['predN'] therefore ends up as the SUM of 5 per-fold test
#   predictions, not their mean.
# NOTE(review): blend.loc[blend.fold!=i, ...] accumulates predictions on the rows
#   each model was trained on (every row is added to 4 times), not out-of-fold
#   predictions — confirm this is intended before stacking on `blend`.
# NOTE(review): to_csv is called without index=False, so the row index is written
#   out as an extra column.
# predictions=[]
# blend=traindf.copy();
# blend['pred1']=0
# blend['pred2']=0
# blend['pred3']=0
# blend_test=testdf.copy();
# blend_test['pred1']=0
# blend_test['pred2']=0
# blend_test['pred3']=0
# for i in range(5):
#     train=traindf.loc[traindf.fold!=i]
#     val=traindf.loc[traindf.fold==i]
#     X_train=train[useful_cols]
#     y_train=train['target']
#     X_val=val[useful_cols]
#     y_val=val['target']
#     model1 = XGBRegressor(**params1,random_state=i)
#     model2 = XGBRegressor(**params2,random_state=i)
#     model3 = XGBRegressor(**params3,random_state=i)
# #     print(model1.predict(train))
#     model1.fit(X_train,y_train, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=0)
#     model2.fit(X_train,y_train, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=0)
#     model3.fit(X_train,y_train, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=0)
#     blend.loc[blend.fold!=i,'pred1']+=model1.predict(X_train)
#     blend.loc[blend.fold!=i,'pred2']+=model2.predict(X_train)
#     blend.loc[blend.fold!=i,'pred3']+=model3.predict(X_train)
#     blend_test['pred1']+=model1.predict(testdf[useful_cols])
#     blend_test['pred2']+=model2.predict(testdf[useful_cols])
#     blend_test['pred3']+=model3.predict(testdf[useful_cols])
    
#     preds=(model1.predict(X_val)+model2.predict(X_val)+model3.predict(X_val))/3
#     print("Fold " + str(i),mean_squared_error(y_val,preds,squared=False))
# #     predictions.append((model1.predict(testdf[useful_cols])+model2.predict(testdf[useful_cols])+model3.predict(testdf[useful_cols]))/3)
# blend.to_csv('./blend.csv')
# blend_test.to_csv('./blend_test.csv')

In [None]:
# Preview the first rows only; the full test-prediction table is too large to be
# a useful cell output.
blend_test.head()

In [None]:
### Plain average of the three models' test predictions (the regression blend gave me better results ;)
# Presumably each predN column holds a SUM over the 5 CV folds (the disabled
# training loop adds to blend_test once per fold), so the mean is taken over
# 3 models x 5 folds = 15 predictions — TODO confirm against the saved CSV.
n_models, n_folds = 3, 5

pred_total = blend_test['pred1'] + blend_test['pred2'] + blend_test['pred3']
preds_test = pred_total / (n_models * n_folds)

In [None]:
# Pair each test id with its blended prediction and write the submission file.
# NOTE(review): ids come from testdf while predictions come from blend_test;
# this assumes both frames list the test rows in the same order — confirm.
submission_columns = {
    'id': testdf['id'],
    'target': preds_test,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_kfold.csv', index=False)

# Conclusion

### Here I used a simple average, but fitting a regression on the model predictions or choosing the blend weights more carefully gave me better results.