In this notebook, you will learn how to make your first submission to the **[Tabular Playground Series - Jan 2021](https://admin.kaggle.com/c/tabular-playground-series-jan-2021/overview)** competition. 

This notebook will help you get into the top 30% of solutions.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for name in files
)
for file_path in input_files:
    print(file_path)
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
        
# Directory from which train.csv, test.csv and sample_submission.csv are read below.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# Read in the data files

In [None]:
# Load the training data and preview its first rows.
train = pd.read_csv(input_path.joinpath('train.csv'))
display(train.head(5))

In [None]:
# Load the test data and preview its first rows.
test = pd.read_csv(input_path.joinpath('test.csv'))
display(test.head(5))

In [None]:
# Load the sample submission file and preview its first rows.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'))
display(submission.head(5))

In [None]:
### Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from  sklearn.model_selection import train_test_split , StratifiedKFold

from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV , StratifiedKFold , KFold
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor


# the scaler - for standardisation
from sklearn.preprocessing import StandardScaler

In [None]:
#### Basic details of the data files: shapes first, then per-column null counts
for label, frame in (('Train', train), ('Test', test), ('Submission', submission)):
    print(f'{label} data Shape : ', frame.shape)

for label, frame in (('Train', train), ('Test', test)):
    print('#########################################')
    print(f'Null Data Details - {label} data')
    print(frame.isnull().sum())

#### Observations
1. There is no null data in the train or test dataset.
2. All features are numerical

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would also emit a stray "None".
print('Train Data Set -- >')
train.info()
print('######################################')
print('Test Dataset ====> ')
test.info()

In [None]:
## Descriptive statistics, transposed so that each column of `train` becomes a row
train.describe().transpose()

##### Observation 
1. Some features have negative values
2. Most features have values ranging from 0 to 1 
3. Target feature has some records with 0 value. 

In [None]:
### Let's check the distribution of every column in the training data
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the replacement that draws the same kind of figure.
for column in train.columns:
    sns.histplot(train[column], kde=True, stat='density')
    plt.show()

In [None]:
# Model inputs are every training column except the target and the id column(s).
target = 'target'
Id_Cols = ['id']
features = [col for col in train.columns if col != target and col not in Id_Cols]

X, y = train[features], train['target']
X

In [None]:
#### Function to run an algorithm with K-fold cross-validation
def RunForAll(algo , k , train , test , features , params):
    """Fit ``algo`` on ``k`` consecutive folds and collect out-of-fold and test predictions.

    Every feature matrix handed to the model (training fold, validation fold
    and test set) goes through the same ``.abs()`` transform, so the model is
    evaluated and applied on data prepared exactly like the data it was fit on.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit(X, y, eval_set=[(X_val, y_val)], **params)`` and
        ``predict(X)``. The same object is refit on every fold, so after the
        call it holds whatever the last ``fit`` produced.
    k : int
        Number of folds, passed to ``KFold(n_splits=k)`` (no shuffling).
    train : pandas.DataFrame
        Training data; must contain ``features`` and a ``'target'`` column.
    test : pandas.DataFrame
        Test data; must contain ``features``.
    features : list of str
        Names of the columns used as model inputs.
    params : dict
        Extra keyword arguments forwarded to ``algo.fit``.

    Returns
    -------
    val_set_pred : numpy.ndarray
        Out-of-fold prediction for every training row, in row order.
    test_set_pred : numpy.ndarray
        Average of the ``k`` per-fold predictions on the test set.
    target : pandas.Series
        ``train['target']``, aligned row-for-row with ``val_set_pred``.
    """
    # Accumulators: one out-of-fold prediction per training row and the
    # fold-averaged prediction per test row.
    val_set_pred = np.zeros(len(train))
    test_set_pred = np.zeros(len(test))

    target = train['target']

    # Apply the absolute-value transform once, to ALL model inputs. Previously
    # only the training fold and the test set were transformed, so early
    # stopping, the fold score and the out-of-fold predictions were computed
    # on validation rows the model had not been prepared for.
    X_all = train[features].abs()
    X_test = test[features].abs()

    kf = KFold(n_splits=k)

    for fold_, (train_index, val_index) in enumerate(kf.split(train , target)):
        print(f'\n ---------------------Fold {fold_ + 1}-----------------')

        X_train , y_train = X_all.iloc[train_index] , target.iloc[train_index]
        X_val , y_val = X_all.iloc[val_index] , target.iloc[val_index]

        # The validation fold is supplied as eval_set so that fit-time options
        # in `params` (e.g. early stopping) can monitor it.
        algo.fit(X_train , y_train , eval_set = [(X_val , y_val)] , **params)

        prediction_val = algo.predict(X_val)

        kf_score = np.sqrt(mean_squared_error(y_val , prediction_val))
        print(f'\n Score For Validation Sample is {kf_score}')

        val_set_pred[val_index] = prediction_val

        # Each fold's model contributes 1/k of the final test prediction.
        test_set_pred += algo.predict(X_test) / k

    # RMSE over the complete set of out-of-fold predictions.
    val_score = np.sqrt(mean_squared_error(target,val_set_pred  ))
    print(f'\n Score for Validation set is {val_score}')

    return val_set_pred , test_set_pred , target

In [None]:
# XGBoost run: 5-fold cross-validation through RunForAll, which forwards
# `params` to fit() as keyword arguments.
target = 'target'
Id_Cols = ['id']
features = [col for col in train.columns if col not in [target] + Id_Cols]

xgb = XGBRegressor(learning_rate=0.011, n_estimators=4000)
params = dict(verbose=False, early_stopping_rounds=100)

xgb_val_pred, xgb_test_preds, y_val_xgb = RunForAll(xgb, 5, train, test, features, params)

In [None]:
# Bar chart of the XGBoost feature importances, smallest first. `xgb` is refit
# on every fold inside RunForAll, so these values come from its most recent fit.
coef1 = pd.Series(data=xgb.feature_importances_, index=features).sort_values()
coef1.plot.bar(title='Model Coefficient')
plt.show()

In [None]:
# Display the three values returned by the XGBoost run:
# out-of-fold predictions, averaged test predictions, and the training target.
(xgb_val_pred, xgb_test_preds, y_val_xgb)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM run: 5-fold cross-validation through RunForAll, which forwards
# `params` (logging period and early-stopping patience) to fit().
target = 'target'
Id_Cols = ['id']
features = [col for col in train.columns if col not in [target] + Id_Cols]

lgb = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    n_estimators=100000,
    learning_rate=0.01,
    num_leaves=200,
    max_depth=12,
    max_bin=200,
    min_child_samples=50,
    feature_fraction=0.6,
    bagging_fraction=0.4,
    bagging_freq=1,
    lambda_l1=2,
    lambda_l2=2,
    verbosity=-1,
)
params = dict(verbose=100, early_stopping_rounds=1000)

lgb_val_pred, lgb_test_preds, y_val_lgb = RunForAll(lgb, 5, train, test, features, params)

In [None]:
# Bar chart of the LightGBM feature importances, smallest first. `lgb` is refit
# on every fold inside RunForAll, so these values come from its most recent fit.
coef1 = pd.Series(data=lgb.feature_importances_, index=features).sort_values()
coef1.plot.bar(title='Model Coefficient')
plt.show()

In [None]:
from catboost import CatBoostRegressor

# CatBoost run: 5-fold cross-validation through RunForAll, which forwards
# `params` to fit() as keyword arguments.
target = 'target'
Id_Cols = ['id']
features = [col for col in train.columns if col not in [target] + Id_Cols]

cb = CatBoostRegressor(learning_rate=0.005, n_estimators=5000)
params = dict(verbose=False, early_stopping_rounds=100)

cb_val_pred, cb_test_preds, y_val_cb = RunForAll(cb, 5, train, test, features, params)

In [None]:
# Write the CatBoost-only submission. The frame is built directly from the test
# ids and the predictions, so the shared `test` frame is not modified.
output = pd.DataFrame({'id' : test['id'] , 'target' : cb_test_preds})
output.to_csv('submission_cb.csv' , index=False)
output.head()

###### Let's try an ensemble technique

In [None]:
# Out-of-fold predictions of the three models, one column per model.
ensemble_df = pd.DataFrame({
    'xgboost' : xgb_val_pred,
    'lgboost' : lgb_val_pred,
    'cbboost' : cb_val_pred,
})
ensemble_df.head()

In [None]:
# Fold-averaged test predictions of the three models, using the same column
# names and order as the out-of-fold frame.
ensemble_test_df = pd.DataFrame({
    'xgboost' : xgb_test_preds,
    'lgboost' : lgb_test_preds,
    'cbboost' : cb_test_preds,
})
ensemble_test_df.head()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear blender: regress the training target (as returned by the XGBoost run)
# on the three models' out-of-fold predictions, then report the score on the
# fitting data, the per-model weights and the intercept.
X, y = ensemble_df, y_val_xgb
reg = LinearRegression()
reg.fit(X, y)
print(reg.score(X, y), reg.coef_, reg.intercept_, sep='\n')

In [None]:
# Apply the fitted linear blender to the per-model test predictions and show the result.
ensemble_preds = reg.predict(X=ensemble_test_df)
ensemble_preds

In [None]:
# Write the linear-blend submission. The frame is built directly from the test
# ids and the predictions, so the shared `test` frame is not modified.
output = pd.DataFrame({'id' : test['id'] , 'target' : ensemble_preds})
output.to_csv('submission.csv' , index=False)
output.head()

In [None]:
# Write the CatBoost test predictions to submission_1.csv. The frame is built
# directly from the test ids and the predictions, so the shared `test` frame
# is not modified.
output = pd.DataFrame({'id' : test['id'] , 'target' : cb_test_preds})
output.to_csv('submission_1.csv' , index=False)
output.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree blender: fit a depth-5 tree on the out-of-fold predictions,
# then apply it to the per-model test predictions.
X, y = ensemble_df, y_val_xgb
dregr = DecisionTreeRegressor(max_depth=5).fit(X, y)
ensemble_preds_dr = dregr.predict(ensemble_test_df)


In [None]:
# Write the decision-tree-blend submission. The frame is built directly from
# the test ids and the predictions, so the shared `test` frame is not modified.
output = pd.DataFrame({'id' : test['id'] , 'target' : ensemble_preds_dr})
output.to_csv('submission_2.csv' , index=False)
output.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest blender fitted on the out-of-fold predictions and applied to
# the per-model test predictions.
X = ensemble_df
y = y_val_xgb
# A fixed random_state pins the forest's bootstrap sampling, so re-running the
# notebook reproduces the same blended predictions.
rfregr = RandomForestRegressor(n_estimators = 200 , max_depth=5 , min_samples_leaf=100 , n_jobs=4 ,
                               random_state=42)
rfregr.fit(X, y)
ensemble_preds_rf = rfregr.predict(ensemble_test_df)

In [None]:
# Write the random-forest-blend submission. The frame is built directly from
# the test ids and the predictions, so the shared `test` frame is not modified.
output = pd.DataFrame({'id' : test['id'] , 'target' : ensemble_preds_rf})
output.to_csv('submission_3.csv' , index=False)
output.head()