# TPS Feb 2021
Starter Notebook

## Deliverables
1. EDA
    - What's going on?
    - Show me the data...
2. Model
    - Baseline...
    - Simple...
    - Evaluation...
    - Improvement...
3. RAPIDS Bonus
    - Apply RAPIDS ([Starter Notebook](https://www.kaggle.com/tunguz/tps-feb-2021-rapids-starter))
    - Replace pandas with cuDF & sklearn with cuML
    
    
#### Troubleshooting
- [Data](https://www.kaggle.com/c/tabular-playground-series-feb-2021/data)
- [Overview](https://www.kaggle.com/c/tabular-playground-series-feb-2021/overview)
- [RF Starter Notebook](https://www.kaggle.com/warobson/tps-feb-2021-rf-starter)
- [ML repo on GitHub](https://github.com/gumdropsteve/intro_to_machine_learning)
- [Most simple RAPIDS Notebook submission](https://www.kaggle.com/warobson/simple-rapids-live) (Has stuff like `train_test_split()` with cuml..)
    
#### Load Data

In [None]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# RAPIDS Bonus [We solve the assignment both ways: RAPIDS and sklearn]

# RAPIDS Random Forest Model


In [None]:
import cudf

# Single base directory for the competition files, so all three reads resolve
# against the same location instead of mixing an absolute and a relative path.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

# Load the training set, the test set and the submission template onto the GPU.
train = cudf.read_csv(DATA_DIR + "/train.csv")
test = cudf.read_csv(DATA_DIR + "/test.csv")
sample_submission = cudf.read_csv(DATA_DIR + "/sample_submission.csv")

In [None]:
train.tail(3)

In [None]:
test.tail(3)

In [None]:
sample_submission.tail(3)

### Notes
1. To use sklearn with cuDF data, first convert the `cudf.Series` / `cudf.DataFrame` with `.to_pandas()`.
2. The cuDF API mirrors pandas: most code works the same with `cudf` in place of `pd`.

In [None]:
type(train)

In [None]:
type(train.to_pandas())

In [None]:
train.to_pandas()

In [None]:
train[['id', 'cat0', 'cat8', 'cont7']].to_pandas()

In [None]:
type(train.to_pandas())

In [None]:
y = train.target

y.to_pandas()

In [None]:
type(y.to_pandas())

# EDA

In [None]:
train.shape

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
# sns.pairplot(train.to_pandas().sample(100), hue='target');

In [None]:
# Correlate the numeric columns only: the categorical `cat*` columns hold
# strings, and recent pandas versions raise on them in .corr() instead of
# silently dropping them.
train.to_pandas().select_dtypes(include='number').corr().style.background_gradient(cmap='Blues')

In [None]:
# Histogram of the regression target; the column is converted to pandas before plotting.
sns.displot(train.to_pandas().target).set(title='Distribution of Target');

# Prepare Data for Modeling


## Change Float Type to Float32

In [None]:
train.columns

In [None]:
# Names of the continuous features plus the target. A plain list of names is
# enough for the loop below; there is no need to materialise a sub-DataFrame
# just to iterate over its column labels.
col_name = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
            'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
            'cont13', 'target']

# Cast each listed column to float32 in place.
for i in col_name:
    train[i] = train[i].astype(np.float32)

In [None]:
train.info()

## Encode Data

In [None]:
# One-hot encode the training frame.
train_encode = cudf.get_dummies(train)

# NOTE(review): test is encoded independently of train, so its dummy columns
# follow the categories present in test and may not line up with
# train_encode's columns. Confirm before feeding test_encode to a model
# fitted on train_encode.
test_encode = cudf.get_dummies(test)

In [None]:
train_encode.info()

In [None]:
train_encode.columns

In [None]:
train_encode

In [None]:
# Convert the id and one-hot columns to float32.
# The column list is derived from the frame itself rather than hard-coded:
# a fixed list of dummy names raises KeyError as soon as the data contains a
# different set of category levels.
col_name = [c for c in train_encode.columns if c == 'id' or c.startswith('cat')]

for i in col_name:
    train_encode[i] = train_encode[i].astype(np.float32)

## Split Data

In [None]:
# NOTE(review): newer cuML releases expose this helper as
# cuml.model_selection.train_test_split. Confirm against the installed version.
from cuml.preprocessing import train_test_split

# Target column on one side, every remaining column as features on the other.
y = train_encode['target']
X = train_encode.drop(columns=['target'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## Scale Data

In [None]:
from cuml.experimental.preprocessing import StandardScaler
# Scale the feature matrices only; the target y is left untouched.
scaler = StandardScaler()
# NOTE: the scaled results overwrite X_train / X_test, so the unscaled splits
# are no longer available after this cell.
X_train = scaler.fit_transform(X_train)  # fit the scaler on the training split only, then transform it
X_test  = scaler.transform(X_test)       # reuse the fitted scaler; do not re-fit on the held-out split

# Baseline Model

In [None]:
def baseline_model(n_preds, pred):
    """Return a constant prediction repeated once per row.

    n_preds: how many predictions to produce.
    pred: the single value used for every prediction (the caller passes the
        training-set average).

    Returns a ``cudf.Series`` of length ``n_preds`` filled with ``pred``.
    """
    constant_predictions = [pred] * n_preds
    return cudf.Series(constant_predictions)

# Baseline: predict the training-target mean for every held-out row.
baseline_preds = baseline_model(
    n_preds=len(y_test),
    pred=np.mean(y_train),
)

In [None]:
# change preds type to float32
baseline_preds = baseline_preds.astype(np.float32)

In [None]:
from cuml.metrics import mean_squared_error

# Score the constant baseline; squared=False requests the root of the MSE (RMSE).
mean_squared_error(y_test, baseline_preds, squared=False)

# Simple Model [Random Forest Regressor]

In [None]:
# This import rebinds the name RandomForestRegressor, which was imported from
# sklearn.ensemble near the top of the notebook; from here on it refers to
# cuML's class until sklearn's version is imported again later.
from cuml.ensemble import RandomForestRegressor

# Default hyper-parameters: this is the untuned "simple" model.
rfr = RandomForestRegressor()

# Train on the scaled training split.
rfr.fit(X_train, y_train)

In [None]:
rfr_preds = rfr.predict(X_test)

rfr_preds.tail()

## Simple Model Evaluation

In [None]:
from cuml.metrics import mean_squared_error

# squared=False makes the metric return the ROOT mean squared error, so the
# result is stored under a name that says so.
cuml_model_rmse = mean_squared_error(y_true=y_test,
                                     y_pred=rfr_preds,
                                     squared=False)
# Backward-compatible alias for the original (misleading) variable name.
cuml_model_mse = cuml_model_rmse
cuml_model_rmse

# Improvement Model [Random Forest Regressor]

 
# Dask GridSearchCV on GPU

### Note: I tried this, but it failed (the errors are recorded in the cells below)


In [None]:
# download 'dask-ml'
!pip -q install 'dask-ml'

In [None]:
import dask_ml.model_selection as dcv

In [None]:
# Hyper-parameter grid for the search: 3 x 5 x 2 = 30 combinations.
dask_parameters = {
    'n_estimators': [100, 200, 500],
    'max_depth' : [5, 10 , 16, 20, 50],
    # NOTE(review): integer codes for the split criterion; confirm their
    # meaning against the installed cuML version.
    'split_criterion' :[2, 3]
    }

# Error recorded by the author for this attempt:
# ValueError: Invalid parameter bootstrap_features for estimator RandomForestRegressor().

# Wrap the already-constructed `rfr` estimator and the grid above in dask-ml's
# GridSearchCV. Nothing is fitted here; fitting is attempted in the next cell.
skmodel_dask_grid = dcv.GridSearchCV(
                    rfr,
                    dask_parameters
                    )

In [None]:
# show Error: AttributeError: 'NoneType' object has no attribute 'fit'
# skmodel_dask_grid.fit(X_train,y_train)

# Sklearn Random Forest Model


# Improvement Model [Random Forest Regressor]

## We will use grid search to find the best parameters for the model.
### But first we need a scikit-learn model

In [None]:
# Single base directory for the competition files, so all three reads resolve
# against the same location instead of mixing an absolute and a relative path.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

# Reload the data with pandas for the scikit-learn workflow.
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")
sample_submission = pd.read_csv(DATA_DIR + "/sample_submission.csv")

In [None]:
# Encode data
import category_encoders as ce

# Fit ONE ordinal encoder on the training features and reuse it for the test
# set, so a given category value maps to the same integer in both frames.
# Fitting a second encoder on `test` can assign different codes to the same
# category, which would silently corrupt predictions on the test set.
encoder = ce.OrdinalEncoder()
train_encode = encoder.fit_transform(train.drop(columns='target'))
# Re-attach the target as the last column so later cells can still split it
# off `train_encode`.
train_encode['target'] = train['target']

# Apply the mapping learned on train; do not re-fit on test.
test_encode = encoder.transform(test)

In [None]:
# Split Data
from sklearn.model_selection import train_test_split

# Features are every column except the target.
X = train_encode.drop('target', axis=1)
y = train_encode.target

# Hold out 20% for validation. The seed is fixed (same value as the cuML split
# earlier in the notebook) so reruns score the model on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
# Scale Data
from sklearn.preprocessing import StandardScaler
# Standardize features by removing the mean and scaling to unit variance
sc=StandardScaler()

# Fit on the training split (compute its mean and std), then transform it.
X_train = sc.fit_transform(X_train)
# Transform the held-out split with the statistics computed on the training split; no re-fit.
X_test = sc.transform(X_test)

In [None]:
# Random forest model
from sklearn.ensemble import RandomForestRegressor

# n_jobs=-1 builds the trees on all CPU cores (same model, much faster fit);
# random_state pins the bootstrap/feature sampling so reruns give the same scores.
rfr = RandomForestRegressor(n_jobs=-1, random_state=2)

rfr.fit(X_train, y_train)

In [None]:
preds = rfr.predict(X_test)

preds[-5: ]

In [None]:
# MSE and RMSE for model
from sklearn.metrics import mean_squared_error

skl_model_mse = mean_squared_error(y_test, preds)
# The cuML model earlier in the notebook was scored with squared=False (RMSE).
# Take the square root here so the two models are compared in the same units.
skl_model_rmse = np.sqrt(skl_model_mse)
skl_model_mse, skl_model_rmse

In [None]:
# MAE for model
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, preds)

# submit to kaggle

In [None]:
# The sklearn model is the one submitted (the author judged its results better).
# Predict on the competition TEST set, not the validation split: the submission
# needs one prediction per test row, and X_test is only the held-out 20% of
# train. The model was trained on features scaled with `sc`, so the encoded
# test features go through the same transform first.
sklearn_preds = rfr.predict(sc.transform(test_encode))

In [None]:
# Replace the template's target column with the model predictions
# (the lengths must match row for row).
sample_submission['target'] = sklearn_preds
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv('submission.csv', index=False)

# Attempt GridSearch

In [None]:
''' 
# grid search with sklearn
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [100, 200, 500],
    'bootstrap': [True],
    'max_depth' : [5, 10 , 16, 20, 50]
}


grid = GridSearchCV(rfr,param_grid,cv=10)
grid
'''

In [None]:
# grid.fit(X_train, y_train)
#grid.best_params_
#grid.best_score_

'''
## Change Model Parameters
improved_rfr = RandomForestRegressor()

improved_rfr.fit(X_train, y_train)
'''

'''
# MSE for model
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, improved_preds)
'''