# TPS Feb 2021
Starter Notebook

## Deliverables
1. EDA
    - What's going on?
    - Show me the data...
2. Model
    - Baseline...
    - Simple...
    - Evaluation...
    - Improvement...
3. RAPIDS Bonus
    - Apply RAPIDS ([Starter Notebook](https://www.kaggle.com/tunguz/tps-feb-2021-rapids-starter))
    - Replace pandas with cuDF & sklearn with cuML
    
    
#### Troubleshooting
- [Data](https://www.kaggle.com/c/tabular-playground-series-feb-2021/data)
- [Overview](https://www.kaggle.com/c/tabular-playground-series-feb-2021/overview)
- [RF Starter Notebook](https://www.kaggle.com/warobson/tps-feb-2021-rf-starter)
- [ML repo on GitHub](https://github.com/gumdropsteve/intro_to_machine_learning)
- [Most simple RAPIDS Notebook submission](https://www.kaggle.com/warobson/simple-rapids-live) (Has stuff like `train_test_split()` with cuml..)
    
#### Load Data

In [None]:
import cudf
import numpy as np

# Single source of truth for the competition data location, so all three files
# are read from the same absolute directory regardless of the working directory.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

# Read the competition CSVs straight into GPU dataframes.
train = cudf.read_csv(f"{DATA_DIR}/train.csv")
test = cudf.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = cudf.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Peek at the last three rows of the training frame.
train.tail(3)

In [None]:
# Peek at the last three rows of the test frame.
test.tail(3)

In [None]:
# Peek at the last three rows of the sample submission.
sample_submission.tail(3)

In [None]:
# Report the (rows, columns) shape of each loaded frame, one per line.
print(f"{train.shape} \n {test.shape} \n {sample_submission.shape}")

### Notes
1. To use sklearn with cuDF data, first convert the `cudf.Series` / `cudf.DataFrame` to pandas with `.to_pandas()`.
2. The cuDF API mirrors pandas: in most cases you call the same methods, just on `cudf` instead of `pd`.

In [None]:
# Show the type of the frame returned by cudf.read_csv.
type(train)

In [None]:
# Show the type of the object produced by .to_pandas().
type(train.to_pandas())

In [None]:
# Convert the whole training frame with .to_pandas() and display the result.
train.to_pandas()

In [None]:
# Column selection works before conversion: pick four columns, then convert with .to_pandas().
train[['id', 'cat0', 'cat8', 'cont7']].to_pandas()

In [None]:
# Same type check as earlier in this section, repeated after the conversion examples.
type(train.to_pandas())

In [None]:
# Pull out the label column.
y = train.target

# Convert the label column with .to_pandas() and display it.
y.to_pandas()

In [None]:
# Show the type of the converted label column.
type(y.to_pandas())

# EDA

## Exploring Data


In [None]:
# Summary of the training frame via .info().
train.info()

In [None]:
# Same .info() summary for the test frame.
test.info()

In [None]:
# Same .info() summary for the sample submission.
sample_submission.info()

In [None]:
# Count missing values per column in the training data.
train.isna().sum()

In [None]:
# Count missing values per column in the test data.
test.isna().sum()

In [None]:
# Count missing values per column in the sample submission.
sample_submission.isna().sum()

In [None]:
# Descriptive statistics for the training frame (default column selection).
train.describe()

In [None]:
# Descriptive statistics for the remaining columns: everything that is not a NumPy numeric dtype.
train.describe(exclude=[np.number])


In [None]:
import seaborn as sns
import pandas as pd 
import matplotlib.pyplot as plt

# Plot a 100-row random subset on the CPU (seaborn needs pandas input).
# The sample is seeded so the figure is the same on every run.
scatter_sample = train.sample(100, random_state=0).to_pandas()

plt.figure(figsize=(20,15))
# cont1 vs cont2, marker size driven by cont0 and colour by the target value.
sns.scatterplot(x='cont1', y='cont2', size='cont0', hue='target', data=scatter_sample);


In [None]:
# Display the training frame.
train

In [None]:
# List the column names of the training frame.
train.columns

In [None]:

# Pairwise plots on a 10-row random subset; seeded so the figure is reproducible.
sns.pairplot(data=train.sample(10, random_state=0).to_pandas())


In [None]:
# Box plots of every column that is left once `id` and the cat* columns are
# dropped, drawn on one shared axis to eyeball outliers.
col = ['id'] + [f'cat{i}' for i in range(10)]
fig, ax = plt.subplots(figsize=(25,15))
ax.set_title('Outliers in boxplot chart ')
sns.boxplot(data=train.drop(columns=col).to_pandas(), ax=ax);


# Modeling 

## split the data

In [None]:
from cuml.preprocessing import train_test_split

# Label is `target`; features are every other column.
y = train['target']
X = train.drop(columns=['target'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)


## baseline

In [None]:

def baseline_model(n_preds, pred):
    """Return a cudf.Series containing `pred` repeated `n_preds` times."""
    return cudf.Series([pred] * n_preds)

# Baseline: predict the mean of the training labels for every held-out row.
baseline_preds = baseline_model(n_preds=len(y_test), pred=np.mean(y_train))


In [None]:
# Display the constant baseline predictions.
baseline_preds

In [None]:
from cuml.metrics import mean_squared_error

# Score the baseline against the held-out labels; squared=False is passed to the metric.
mean_squared_error(y_test, baseline_preds, squared=False)

## Define a model..

In [None]:
# Run get_dummies over the full training frame.
# NOTE(review): the result is bound to `trian` (looks like a typo for `train`) and
# is not read anywhere else in this notebook - confirm whether this cell is needed.
trian=cudf.get_dummies(train)

In [None]:
from cuml.preprocessing import train_test_split

# Rebuild the features, this time passing them through get_dummies.
X = cudf.get_dummies(train.drop(columns=['target']))
y = train['target']

# Same 80/20 split and seed as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

### Random forest Model

In [None]:
# Cast every feature column to float32 in one call, and do it for BOTH splits so
# the model is fit and evaluated on inputs of the same dtype.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)


In [None]:

from cuml.ensemble import RandomForestRegressor

# cuML random forest with 3000 estimators, fit on the training split.
rfr = RandomForestRegressor(n_estimators=3000)
rfr.fit(X_train, y_train)


In [None]:
# Predict the held-out split with the fitted forest and display the predictions.
rf_preds =rfr.predict(X_test)
rf_preds

In [None]:

# Score the forest on the held-out split; both sides are cast to float64 first
# so labels and predictions share a dtype when handed to the metric.
holdout_truth = y_test.astype(np.float64)
holdout_preds = rf_preds.astype(np.float64)
mean_squared_error(holdout_truth, holdout_preds, squared=False)

### optimize the model 

In [None]:
# List the encoded training columns that the test data would lack after encoding.
# The comparison must use the get_dummies output for `test`: the raw frame only
# has the un-encoded cat* columns, so checking it directly would flag every
# dummy column instead of the ones that are genuinely missing.
test_dummy_columns = set(cudf.get_dummies(test).columns)
for c in X.columns:
    if c not in test_dummy_columns:
        print(c)

In [None]:
# Value counts of the cat6_G column in the encoded training features.
X.cat6_G.value_counts()

In [None]:
# Frequency of each cat6 level in the raw training data.
train.cat6.value_counts()

In [None]:
# Add an all-zero cat6_G column to the test frame.
# NOTE(review): no get_dummies call on `test` appears before this cell, so this
# looks like it adds the column to the un-encoded frame - confirm that is intended.
test['cat6_G'] = 0

In [None]:
%%time
import numpy as np
from cuml.ensemble import RandomForestRegressor
import cudf
from cuml.linear_model import LinearRegression

# data load
train = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
sample_submission = cudf.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')

# data prep
X = train.drop('target', axis=1)
X = cudf.get_dummies(X)

y = train.target

test = cudf.get_dummies(test)
test['cat6_G'] = 0  # fix lack of Gs in test data
for n in X.columns:
    X[n]=X[n].astype(np.float32)
# modeling
rfr = RandomForestRegressor(n_estimators=3000)
rfr.fit(X,y)
 
rf_preds =rfr.predict(test)

# save results & submit
sample_submission['target'] = rf_preds

sample_submission.to_csv('submission.csv', index=False)