# TPS Feb 2021
Starter Notebook

## Deliverables
1. EDA
    - What's going on?
    - Show me the data...
2. Model
    - Baseline...
    - Simple...
    - Evaluation...
    - Improvement...
3. RAPIDS Bonus
    - Apply RAPIDS ([Starter Notebook](https://www.kaggle.com/tunguz/tps-feb-2021-rapids-starter))
    - Replace pandas with cuDF & sklearn with cuML
    
    
#### Troubleshooting
- [Data](https://www.kaggle.com/c/tabular-playground-series-feb-2021/data)
- [Overview](https://www.kaggle.com/c/tabular-playground-series-feb-2021/overview)
- [RF Starter Notebook](https://www.kaggle.com/warobson/tps-feb-2021-rf-starter)
- [ML repo on GitHub](https://github.com/gumdropsteve/intro_to_machine_learning)
- [Most simple RAPIDS Notebook submission](https://www.kaggle.com/warobson/simple-rapids-live) (Has stuff like `train_test_split()` with cuml..)
    


#### Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from cuml.preprocessing import train_test_split
import cudf
from cuml.ensemble import RandomForestRegressor
from cuml.linear_model import LinearRegression
from cuml.neighbors import KNeighborsRegressor
from cuml.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### Load Data

In [None]:
# All three competition files live in the same Kaggle input directory.
# Building every path from one absolute root keeps the loads consistent and
# independent of the notebook's current working directory.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Drop the 'id' column from both frames; it is not needed as a feature.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [None]:
# Show the last three rows of the training frame.
train.tail(3)

In [None]:
# Show the last three rows of the test frame.
test.tail(3)

In [None]:
# Show the last three rows of the sample submission to see the expected output format.
sample_submission.tail(3)

In [None]:
# Summary statistics of the training frame (pandas' default describe() output).
train.describe()

In [None]:
# Summary of the non-numeric columns only (numeric dtypes are excluded).
train.describe(exclude= 'number')

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

### EDA

In [None]:
# Correlate only the numeric columns. The cat* columns have not been
# label-encoded yet at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently dropped such columns).
corr = train.select_dtypes(include='number').corr()
# Render the matrix as a heat-map style table.
corr.style.background_gradient(cmap='Blues')

In [None]:
# Box plots of the columns of `train`, drawn on one wide figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=train, ax=ax)

As the box plot shows, the target has outliers below the lower whisker (the lower fence, Q1 − 1.5 × IQR), so we decided to drop them.

### Prepare the data 

#### Remove the outliers

In [None]:
# Lower fence rule: a target value more than 1.5 * IQR below the first
# quartile is treated as an outlier and removed from the training frame.
Q1, Q3 = train["target"].quantile([0.25, 0.75])
IQR = Q3 - Q1
Lower_Fence = Q1 - 1.5 * IQR

# Collect the offending rows, then drop them by index label.
low_outliers = train.loc[train["target"] < Lower_Fence]
train = train.drop(index=low_outliers.index)
train

In [None]:
# Redraw the box plots to inspect the columns after the outlier removal.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=train, ax=ax)

#### Encode the data

In [None]:
le = LabelEncoder()
columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8','cat9']
for i in columns:
    train[i] = le.fit_transform(train[i].astype(str))

train.head()

In [None]:
le2 = LabelEncoder()
columns = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8','cat9']
for i in columns:
    test[i] = le2.fit_transform(test[i].astype(str))

test.head()

The models had problems working with float64 data on the GPU, which only accepted float32, so we converted all of the columns to **float32**.

In [None]:
# Move both frames from pandas to cuDF (GPU) and cast every column to float32.
train, test = (
    cudf.DataFrame(frame).astype(np.float32) for frame in (train, test)
)

#### Split the data

In [None]:
# Separate the target from the features.
y = train['target']
x = train.drop('target', axis=1)

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)

### Modeling

#### Baseline Model

In [None]:
def baseline_model(n_preds, pred):
    """Return a constant prediction vector.

    Parameters
    ----------
    n_preds : int
        Number of predictions to produce.
    pred : scalar
        Value repeated for every prediction.

    Returns
    -------
    cudf.Series
        Series of length ``n_preds`` in which every element equals ``pred``.
    """
    # Fill the vector in a single vectorised call rather than building a
    # Python list element by element.
    return cudf.Series(np.full(n_preds, pred))

# make baseline preds: always predict the mean of the training target
baseline_preds = baseline_model(len(y_test), y_train.mean()).astype(np.float32)

In [None]:
# Error of the constant baseline on the hold-out split; squared=False asks for
# the root of the mean squared error (RMSE).
mean_squared_error(y_true=y_test,y_pred=baseline_preds,squared=False)

#### Linear Regression

In [None]:
# Linear regression with the library's default options.
lr = LinearRegression()
lr.fit(x_train, y_train)
# Predictions for the hold-out split.
lr_preds = lr.predict(x_test)

In [None]:
# Hold-out error of the linear regression (squared=False -> RMSE).
mean_squared_error(y_true=y_test,y_pred=lr_preds,squared=False)

##### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the two boolean LinearRegression options; the search
# evaluates every combination.
p_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False],
}

grid = GridSearchCV(estimator=lr, param_grid=p_grid)
grid.fit(x_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

#### Linear Regression

In [None]:
# Linear regression with explicitly chosen options; rebinds `lr`.
# NOTE(review): presumably these values mirror grid.best_params_ from the
# grid search — confirm they are still in sync after re-running the search.
lr = LinearRegression(fit_intercept = True, normalize = False)
lr.fit(x_train, y_train)
# Predictions for the hold-out split.
lr_preds = lr.predict(x_test)

In [None]:
# Hold-out error of the refitted linear regression (squared=False -> RMSE).
mean_squared_error(y_true=y_test,y_pred=lr_preds,squared=False)

#### Random Forest Model

In [None]:
#from cuml.ensemble import RandomForestRegressor
#est = [1000 , 1200]
#depth = [7 , 9 , 12 , 14 , 16]

#for e in est:
#    for d in depth:
#        for s in range(2):
#            rfr2 = RandomForestRegressor(n_estimators = e , max_depth = d)
#            rfr2.fit(x_train, y_train)
#            rfr2_preds = rfr2.predict(x_test).astype(np.float64)
#            print(mean_squared_error(y_true=y_test,y_pred=rfr2_preds,squared=False) , ' est: ',e , ' depth: ' , d)

In [None]:
# Random forest with the library's default hyper-parameters.
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)
# Predictions for the hold-out split.
rfr_preds = rfr.predict(x_test)

In [None]:
# Hold-out error of the default random forest (squared=False -> RMSE).
mean_squared_error(y_true=y_test,y_pred=rfr_preds,squared=False)

In [None]:
# Random forest with 1000 trees and a maximum depth of 12. This rebinds
# `rfr`, so the default-parameter model fitted earlier is replaced.
rfr = RandomForestRegressor(n_estimators = 1000 , max_depth = 12)
rfr.fit(x_train, y_train)
# Predictions for the hold-out split.
rfr_preds = rfr.predict(x_test)

In [None]:
# Hold-out error of the 1000-tree, depth-12 random forest (squared=False -> RMSE).
mean_squared_error(y_true=y_test,y_pred=rfr_preds,squared=False)

#### KNN 

In [None]:
# K-nearest-neighbours regressor with the library's default settings.
knn = KNeighborsRegressor()
knn.fit(x_train , y_train)
# Predictions for the hold-out split.
knn_preds = knn.predict(x_test)

In [None]:
# Hold-out error of the KNN regressor (squared=False -> RMSE).
mean_squared_error(y_true=y_test,y_pred=knn_preds,squared=False)

### Submission

In [None]:
# Predict the competition test frame with the current `rfr` model (the most
# recently fitted random forest) and cast the result to float64.
test_preds = rfr.predict(test).astype(np.float64)
test_preds

In [None]:
# Convert the sample submission to a cuDF frame, overwrite its 'target'
# column with the test predictions and write the result to submission.csv
# without the index column.
sample_submission = cudf.DataFrame(sample_submission)
sample_submission['target'] = test_preds
sample_submission.to_csv('submission.csv', index=False)