## Tabular Playground Series (TPS), November 2021 - Random Forest Baseline

In [None]:
import datatable as dt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelBinarizer

#### Add Intel patch to increase sklearn's speed
The dataset is large enough that Intel's scikit-learn extension (`scikit-learn-intelex`) may noticeably speed up training of the `RandomForestClassifier`.

In [None]:
# Use this line to install Intel's update to library if needed
!pip install scikit-learn-intelex --progress-bar off >> /tmp/pip_sklearnex.log

from sklearnex import patch_sklearn
patch_sklearn()

## Data Handling

In [None]:
# Parse the training CSV with datatable, then hand it over as a pandas DataFrame.
TRAIN_CSV = '/kaggle/input/tabular-playground-series-nov-2021/train.csv'
df = dt.fread(TRAIN_CSV).to_pandas()

In [None]:
# Feature matrix: every column except the label.
# NOTE(review): an 'id' column, if present in the frame, stays in X as a
# feature -- confirm that this is intended.
X = df.drop(columns=['target'])

# Encode the true/false target as 1's and 0's, then flatten the column
# vector returned by the binarizer into a 1-D target vector.
lb = LabelBinarizer()
y = lb.fit_transform(df['target']).ravel()

## Establish a Baseline with a Random Forest Classifier

We are choosing a classifier model since the targets for this dataset are True/False values.

#### Cross Validation

In [None]:
def evaluate_model(X, y, model, *, n_splits=5, n_repeats=2, random_state=None):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Args:
        X: Feature matrix handed to ``cross_val_score``.
        y: Target vector; also used to stratify the folds.
        model: Estimator to evaluate.
        n_splits: Number of folds in each repetition.
        n_repeats: Number of times the k-fold split is repeated.
        random_state: Seed forwarded to ``RepeatedStratifiedKFold``. Pass an
            int to get the same fold assignment on every run; the default
            ``None`` leaves the splitter unseeded.

    Returns:
        The ROC AUC scores returned by ``cross_val_score``, one per
        fitted fold.

    Raises:
        Exception: Any error raised while fitting or scoring a fold is
            propagated instead of being replaced by a placeholder score
            (``error_score='raise'``).
    """
    cv_method = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=random_state,
    )

    scores = cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc',
        cv=cv_method,
        error_score='raise',
    )
    return scores

In [None]:
# Baseline forest: many shallow trees, trained in parallel.
model_1 = RandomForestClassifier(
    n_estimators=500,   # number of trees
    max_depth=5,        # cap on the depth of each tree
    max_samples=0.95,   # fraction of the rows drawn for each tree
    n_jobs=-1,          # use all available cores
)

In [None]:
%%time
scores_1 = evaluate_model(X, y, model_1)
print("Average Score: ", np.mean(scores_1))

### Making predictions and creating submission

In [None]:
# Fit the baseline model on the full training set (all of X and y) so it can
# be used for the test-set predictions.
model_1.fit(X, y)

In [None]:
# Load the test set and score it with the fitted model.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv')

# Column 1 of predict_proba holds the probability of the second class.
positive_proba = model_1.predict_proba(test_df)[:, 1]

# Write one row per test id with its predicted probability.
submission = pd.DataFrame({'id': test_df['id'], 'target': positive_proba})
submission.to_csv('submission.csv', index=False)