In [None]:
# Install the pinned legacy fastai 0.7 release; the `fastai.structured` import used by this notebook needs it.
!pip install fastai==0.7.0 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from fastai.imports import *
from fastai.structured import *   ## Need to install fastai 0.7 for this 

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn import metrics

import os
# Sanity check: print the names of the files found in the input directory.
print(os.listdir("../input"))

In [None]:
# Read the training and the test CSV (in that order) with identical parser
# settings; low_memory=False asks pandas to parse each file in one pass.
df_raw, df_raw_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../input/train_V2.csv', '../input/test_V2.csv')
)

In [None]:
def display_all(df, max_rows=100, max_columns=100):
    """Display ``df`` with temporarily raised pandas display limits.

    Parameters
    ----------
    df : object
        Anything IPython's ``display`` can render (typically a DataFrame or Series).
    max_rows : int, default 100
        Value used for ``display.max_rows`` while ``df`` is rendered.
    max_columns : int, default 100
        Value used for ``display.max_columns`` while ``df`` is rendered.

    The options are set through ``pd.option_context``, so the previous pandas
    settings are restored as soon as the display call returns.
    """
    with pd.option_context("display.max_rows", max_rows,
                           "display.max_columns", max_columns):
        display(df)

In [None]:
# Peek at the last rows of the training frame using the widened display limits of display_all.
display_all(df_raw.tail())

In [None]:
# Summary statistics of the training frame; include='all' requests a summary for every column, not only the numeric ones.
display_all(df_raw.describe(include='all'))

In [None]:
# Keep the identifier columns of the test set aside: they are dropped from
# df_raw_test further down, but `Id` is needed again to build the submission file.
df_raw_test_info = df_raw_test[['Id', 'groupId', 'matchId']]

In [None]:
# Remove the identifier columns from both frames in place so they are not used as features.
df_raw.drop(columns=['Id', 'groupId', 'matchId'], inplace=True)
df_raw_test.drop(columns=['Id', 'groupId', 'matchId'], inplace=True)

In [None]:
# fastai helpers: train_cats is run on the training frame first, then apply_cats
# receives the test frame together with the training frame.
# NOTE(review): presumably train_cats turns string columns into pandas categoricals in
# place and apply_cats reuses the training categories for the test frame — confirm
# against the fastai 0.7 `structured` module.
train_cats(df_raw)
apply_cats(df_raw_test, df_raw)

In [None]:
# Fraction of missing values per column, listed in alphabetical column order.
display_all(df_raw.isna().sum().div(len(df_raw)).sort_index())

In [None]:
# Show the training rows whose target `winPlacePerc` is missing.
df_raw[df_raw['winPlacePerc'].isna()]

In [None]:
# Drop the rows without a target value; inplace=True modifies df_raw itself rather than returning a new frame.
df_raw.dropna(subset=['winPlacePerc'], inplace=True)

In [None]:
# Run fastai's proc_df on the training frame with 'winPlacePerc' as the target field;
# the three results are bound to df_trn, y_trn and nas.
# NOTE(review): presumably these are the numeric feature frame, the target values and
# the dictionary of NA fill values — confirm against the fastai 0.7 proc_df documentation.
df_trn, y_trn, nas = proc_df(df_raw, 'winPlacePerc')
# The test frame is processed with the `nas` obtained from the training call; only
# the first of the three return values is kept.
df_test, _, _ = proc_df(df_raw_test, na_dict=nas)

In [None]:
# Hold out 33% of the processed training rows for validation; the fixed
# random_state makes the split identical on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_trn,
    y_trn,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.metrics import mean_absolute_error

def print_score(m):
    """Print train/validation metrics for the fitted model ``m``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``
    and prints one list:
    ``[train MAE, valid MAE, train score, valid score]``, followed by
    ``m.oob_score_`` when the model exposes that attribute.

    Parameters
    ----------
    m : object
        A fitted estimator providing ``predict(X)`` and ``score(X, y)``.
    """
    res = [
        # scikit-learn metrics take (y_true, y_pred) in that order.
        mean_absolute_error(y_train, m.predict(X_train)),
        mean_absolute_error(y_valid, m.predict(X_valid)),
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    # Only present on models that were fitted with out-of-bag scoring.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
set_rf_samples(20000)
m = RandomForestRegressor(n_jobs=-1, n_estimators = 40, min_samples_leaf = 7, min_samples_split = 7)
%time m.fit(X_train, y_train)

In [None]:
# Time the evaluation and print the metrics list produced by print_score for the fitted model.
%time print_score(m)

In [None]:
# Predict the target for the processed test frame with the fitted model.
pred = m.predict(df_test)
# Bare expression: the notebook shows the predictions as the cell output.
pred

In [None]:
# Start the submission frame from the saved test ids. The explicit .copy() makes it an
# independent DataFrame rather than a slice of df_raw_test_info, so adding a column to
# it cannot trigger pandas' SettingWithCopyWarning.
df_sub = df_raw_test_info[['Id']].copy()

In [None]:
# Attach the predictions as the `winPlacePerc` column. DataFrame.assign returns a new
# frame, so nothing is written into a slice and pandas' chained-assignment warning does
# not have to be switched off globally.
df_sub = df_sub.assign(winPlacePerc=pred)

In [None]:
# Write the submission file without the row index; `index` is a boolean flag, so pass
# False explicitly instead of relying on None being treated as falsy.
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Feature-importance table for the fitted forest (fastai helper), then its first ten rows.
fi = rf_feat_importance(m, X_train)
fi.head(10)

In [None]:
# Line plot of the importance value per feature, in the table's row order.
fi.plot(x='cols', y='imp', figsize=(10, 6), legend=False)

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of the `imp` column against the `cols` column of ``fi``.

    Returns whatever ``fi.plot`` returns.
    """
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)
# Bar chart for the first 30 rows of the feature-importance table.
plot_fi(fi[:30])

In [None]:
# Names of the features whose importance exceeds 0.005; the cell output is how many remain.
to_keep = fi.loc[fi.imp > 0.005, 'cols']
len(to_keep)

In [None]:
# Restrict the data to the selected features. The selection must come from df_trn, the
# frame returned by proc_df that the model and the importances were built on — not from
# df_raw, which is the frame from before that preprocessing step.
df_keep = df_trn[to_keep].copy()
# Same split parameters as before, so the rows land in the same train/validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(df_keep, y_trn, test_size=0.33, random_state=42)

In [None]:
set_rf_samples(20000)
m = RandomForestRegressor(n_jobs=-1, n_estimators = 40, min_samples_leaf = 7, min_samples_split = 7)
%time m.fit(X_train, y_train)

In [None]:
# Time the evaluation and print the metrics list for the model fitted on the reduced feature set.
%time print_score(m)