In [None]:
import numpy as np 
import pandas as pd 
import gc

import os
# Show which files are available in the competition input directory.
print(os.listdir(path="../input"))

In [None]:
# Report the size of every non-zip entry in the input directory
# (bytes divided by 1,000,000, rounded to two decimals).
print('# File sizes')
for fname in os.listdir('../input'):
    if 'zip' in fname:
        continue
    size_mb = round(os.path.getsize('../input/' + fname) / 1000000, 2)
    print('{}{}MB'.format(fname.ljust(30), size_mb))
        
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that holds their range.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose limits contain the column's min and max.
    Float columns (float16/float32/float64) are cast to float16 or float32
    when the value range fits, otherwise to float64. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after downcasting.

    Notes
    -----
    Only the value *range* is checked for floats, not precision: float16
    keeps roughly three significant decimal digits, so downcast values may
    be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive so that a value sitting exactly on a
            # dtype limit (e.g. 127 for int8) still gets the narrow dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or min/max are NaN (comparisons
                # with NaN are always false): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio so a frame with zero starting size cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# %%time
# df_train  = reduce_mem_usage(pd.read_csv("../input/train_V2.csv"))
# gc.collect()
# The commented lines above are a disabled alternative that would downcast
# dtypes with reduce_mem_usage while loading (and time the cell); the active
# line below reads the training CSV with pandas' default dtypes.
df_train = pd.read_csv("../input/train_V2.csv")

In [None]:
# %%time
# df_test  = reduce_mem_usage(pd.read_csv("../input/test_V2.csv"))
# gc.collect()
# The commented lines above are a disabled alternative that would downcast
# dtypes with reduce_mem_usage while loading (and time the cell); the active
# line below reads the test CSV with pandas' default dtypes.
df_test = pd.read_csv("../input/test_V2.csv")

In [None]:
# Preview the first training rows, transposed so each column reads as a row.
df_train.head().transpose()

In [None]:
# Preview the first test rows, transposed so each column reads as a row.
df_test.head().transpose()

In [None]:
# Print the column dtypes, non-null counts and memory footprint of the training frame.
df_train.info()

In [None]:
# Print the column dtypes, non-null counts and memory footprint of the test frame.
df_test.info()

In [None]:
# Summary statistics of the training columns, one row per column.
df_train.describe().transpose()

In [None]:
# Summary statistics of the test columns, one row per column.
df_test.describe().transpose()

In [None]:
# Number of missing values in each training column.
df_train_missing_values_count = df_train.isna().sum()
df_train_missing_values_count

In [None]:
# Number of missing values in each test column.
df_test_missing_values_count = df_test.isna().sum()
df_test_missing_values_count

In [None]:
# Discard every training row that contains at least one missing value.
df_train = df_train.dropna(axis="index", how="any")

In [None]:
# feature selection: every column except the target and the identifier columns
features = df_train.columns.drop(["winPlacePerc", "Id", "groupId", "matchId"])
train_X = df_train[features]

train_y = df_train['winPlacePerc']

test_X = df_test[features]

# one hot encode
train_X = pd.get_dummies(train_X)
test_X = pd.get_dummies(test_X)

# The two frames are encoded independently, so a category value present in
# only one of them would give the frames different dummy columns (or a
# different column order). Force the test matrix onto the training layout:
# dummy columns missing from test are filled with 0, extras are dropped.
test_X = test_X.reindex(columns=train_X.columns, fill_value=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 1000-tree random forest on the encoded training features and predict
# winPlacePerc for the test rows.
# n_jobs=-1 spreads tree building and prediction over all available cores;
# with random_state fixed, the fitted model does not depend on the job count.
forest_model = RandomForestRegressor(random_state=42, n_estimators=1000, n_jobs=-1)
forest_model.fit(train_X, train_y)
predict_y = forest_model.predict(test_X)
predict_y

In [None]:
# Assemble the submission frame: the test Ids alongside their predicted
# winPlacePerc, then write it to disk without the index column.
output = df_test[['Id']].assign(winPlacePerc=predict_y)

output.to_csv('submission.csv', index=False)
output