In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Directory of the competition files; kept in one place so the location can be
# changed without editing every read_csv call.
INPUT_DIR = "../input/pubg-finish-placement-prediction"

# Submission template, competition test set and training set.
sample_submission_V2 = pd.read_csv(f"{INPUT_DIR}/sample_submission_V2.csv")
test_raw = pd.read_csv(f"{INPUT_DIR}/test_V2.csv")
data_raw = pd.read_csv(f"{INPUT_DIR}/train_V2.csv")

In [None]:
# (rows, columns) of the training set as loaded from train_V2.csv.
data_raw.shape

In [None]:
# (rows, columns) of the competition test set as loaded from test_V2.csv.
test_raw.shape

In [None]:
# (rows, columns) of the submission template.
sample_submission_V2.shape

In [None]:
# Sample size: one quarter of the training rows, rounded to the nearest integer.
sp = data_raw.shape
nrows = int(round(0.25 * sp[0]))
print(nrows)

In [None]:
# The full training set has 4,446,966 rows, more than the Kaggle VM can handle,
# so only about 25% of it is kept.
# Whole matches are sampled instead of individual rows: transform_dataset
# derives playersJoined (and killsNorm / damageDealtNorm from it) by counting
# rows per matchId, so a row-level sample would make those features roughly
# four times smaller in training than in the (unsampled) test set.
sampled_match_ids = (
    data_raw['matchId']
    .drop_duplicates()
    .sample(frac=0.25, random_state=42)
)
# .copy() gives an independent frame, so adding columns later does not trigger
# chained-assignment warnings.
data_raw = data_raw[data_raw['matchId'].isin(sampled_match_ids)].copy()

In [None]:
def transform_dataset(data):
    """Add engineered per-player features to a match-statistics frame.

    The frame is modified in place (columns are added) and the same object is
    returned, so the result can be used either way.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``rideDistance``, ``swimDistance``,
        ``walkDistance``, ``matchId``, ``kills``, ``damageDealt``, ``heals``,
        ``boosts`` and ``numGroups``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with these columns appended in order:
        ``totalDistance``, ``playersJoined``, ``killsNorm``,
        ``damageDealtNorm``, ``healsAndBoosts``, ``boostsPerWalkDistance``,
        ``healsPerWalkDistance``, ``healsAndBoostsPerWalkDistance``,
        ``killsPerWalkDistance`` and ``team``.
    """
    # Total distance covered by the player (driving, swimming and walking).
    data['totalDistance'] = data['rideDistance'] + data['swimDistance'] + data['walkDistance']

    # Number of rows in this frame that share the row's matchId, i.e. the
    # players of that match present in `data`.
    data['playersJoined'] = data.groupby('matchId')['matchId'].transform('count')

    # Scale factor that is 1.0 for a 100-player match and grows as the match
    # gets smaller; applied to kills and damage.
    players_scale = (100 - data['playersJoined']) / 100 + 1
    data['killsNorm'] = data['kills'] * players_scale
    data['damageDealtNorm'] = data['damageDealt'] * players_scale

    # Combined use of healing and boost items.
    data['healsAndBoosts'] = data['heals'] + data['boosts']

    # Ratios per unit of walked distance. The +1 keeps the denominator
    # non-zero for players who never walked. Missing values are replaced by 0
    # through a plain assignment, because fillna(inplace=True) on a column
    # selection is not guaranteed to write back to the frame.
    walk_plus_one = data['walkDistance'] + 1
    ratio_columns = (
        ('boosts', 'boostsPerWalkDistance'),
        ('heals', 'healsPerWalkDistance'),
        ('healsAndBoosts', 'healsAndBoostsPerWalkDistance'),
        ('kills', 'killsPerWalkDistance'),
    )
    for source, target in ratio_columns:
        data[target] = (data[source] / walk_plus_one).fillna(0)

    # Bucket the number of groups in the match: more than 50 -> 1,
    # 26 to 50 -> 2, otherwise 4 (presumably solo / duo / squad team size).
    # np.select takes the first matching condition, so the second condition
    # only applies to values in (25, 50].
    num_groups = data['numGroups']
    data['team'] = np.select([num_groups > 50, num_groups > 25], [1, 2], default=4)
    return data

In [None]:
# transform_dataset adds the engineered columns to the frame it is given and
# returns that same object, so `data` / `test` refer to the same frames as
# `data_raw` / `test_raw`.
data  = transform_dataset(data_raw)
test = transform_dataset(test_raw)

In [None]:
# First five training rows, including the engineered columns.
data.head(5)

In [None]:
# First five rows of the competition test set, including the engineered columns.
test.head(5)

In [None]:
# (rows, columns) of the transformed training frame.
data.shape

In [None]:
# (rows, columns) of the transformed competition test frame.
test.shape

In [None]:
# Column dtypes of the transformed training frame.
data.dtypes

In [None]:
# Column dtypes of the transformed competition test frame (the cell above
# already shows the training frame's dtypes).
test.dtypes

In [None]:
# Columns kept out of the model inputs. winPlacePerc is the regression target;
# damageDealt and kills have the derived counterparts damageDealtNorm and
# killsNorm created by transform_dataset.
remove = [
    'Id',
    'groupId',
    'matchId',
    'winPlacePerc',
    'damageDealt',
    'kills',
    'matchType',
    'winPoints',
]
# Every remaining column, in frame order, is a model feature.
feats = [name for name in data.columns if name not in remove]

In [None]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible.
train_features, test_features = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Random-forest regressor: 24 trees, at least 5 samples per leaf, 60% of the
# features considered at each split, all CPU cores, out-of-bag score enabled.
rfr = RandomForestRegressor(
    n_estimators=24,
    min_samples_leaf=5,
    max_features=0.6,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the random forest on the training split; %time reports how long it takes.
%time rfr.fit(train_features[feats], train_features.winPlacePerc)

In [None]:
# Random-forest predictions for the hold-out split.
predictions = rfr.predict(test_features[feats])

In [None]:
# Mean squared error on the hold-out split, computed by hand.
mse = ((predictions - test_features.winPlacePerc) ** 2).mean()
print(mse)

In [None]:
from sklearn import metrics
# Same MSE via scikit-learn, with the arguments named in the documented
# (y_true, y_pred) order.
metrics.mean_squared_error(y_true=test_features.winPlacePerc, y_pred=predictions)

In [None]:
# Explained variance on the hold-out split. The metric is not symmetric, so
# the true values must be passed as y_true and the predictions as y_pred.
metrics.explained_variance_score(y_true=test_features.winPlacePerc, y_pred=predictions)

In [None]:
# R^2 on the hold-out split. The metric is not symmetric, so the true values
# must be passed as y_true and the predictions as y_pred.
metrics.r2_score(y_true=test_features.winPlacePerc, y_pred=predictions)

In [None]:
# First ten random-forest predictions, to compare by eye with the true values
# shown in the next cell.
predictions[0:10]

In [None]:
# True winPlacePerc of the first ten hold-out rows.
test_features.winPlacePerc.head(10)

0.008190903198586506

In [None]:
# One row per feature with the random forest's importance score, most
# important first.
feature_importances = (
    pd.DataFrame(
        {'importance': rfr.feature_importances_},
        index=train_features[feats].columns,
    )
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Bar chart of the importances, in the sorted (descending) order.
feature_importances.plot.bar()

In [None]:
# The same importances as a table.
feature_importances

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree regressor trained on the same features as the forest.
xgb_model = xgb.XGBRegressor(
    # Squared-error regression; "reg:linear" is the deprecated name of this
    # same objective.
    objective="reg:squarederror",
    random_state=42,
    subsample=1,
    tree_method='approx',
    # `learning_rate` is the scikit-learn-API name of the native `eta` parameter.
    learning_rate=0.1,
    max_depth=5,
    n_jobs=-1,
)

In [None]:
# Disabled experiment: cast matchType to int32 on both splits. The column is
# listed in `remove`, so it is not part of `feats` and never reaches the models.
#test_features.matchType = test_features.matchType.astype('int32')
#train_features.matchType = train_features.matchType.astype('int32')

In [None]:
# Dtypes and non-null counts of the model inputs.
train_features[feats].info()

In [None]:
# Fit the XGBoost model on the training split; %time reports how long it takes.
%time xgb_model.fit(train_features[feats], train_features.winPlacePerc)

In [None]:
# XGBoost predictions for the hold-out split.
y_pred = xgb_model.predict(test_features[feats])

In [None]:
from sklearn import metrics
# XGBoost MSE on the hold-out split, with the arguments named in the
# documented (y_true, y_pred) order.
metrics.mean_squared_error(y_true=test_features.winPlacePerc, y_pred=y_pred)

In [None]:
# XGBoost explained variance on the hold-out split. The metric is not
# symmetric, so the true values go in y_true and the predictions in y_pred.
metrics.explained_variance_score(y_true=test_features.winPlacePerc, y_pred=y_pred)

In [None]:
# XGBoost R^2 on the hold-out split. The metric is not symmetric, so the true
# values go in y_true and the predictions in y_pred.
metrics.r2_score(y_true=test_features.winPlacePerc, y_pred=y_pred)

In [None]:
# XGBoost predictions for the competition test set, using the same feature
# columns as in training.
predict_test = xgb_model.predict(test[feats])

In [None]:
# Number of predictions; compare with the submission template's row count below.
predict_test.shape

In [None]:
# (rows, columns) of the submission template, to compare with the number of
# predictions above.
sample_submission_V2.shape

In [None]:
# Submission template before the predictions are written into it.
sample_submission_V2.head(5)

In [None]:
# Put the XGBoost predictions into the submission's winPlacePerc column.
# NOTE(review): assumes the rows of `test` are in the same order as the rows
# of the submission template — confirm.
sample_submission_V2['winPlacePerc'] = predict_test

In [None]:
# Submission frame after the predictions have been written into it.
sample_submission_V2.head(5)

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission_V2.to_csv('submission.csv',index=False)