In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time
import gc
import os
from sklearn.model_selection import KFold
# Use the fully qualified option key: the short pattern 'max_columns' matches
# more than one registered option on newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)

# Lectura de Datos

In [2]:
# Load the training set, the sample submission and the test set (in that order).
train, subm, test = map(
    pd.read_csv,
    ('train_V2.csv', 'sample_submission_V2.csv', 'test_V2.csv'),
)

In [3]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [4]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
0,9329eb41e215eb,676b23c24e70d6,45b576ab7daa7f,0,0,51.46,0,0,0,73,0,0,0,0.0,1884,squad-fpp,28,28,1500,0,0.0,0,0.0,0,0,588.0,1,0
1,639bd0dcd7bda8,430933124148dd,42a9a0b906c928,0,4,179.1,0,0,2,11,0,2,1,361.9,1811,duo-fpp,48,47,1503,2,4669.0,0,0.0,0,0,2017.0,6,0
2,63d5c8ef8dfe91,0b45f5db20ba99,87e7e4477a048e,1,0,23.4,0,0,4,49,0,0,0,0.0,1793,squad-fpp,28,27,1565,0,0.0,0,0.0,0,0,787.8,4,0
3,cf5b81422591d1,b7497dbdc77f4a,1b9a94f1af67f1,0,0,65.52,0,0,0,54,0,0,0,0.0,1834,duo-fpp,45,44,1465,0,0.0,0,0.0,0,0,1812.0,3,0
4,ee6a295187ba21,6604ce20a1d230,40754a93016066,0,4,330.2,1,2,1,7,0,3,1,60.06,1326,squad-fpp,28,27,1480,1,0.0,0,0.0,0,0,2963.0,4,0


In [5]:
# Dimensions of the data: train shape on the first line, test shape on the second.
print(train.shape, test.shape, sep='\n')

(4446966, 29)
(1934174, 28)


In [6]:
# Check whether any column does not match the test set:
# count the column names that train and test have in common.
len(train.columns.intersection(test.columns))

28

In [7]:
# Missing values
def missing_values(df):
    """Summarise the null entries of each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``missing`` (number of
        null entries) and ``percentage`` (that count as a percentage of the
        number of rows), sorted by ``percentage`` in descending order.
    """
    null_counts = df.isnull().sum()
    report = pd.DataFrame({'missing': null_counts})
    report['percentage'] = (report['missing'] / df.shape[0]) * 100
    return report.sort_values('percentage', ascending=False)

In [8]:
# Training-set columns with the highest share of missing values.
missing_values(train).head()

Unnamed: 0,missing,percentage
winPlacePerc,1,2.2e-05
matchType,0,0.0
winPoints,0,0.0
weaponsAcquired,0,0.0
walkDistance,0,0.0


In [9]:
# Test-set columns with the highest share of missing values.
missing_values(test).head()

Unnamed: 0,missing,percentage
Id,0,0
groupId,0,0
weaponsAcquired,0,0
walkDistance,0,0
vehicleDestroys,0,0


In [10]:
# Column types of the training set
train.dtypes

Id                  object
groupId             object
matchId             object
assists              int64
boosts               int64
damageDealt        float64
DBNOs                int64
headshotKills        int64
heals                int64
killPlace            int64
killPoints           int64
kills                int64
killStreaks          int64
longestKill        float64
matchDuration        int64
matchType           object
maxPlace             int64
numGroups            int64
rankPoints           int64
revives              int64
rideDistance       float64
roadKills            int64
swimDistance       float64
teamKills            int64
vehicleDestroys      int64
walkDistance       float64
weaponsAcquired      int64
winPoints            int64
winPlacePerc       float64
dtype: object

In [11]:
# Keep the response variable aside, then remove it together with the columns
# that are irrelevant for the model (the test set has no response column).
target = train['winPlacePerc']
train.drop(columns=['Id', 'winPlacePerc', 'matchType', 'rankPoints'], inplace=True)
test.drop(columns=['Id', 'matchType', 'rankPoints'], inplace=True)

# Feature Engineering

In [None]:
def featureEngineering(df):
    """Append per-group and per-match aggregate features to every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the key columns ``matchId`` and ``groupId``; every other
        column is treated as a value column and aggregated.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the original columns
        followed by, for each value column ``<col>``:

        * ``<col>_mean``, ``<col>_max``, ``<col>_min`` - aggregates over the
          rows sharing the same ``(matchId, groupId)``;
        * ``<col>_match_mean`` - mean over the rows sharing the same ``matchId``;

        plus ``group_size`` (rows per ``(matchId, groupId)``) and
        ``max_players_in_group`` (largest ``group_size`` within the match).
        All joins are left joins against uniquely keyed aggregates, so the
        number of rows is unchanged.
    """
    group_keys = ['matchId', 'groupId']
    value_cols = [col for col in df.columns if col not in group_keys]

    by_group = df.groupby(group_keys)[value_cols]
    # Select the value columns explicitly so the object-typed 'groupId' is not
    # fed to mean(); pandas 2.x raises on non-numeric columns instead of
    # silently dropping them.
    by_match = df.groupby('matchId')[value_cols]

    df_size = df.groupby(group_keys).size().reset_index(name='group_size')
    df_mean = by_group.mean(numeric_only=True).reset_index()
    df_max = by_group.max().reset_index()
    df_min = by_group.min().reset_index()
    df_match_mean = by_match.mean(numeric_only=True).reset_index()
    # Derived from the group sizes computed above rather than from a second
    # groupby over a specific column, so it does not depend on 'kills' being
    # present or free of nulls.
    df_max_group_size = (
        df_size.groupby('matchId')['group_size']
        .max()
        .reset_index(name='max_players_in_group')
    )

    # The left frame keeps its original column names; the aggregate columns
    # receive the suffix.
    df = pd.merge(df, df_mean, suffixes=("", "_mean"), how='left', on=group_keys)
    df = pd.merge(df, df_max, suffixes=("", "_max"), how='left', on=group_keys)
    df = pd.merge(df, df_min, suffixes=("", "_min"), how='left', on=group_keys)
    df = pd.merge(df, df_match_mean, suffixes=("", "_match_mean"), how='left', on=['matchId'])
    df = pd.merge(df, df_size, how='left', on=group_keys)
    df = pd.merge(df, df_max_group_size, how='left', on=['matchId'])
    return df

In [None]:
# Rebuild both frames with the aggregate features added by featureEngineering.
# NOTE: each name is rebound to the function's result, so re-running this cell
# applies featureEngineering again to the already-augmented frame.
train = featureEngineering(train)
test = featureEngineering(test)

In [None]:
# Exclude the match id and the group id of each player (they are not used by the model).
train.drop(columns=['matchId', 'groupId'], inplace=True)
test.drop(columns=['matchId', 'groupId'], inplace=True)
# Names of the remaining training columns that contain an underscore.
train_columns_new = [column for column in train.columns if column.count("_") > 0]

In [None]:
# Use only the variables extracted from the match and from the group within the match.
train = train.loc[:, train_columns_new]
test = test.loc[:, train_columns_new]

In [None]:
# Number of columns per dtype for train, a separator line, then the same for test.
print(
    train.dtypes.value_counts(),
    "--------------------------",
    test.dtypes.value_counts(),
    sep="\n",
)

In [None]:
# Replace the missing target value with 0.
target = target.fillna(0)

# Modelo

In [None]:
# Light gradient boosting model
def run_lgb(train,target,test):
    """Fit a LightGBM regressor on a hold-out split and predict ``test``.

    ``train``/``target`` are split 80/20 (``random_state=2``); the 20% part is
    used as the validation set that drives early stopping.

    Parameters
    ----------
    train : feature matrix accepted by ``train_test_split`` and ``lgb.Dataset``.
    target : labels aligned row-by-row with ``train``.
    test : feature matrix to predict, with the same columns as ``train``.

    Returns
    -------
    tuple
        ``(fold_preds, model)`` - the predictions for ``test`` made at
        ``model.best_iteration``, and the object returned by ``lgb.train``.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        train, target, test_size=0.2, random_state=2
    )
    params = {
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'max_depth': -1,
        'num_leaves': 30,
        'feature_fraction': 0.9,
        # NOTE(review): per the LightGBM parameter docs bagging is only enabled
        # when bagging_freq is non-zero, so 'subsample' likely has no effect
        # here - confirm before relying on it.
        'subsample': 0.8,
        'min_data_in_leaf': 100,
        'lambda_l2': 4,
        'objective': 'regression_l2',
        'zero_as_missing': True,
        'metric': 'mae',
        'seed': 2,
    }

    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_valid, label=y_valid)

    # The early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callback equivalents are used instead.
    # log_evaluation replaced print_evaluation, so fall back to the old name on
    # releases that do not provide it.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [lgb.early_stopping(100), log_evaluation(500)]

    model = lgb.train(
        params,
        lgtrain,
        10000,
        valid_sets=[lgtrain, lgval],
        callbacks=callbacks,
    )
    fold_preds = model.predict(test, num_iteration=model.best_iteration)
    return fold_preds, model

In [None]:
# Train the model and obtain the predictions for the test set.
fold_preds, model = run_lgb(train,target,test)

In [None]:
# Clamp the predictions to the [0, 1] range in place, then store them in the
# submission frame.
np.clip(fold_preds, 0, 1, out=fold_preds)
subm['winPlacePerc'] = fold_preds

In [None]:
# Write the submission frame to CSV without the index column.
subm.to_csv("lgb_baseline.csv", index=False)