In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
import gc, sys

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduceMemory(df, allow_float16=True):
    """Downcast integer and float columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and also returned, so both
    ``reduceMemory(df)`` and ``df = reduceMemory(df)`` work.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Object, string, boolean, unsigned, datetime and pandas extension columns
    are left exactly as they are. A column is never converted to a wider type
    than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as ``float16``.
        ``float16`` keeps only about three significant decimal digits, so pass
        False to stop at ``float32`` when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    import numpy as np

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain signed-int / float NumPy column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        for candidate in candidates:
            # Candidates are ordered by size: once they stop being smaller
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type limit still fits.
            # NaN/inf extremes fail these comparisons and leave the column as is.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
def _add_engineered_features(df):
    """Add the hand-crafted per-player ratio/sum columns to ``df`` in place and return it."""
    df['headshots_per_kills'] = df['headshotKills'] / (df['kills']+0.00001)
    df['killstreak_per_kills'] = df['killStreaks'] / (df['kills'] + 0.00001)
    df['kills_assists'] = df['kills'] + df['assists']
    df['teamwork'] = df['revives'] + df['assists']
    df['heals_boosts'] = df['heals'] + df['boosts']
    df['total_distance'] = df['walkDistance'] + df['rideDistance'] + df['swimDistance']
    df['damageDealt_per_heal_boost'] = df['damageDealt'] / (df['heals_boosts'] + 1)
    df['road_kills_per_rideDistance'] = df['roadKills'] / (df['rideDistance'] + 0.01)
    df['assists_per_kill'] = df['assists'] / (df['kills'] + df['assists'] + 0.0001)
    df.fillna(0,inplace=True)
    # This ratio has no epsilon in the denominator, so a zero matchDuration
    # would give inf/NaN; map those to 0 like every other missing value.
    df['walkdistance_per_second'] = (
        (df['walkDistance'] / df['matchDuration'])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    return df


def BuildFeature(is_train=True):
    """Read the train or test CSV and build the group/match level feature matrix.

    Parameters
    ----------
    is_train : bool, default True
        True reads ``train_V2.csv`` and produces one row per (matchId, groupId)
        together with the group-mean ``winPlacePerc`` label.
        False reads ``test_V2.csv`` and produces one row per test player, in
        the same order as the CSV, so predictions line up with the file.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix (``matchId`` / ``groupId`` dropped).
    y : numpy.ndarray or None
        float64 labels aligned with the rows of ``X`` (None for the test set).
    feature_names : list of str
        Column names of ``X``.
    test_idx : pandas.Series or None
        The ``Id`` column of the test set (None for the training set).
    """
    y = None
    test_idx = None
    target = 'winPlacePerc'
    group_keys = ['matchId', 'groupId']

    if is_train:
        print("Reading train.csv")
        df = pd.read_csv('../input/pubg-finish-placement-prediction/train_V2.csv')
    else:
        print("Reading test.csv")
        df = pd.read_csv('../input/pubg-finish-placement-prediction/test_V2.csv')

    df = _add_engineered_features(df)

    if is_train:
        # Only the training set is filtered: dropping test rows would leave
        # fewer predictions than there are rows to submit.
        df = df[df['maxPlace'] > 1]
        print("Read Labels")
        # Aggregate the labels before downcasting so the target keeps full precision.
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)
    else:
        test_idx = df.Id

    # Reduce the memory usage
    df = reduceMemory(df)

    print("Delete Unuseful Columns")
    excluded = {"Id", "matchId", "groupId", "matchType"}
    if is_train:
        excluded.add(target)
    features = [name for name in df.columns if name not in excluded]

    df_out = None
    for stat in ("mean", "max", "min"):
        print(f"Read Group {stat} features")
        agg = df.groupby(group_keys)[features].agg(stat)
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        if df_out is None:
            # Training rows are the sorted unique groups (same order as ``y``);
            # test rows stay one-per-player in file order.
            df_out = agg.reset_index()[group_keys] if is_train else df[group_keys]
        # No column names overlap here, so the raw statistic columns come in unsuffixed ...
        df_out = df_out.merge(agg.reset_index(), how='left', on=group_keys)
        # ... and are renamed by this merge, whose columns all collide with them.
        df_out = df_out.merge(agg_rank, suffixes=[f"_{stat}", f"_{stat}_rank"], how='left', on=group_keys)

    print("Read Group size features")
    agg = df.groupby(group_keys).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    print("Read Match mean features")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    # NOTE(review): nothing collides at this point, so the "_match_mean" suffix is never
    # applied and the match means keep the bare feature names. Left unchanged so the
    # returned column names stay the same; confirm whether a rename is wanted.
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    print("Read Match size features")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)
    X = df_out
    feature_names = list(df_out.columns)
    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y, feature_names, test_idx


In [None]:
# Build the feature matrices. The training call also yields the labels and the feature
# names; the test call yields the `Id` series of the test rows instead of labels.
X_train, y_train, train_columns, _ = BuildFeature(is_train=True)
X_test, _, _ , test_idx = BuildFeature(is_train=False)

In [None]:
# Downcast the assembled feature frames column by column to shrink their memory footprint.
X_train = reduceMemory(X_train)
X_test = reduceMemory(X_test)

In [None]:
import lightgbm as lgb 
import optuna
import sklearn.metrics
from xgboost import XGBRegressor
from optuna.integration import XGBoostPruningCallback
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Module-level slots used by the Optuna callback: it copies `gbm` into `best_booster`
# whenever the trial that just finished is the study's best trial.
# Presumably `gbm` is meant to hold the model trained by the latest trial.
best_booster = None
gbm = None
def objective(trial,random_state=22,n_jobs=1,early_stopping_rounds=50):
    """Optuna objective: train one regressor and return its validation MAE.

    The trial first chooses between XGBoost and LightGBM, then samples that
    library's hyperparameters. The model is fitted on 75% of the module-level
    ``X_train`` / ``y_train`` and scored on the remaining 25%.

    The fitted model is stored in the module-level ``gbm`` so that a study
    callback can keep the model belonging to the best trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters.
    random_state : int, default 22
        Seed for the train/validation split and for both libraries, so that
        every trial is scored on the same validation rows.
    n_jobs : int, default 1
        Number of threads given to XGBoost.
    early_stopping_rounds : int, default 50
        XGBoost stops adding trees after this many rounds without improvement
        on the validation fold.

    Returns
    -------
    float
        Mean absolute error on the validation fold. Lower is better, so the
        study running this objective should minimise it.
    """
    global gbm

    regressor_name = trial.suggest_categorical("regressor", ["XGBoost", "lightgbm"])
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train, y_train, test_size=0.25, random_state=random_state
    )

    if regressor_name == 'XGBoost':
        params = {
            "verbosity": 0,  # 0 (silent) - 3 (debug)
            "objective": "reg:squarederror",
            "n_estimators": 10000,
            "max_depth": trial.suggest_int("max_depth", 4, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.6, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 0.8, log=True),
            "alpha": trial.suggest_float("alpha", 0.01, 10.0, log=True),
            "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
            # gamma needs its own parameter name; sharing "lambda" would tie the two values together.
            "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
            "min_child_weight": trial.suggest_float("min_child_weight", 10, 1000, log=True),
            "seed": random_state,
            "n_jobs": n_jobs,
            # 10000 trees is only an upper bound; training stops once the
            # validation score stalls for this many rounds.
            "early_stopping_rounds": early_stopping_rounds,
        }
        model = XGBRegressor(**params)
        model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=False)
    else:
        param = {
            # The target is a continuous value, so train a regressor scored by MAE.
            "objective": "regression",
            "metric": "mae",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "seed": random_state,
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        }
        dtrain = lgb.Dataset(train_x, label=train_y)
        model = lgb.train(param, dtrain)

    # Score the raw (unrounded) predictions.
    preds = model.predict(valid_x)
    mae = sklearn.metrics.mean_absolute_error(valid_y, preds)

    # Expose the fitted model to the study callback.
    gbm = model
    return mae

# Step 4: Running it


In [None]:
def callback(study, trial):
    """Optuna callback that remembers the model of the best trial so far.

    After each trial, if that trial is the study's current best, the
    module-level ``gbm`` is copied into the module-level ``best_booster``.

    Parameters
    ----------
    study : optuna.study.Study
        The running study.
    trial : optuna.trial.FrozenTrial
        The trial that has just finished.
    """
    global best_booster
    # Pruned or failed trials can never be the best one, and asking for
    # study.best_trial before any trial has completed raises ValueError.
    if trial.state != optuna.trial.TrialState.COMPLETE:
        return
    if study.best_trial.number == trial.number:
        best_booster = gbm

In [None]:
if __name__ == "__main__":
    # The objective returns a mean absolute error, so the best trial is the one with the LOWEST value.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100, callbacks=[callback])

In [None]:
# The study callback stores the best trial's model in the module-level `best_booster`;
# the objective function itself has no such attribute.
best_model = best_booster
if best_model is None:
    raise RuntimeError("No model was captured by the study callback; run the study before predicting.")

In [None]:
# Summarise the search: the best trial, its hyperparameters and its objective value.
best_params = study.best_params

print(f"The best trial is : \n{study.best_trial}")
print(f"The best parameters are : \n{best_params}")
# A Study has no `StudySummary` attribute; the objective value of the best trial is `best_value`.
print(f"The best value is : \n{study.best_value}")

In [None]:
# Release the training matrix and labels, then force a garbage-collection pass to free memory.
del X_train
del y_train
gc.collect()

In [None]:
# Predict on the test feature matrix with the model kept from the hyperparameter search.
y_pred_test =best_model.predict(X_test)

In [None]:
# Reload the raw CSVs and downcast them with reduceMemory.
# NOTE(review): df_train is not referenced by any later cell visible here; confirm it is
# needed, since loading the full training file again costs time and memory.
df_train = reduceMemory(pd.read_csv('../input/pubg-finish-placement-prediction/train_V2.csv'))
df_test = reduceMemory(pd.read_csv('../input/pubg-finish-placement-prediction/test_V2.csv'))

In [None]:
# Clamp the predictions in place to the closed interval [0, 1].
y_pred_test[:] = np.clip(y_pred_test, 0, 1)

In [None]:
# Attach the predictions to the raw test rows and write the two-column submission file.
# NOTE(review): assumes y_pred_test has exactly one entry per row of df_test, in the same
# order; a length mismatch makes the column assignment fail. Confirm against BuildFeature.
df_test['winPlacePerc'] = y_pred_test
submission = df_test[['Id', 'winPlacePerc']]
submission.to_csv('submission.csv', index=False)