In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the raw competition files.
train = pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/train_V2.csv')
test = pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/test_V2.csv')

# Separate the label from the training features; ``pop`` returns the column
# and removes it from ``train`` in one step.
target = train.pop('winPlacePerc')

# Stack train on top of test so the feature engineering is applied to both at
# once. The original row labels are kept, so labels repeat across the two parts.
merge = pd.concat((train, test), axis=0)

In [None]:
# Preview the last rows of the training features (the target column has already been removed).
train.tail()

In [None]:
# Preview the last rows of the raw test frame.
test.tail()

In [None]:
# First rows of the combined frame; these come from the training part.
merge.head()

In [None]:
# Last rows of the combined frame; these come from the test part.
merge.tail()

In [None]:
def rstr(df, pred=None):
    """Print and return a per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    pred : str, optional
        Name of a numeric column of ``df``. When given, the summary gets an
        extra ``'corr <pred>'`` column holding every numeric column's
        correlation with it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``types``, ``counts``,
        ``distincts``, ``nulls``, ``missing ration``, ``uniques``,
        ``skewness``, ``kurtosis`` and, when ``pred`` is given,
        ``corr <pred>``. Statistics that only exist for numeric data are NaN
        on non-numeric rows.

    Raises
    ------
    KeyError
        If ``pred`` is given but is not a numeric column of ``df``.
    """
    obs = df.shape[0]
    types = df.dtypes
    counts = df.count()
    nulls = df.isnull().sum()
    # dropna=False keeps a missing value counted as its own distinct value,
    # matching ``len(column.unique())``.
    distincts = df.nunique(dropna=False)
    missing_ration = (nulls / obs) * 100
    # Built explicitly as an object Series (one ``[array]`` cell per column)
    # so the result does not depend on how ``DataFrame.apply`` infers the
    # shape of list-valued results.
    uniques = pd.Series({name: [col.unique()] for name, col in df.items()}, dtype=object)
    # Moment and correlation statistics are only defined for numeric columns;
    # restricting up front avoids a TypeError on string columns.
    numeric = df.select_dtypes(include='number')
    skewness = numeric.skew()
    kurtosis = numeric.kurt()
    print('Data shape: ', df.shape)

    parts = [types, counts, distincts, nulls, missing_ration, uniques, skewness, kurtosis]
    cols = ['types', 'counts', 'distincts', 'nulls', 'missing ration', 'uniques', 'skewness', 'kurtosis']
    if pred is not None:
        # The correlation column is only added when a target was requested,
        # so the number of labels always matches the number of series.
        parts.append(numeric.corr()[pred])
        cols.append('corr ' + pred)

    summary = pd.concat(parts, axis=1, sort=False)
    summary.columns = cols
    print('___________________________\nData types:\n', summary['types'].value_counts())
    print('___________________________')
    return summary

In [None]:
# pd.set_option('display.max_rows', None)
# details = rstr(merge, 'winPlacePerc')
# display(details.sort_values(by='corr winPlacePerc', ascending=False))

In [None]:
# Feature engineering on the combined train+test frame.
#
# Per-player statistics are combined into weighted scores, averaged over each
# team (``groupId``), and the raw inputs are dropped at the end so only the
# engineered features (plus the untouched original columns) remain.
#
# Missing values are filled with ``col = col.fillna(...)`` rather than
# ``merge[col].fillna(..., inplace=True)``: the chained in-place form operates
# on a temporary Series and does not update ``merge`` under pandas
# Copy-on-Write.
#
# NOTE(review): the numeric weights (0.81, 0.34, ...) are kept as-is; their
# origin is not documented in this notebook.
# NOTE(review): this cell overwrites ``killPlace``/``longestKill`` and drops
# its own inputs, so it cannot be re-run without reloading ``merge``.


def _group_mean(column):
    """Return ``merge[column]`` averaged within each ``groupId``, aligned to the rows of ``merge``."""
    # The string alias "mean" is used instead of ``np.mean``; passing the
    # NumPy callable to ``transform`` is deprecated in recent pandas.
    return merge.groupby("groupId")[column].transform("mean")


# Number of player rows in each match, used to rescale killPlace.
merge["group_size"] = merge.groupby("matchId")["Id"].transform("count")
merge["killPlace"] = merge["killPlace"] / merge["group_size"]
merge["distanceSum"] = 0.81*merge['walkDistance'] + 0.34*merge['rideDistance'] + 0.14*merge['swimDistance']
merge["item"] = 0.63*merge['boosts'] + 0.42*merge['heals']
merge["killPlace"] = -1*merge["killPlace"]

# Ratios are NaN for players with zero kills (0/0); treat those as 0.
merge["headshotRatio"] = (merge["headshotKills"] / merge["kills"]).fillna(0)
merge["killStreakRatio"] = (merge["killStreaks"] / merge["kills"]).fillna(0)
merge["killSum"] = 0.41*merge["kills"]+0.28*merge["DBNOs"]+0.3*merge["assists"]-0.01*merge["teamKills"]

# Team-level averages of the per-player scores.
merge["distanceMean"] = _group_mean("distanceSum")
merge["itemMean"] = _group_mean("item")
merge["killMean"] = _group_mean("killSum")
merge["killPlaceMean"] = _group_mean("killPlace").fillna(0)
merge["damageDealtMean"] = _group_mean("damageDealt").fillna(0)
merge["revivesMean"] = _group_mean("revives")
merge["meanWeapon"] = _group_mean("weaponsAcquired")

# Combined streak/headshot score, then the square root of its team average.
merge["streakAndHeadshot"] = (0.23*merge["killStreakRatio"]+0.18*merge["headshotRatio"])/2
merge = merge.drop(columns=["killStreakRatio", "headshotRatio"])
merge["streakAndHeadshotMean"] = (
    np.sqrt(_group_mean("streakAndHeadshot"))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
merge = merge.drop(columns=["streakAndHeadshot"])

# longestKill is replaced by its team average.
merge['longestKill'] = _group_mean("longestKill")

# Keep a copy of the identifier/match columns before they are dropped.
etc = merge[['numGroups', 'Id', 'groupId','matchId', 'matchType']]

# Remove identifiers and every raw column that fed an engineered feature.
merge = merge.drop(columns=[
    'vehicleDestroys','numGroups', 'Id', 'groupId','matchId', 'matchType','rankPoints','killPoints','winPoints','matchDuration',
    'roadKills','teamKills','maxPlace','item','kills','assists','DBNOs','killSum','killStreaks','headshotKills',
    'group_size','walkDistance', 'rideDistance','swimDistance','boosts', 'heals','damageDealt','killPlace','revives','distanceSum','weaponsAcquired',
])

In [None]:
# Square-root transform of the team-level averages.
# Negative inputs produce NaN under ``np.sqrt``; together with any +/-inf they
# are mapped to 0. The result is assigned back to the column instead of using
# ``merge[col].replace(..., inplace=True)``, because the chained in-place form
# acts on a temporary Series and leaves ``merge`` unchanged under pandas
# Copy-on-Write.
for col in ['revivesMean', 'meanWeapon', 'damageDealtMean', 'killMean']:
    merge[col] = (
        np.sqrt(merge[col])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

In [None]:
# Square-root transform of the team-averaged longestKill.
# Unlike the other sqrt-transformed columns, no inf/NaN clean-up is applied here.
merge['longestKill'] = np.sqrt(merge['longestKill'])

In [None]:
# Split the combined frame back into its train and test parts by position.
# ``target`` holds one label per training row, so its length is the boundary.
# Using it instead of hard-coded positions avoids the previous gap, where the
# train slice ended one row before the test slice began and that row was lost
# from both parts.
n_train = len(target)
train = merge.iloc[:n_train, :]
test = merge.iloc[n_train:, :]

In [None]:
# Re-attach the target as a column so it can be profiled together with the features.
# The column-wise concat aligns the two objects on their index labels.
# NOTE(review): not idempotent; re-running this cell appends another winPlacePerc column.
train = pd.concat([train, target],axis=1)

In [None]:
# Show every row of the summary table instead of pandas' truncated view
# (this is a global option and stays in effect for later cells).
pd.set_option('display.max_rows', None)
# Profile the training frame, including each column's correlation with the target.
details = rstr(train, 'winPlacePerc')
# List the columns from highest to lowest correlation with the target.
display(details.sort_values(by='corr winPlacePerc', ascending=False))

In [None]:
# Annotated correlation heatmap of the training frame (features + target).
corr_matrix = train.corr()
fig, ax = plt.subplots(figsize=(15, 15))
hm = sns.heatmap(corr_matrix, annot=True, annot_kws={'size': 14}, ax=ax)
# Pin the y-limits to the full extent of the grid (row 0 at the top).
# Some matplotlib releases clip the first and last heatmap rows in half;
# absolute limits fix that and are a no-op elsewhere, whereas nudging the
# current limits by +/-0.5 adds blank half-rows on unaffected versions.
hm.set_ylim(len(corr_matrix), 0)
plt.tight_layout()
plt.show()

In [None]:
# Detach the label column from the training frame again (``pop`` returns the
# column and removes it from ``train``), then replace missing labels with the
# mean of the observed labels.
raw_target = train.pop('winPlacePerc')
target = raw_target.fillna(raw_target.mean())

In [None]:
# Count missing values per feature column before imputation.
train.isnull().sum()

In [None]:
# Replace missing feature values with the mean of their column.
# NOTE(review): only ``train`` is imputed here; ``test`` is left as-is.
train = train.fillna(train.mean())

In [None]:
# Re-count missing values per column to confirm the imputation.
train.isnull().sum()

In [None]:
# Number of missing labels left after the mean fill.
target.isnull().sum()

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(train, target, train_size = 0.9, test_size = 0.1, random_state = 25) ## train,test size arranged

In [None]:
# lr = LinearRegression()
# ridge = Ridge(alpha = 1.0)
# Final model: gradient-boosted regression trees via XGBoost's scikit-learn wrapper.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled runtime — confirm before running on CPU.
# NOTE(review): XGBoost's docs list `sample_type` under the DART booster; no `booster` is set here, so verify it has any effect.
# NOTE(review): no random_state is passed although subsample=0.8 draws random row samples.
Xgb = XGBRegressor(max_depth=15,n_jobs=-1,n_estimators=2000,tree_method='gpu_hist',eval_metric='mae',
                   sampling_method='gradient_based', sample_type='weighted',subsample=0.8, learning_rate=0.2, reg_lambda=2.0)

In [None]:
# lr_model = lr.fit(x_train, y_train)
# ridge_model = ridge.fit(x_train, y_train)
# xgb_model = Xgb.fit(x_train, y_train)

In [None]:
# print("훈련 스코어(lr)     : %.4f" % lr_model.score(x_train, y_train))
# print("훈련 스코어(ridge)  : %.4f" % ridge_model.score(x_train, y_train))
# print("훈련 스코어(xgb)    : %.4f" % xgb_model.score(x_train, y_train))

In [None]:
# print("예측 스코어(lr)     : %.4f" % r2_score(y_test, lr_model.predict(x_test))) 
# print("예측 스코어(ridge)  : %.4f" % r2_score(y_test, ridge_model.predict(x_test)))
# print("예측 스코어(xgb)    : %.4f" % r2_score(y_test, xgb_model.predict(x_test)))

In [None]:
# print("예측 mae(lr)     : %.4f" % mean_absolute_error(y_test, lr_model.predict(x_test))) 
# print("예측 mae(ridge)  : %.4f" % mean_absolute_error(y_test, ridge_model.predict(x_test))) 
# print("예측 mae(xgb)    : %.4f" % mean_absolute_error(y_test, xgb_model.predict(x_test)))

In [None]:
# Fit on the whole training frame; the hold-out split cell above is commented out,
# so no validation score is computed for this model.
Xgb_model_final = Xgb.fit(train, target)

In [None]:
# Predict the target for the engineered test rows.
answer = Xgb_model_final.predict(test)

In [None]:
# Re-read the raw test file: the Id column was dropped from the engineered frame
# and is needed again for the submission.
test_v2 =  pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/test_V2.csv')

In [None]:
# Row identifiers for the submission file.
# NOTE(review): this name shadows Python's builtin ``id`` for the rest of the session.
id = test_v2['Id']

In [None]:
# Pair each test Id with its prediction; both must have the same number of rows.
# NOTE(review): predictions are written exactly as the model returns them, with no clipping to a fixed range.
submission = pd.DataFrame({'Id':id, 'winPlacePerc':answer})

In [None]:
# Check the last rows of the submission.
submission.tail()

In [None]:
# Write the submission to the working directory without the row index.
submission.to_csv("submission.csv", index=False)