## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
import gc, sys
gc.enable()

## Exploratory Data Analysis (EDA)

### Reading dataset:
Let's have a look at our dataset.

In [None]:
# Load the full training CSV from the Kaggle input directory and preview its first rows.
df = pd.read_csv('../input/pubg-finish-placement-prediction/train_V2.csv')
df.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype summary and overall memory footprint of the training frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

That's a pretty clean dataset.

## Visualization of some attributes.

In [None]:
# Distribution of headshot-kill counts among players with at least one headshot.
# The column is passed by name (x=..., data=...) instead of as a bare positional
# Series: newer seaborn releases treat the first positional argument as `data`,
# not as the x variable. Numeric categories are already drawn in ascending
# order, so no explicit sort is needed.
headshot_kill = df[df['headshotKills']>0]
plt.figure(figsize=(15,5))
sns.countplot(x='headshotKills', data=headshot_kill)

In [None]:
# Restrict to players with at least one kill AND at least one headshot kill,
# then plot kills as a function of headshot kills.
new_df = df[(df['kills'] > 0) & (df['headshotKills'] > 0)]
sns.lineplot(data=new_df, x='headshotKills', y='kills')

As expected, the number of kills rises with the number of headshot kills during gameplay.

In [None]:
# Kills as a function of DBNOs (the knock-down count column) over the whole training set.
sns.lineplot(x='DBNOs', y='kills', data=df)

An expected relationship. We all try to finish off an enemy we have knocked out, but sometimes their teammates revive them; in such cases the damage dealt is high even though the number of kills is lower.

In [None]:
# Joint distribution of final placement and kill streaks, with marginal distributions.
sns.jointplot(data=df, x='winPlacePerc', y='killStreaks', color='b', ratio=3)
plt.show()

Here we can see that teams or players with high kill streaks have a good chance of bagging the chicken dinner.

In [None]:
# Kills as a function of final placement, drawn on a wide canvas.
plt.figure(figsize=(15, 10))
sns.lineplot(data=df, x='winPlacePerc', y='kills')

Variation of kills with winPlacePerc. There is variation even among teams with a low winPlacePerc, but the graph looks good ;)

In [None]:
# Players who finished first (winPlacePerc == 1) while dealing less than 1000 damage.
xy = df[(df['winPlacePerc'] == 1) & (df['damageDealt'] < 1000)]
xy

This subset shows that many teams dealt less than 1000 damage and still finished with a winPlacePerc of 1. Some players even dealt less than 100 damage, which is less than it takes to kill a single opponent. This may indicate that they spent the whole game just running to the safe zones, or kept hiding until only the last enemy team was left, which may be considered a tactic but not fair play. XD

In [None]:
# Joint distribution of final placement and DBNOs, with marginal distributions.
sns.jointplot(data=df, x='winPlacePerc', y='DBNOs', color='r', ratio=3)
plt.show()

In [None]:
# Joint distribution of final placement and the teamKills column.
sns.jointplot(x='winPlacePerc', y='teamKills', data=df, ratio=3, color='g')

Comparing team kills with winPlacePerc, we can see that teams/solo players who kill many teams sometimes also get killed themselves, out of excitement and from charging in at full speed. The most variation is seen at the two extreme ends.

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are dropped before calling corr(): pandas >= 2.0 no longer
# silently skips them and raises when it cannot convert their values to float.
f,ax = plt.subplots(figsize=(15, 15))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt= '.2f',ax=ax)

### Since the data is huge, we ran into memory usage problems. The `reduce_mem_usage` function below helps us tackle that.

In [None]:
# Reduce the usage of memory
# Ref: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * Integer columns (signed or unsigned numpy dtypes) are cast to the
      smallest of int8/int16/int32/int64 whose range contains the column's
      min and max (bounds inclusive).
    * Float columns are cast to float16 if their range fits, else float32,
      else float64. NOTE: float16/float32 keep fewer significant digits, so
      this step is lossy for the stored values.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, already-categorical, nullable
      extension types, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with downcast columns.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Classify by numpy kind; extension dtypes only matter here when they
        # hold strings (e.g. the pandas string dtype).
        if isinstance(col_type, np.dtype):
            kind = col_type.kind
        elif pd.api.types.is_string_dtype(col_type):
            kind = 'O'
        else:
            kind = None

        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN here; every comparison is then False
            # and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Infinite, all-NaN or out-of-range columns stay at full precision.
                df[col] = df[col].astype(np.float64)
        # Any other dtype is intentionally left as-is.

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Feature Engineering

In [None]:
def feature_engineering(is_train=True):
    """Build group- and match-level aggregate features from the competition CSVs.

    Reads ``train_V2.csv`` (``is_train=True``) or ``test_V2.csv`` from the
    Kaggle input directory, adds ``totalDistance`` and then, for every numeric
    feature, computes per-(matchId, groupId) mean/sum/max/min together with
    the within-match percentile rank of each aggregate, plus group size,
    per-match means and match size.

    Parameters
    ----------
    is_train : bool, default True
        Selects the training file (and target extraction) or the test file.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix. For training data there is one row per
        (matchId, groupId) group; for test data one row per input row, in the
        original file order.
    y : numpy.ndarray or None
        Per-group mean of ``winPlacePerc`` in the same group order as ``X``
        when ``is_train`` is True, otherwise ``None``.
    feature_names : list of str
        Column names matching the columns of ``X``.
    """
    if is_train:
        print("processing train.csv")
        df = pd.read_csv("../input/pubg-finish-placement-prediction/train_V2.csv")
        # NOTE(review): presumably matches with maxPlace <= 1 carry no usable
        # placement signal - confirm against the data description.
        df = df[df['maxPlace'] > 1]
    else:
        print("processing test.csv")
        df = pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv")

    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]

    print("remove some columns")
    target = 'winPlacePerc'
    group_keys = ['matchId', 'groupId']
    features = list(df.columns)
    for non_feature in ("Id", "matchId", "groupId", "matchType"):
        features.remove(non_feature)

    y = None

    print("get target")
    if is_train:
        # groupby sorts by the keys, which is the same order the aggregates
        # below come out in, so y lines up with the rows of X.
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    # Training output is one row per group (seeded from the first aggregate
    # below); test output keeps one row per player so predictions map back to
    # the rows of the test file.
    df_out = None if is_train else df[group_keys]

    # (statistic, whether to compact df_out afterwards). Memory is compacted
    # after every second statistic.
    group_stats = (("mean", False), ("sum", True), ("max", False), ("min", True))
    for stat, compact_after in group_stats:
        print("get group {} feature".format(stat))
        agg = df.groupby(group_keys)[features].agg(stat)
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
        agg = agg.reset_index()
        if df_out is None:
            df_out = agg[group_keys]
        # First merge brings in the raw aggregate under the plain feature
        # names; the second merge collides on those names, which renames the
        # aggregate to <feature>_<stat> and its rank to <feature>_<stat>_rank.
        df_out = df_out.merge(agg, how='left', on=group_keys)
        df_out = df_out.merge(
            agg_rank,
            suffixes=["_" + stat, "_" + stat + "_rank"],
            how='left',
            on=group_keys,
        )
        if compact_after:
            df_out = reduce_mem_usage(df_out)

    print("get group size feature")
    agg = df.groupby(group_keys).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    print("get match mean feature")
    # Renamed explicitly: merge suffixes only apply to overlapping column
    # names, and no plain feature name is left in df_out at this point, so a
    # suffix argument would silently leave these columns unlabelled.
    agg = (
        df.groupby(['matchId'])[features]
        .agg('mean')
        .add_suffix('_match_mean')
        .reset_index()
    )
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    df_out = reduce_mem_usage(df_out)

    # The identifiers were only needed as merge keys.
    df_out = df_out.drop(columns=group_keys)

    X = np.array(df_out)
    feature_names = list(df_out.columns)

    # Release the large intermediates before returning.
    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y, feature_names

In [None]:
# Build the training feature matrix, the target vector and the matching feature names.
x_train, y, feature_names = feature_engineering(True)

### Splitting data into training and validation sets.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split (x_train, y, test_size=0.33, random_state=42)

## Importing Libraries for model building

In [None]:
import os
import time
import warnings
warnings.filterwarnings("ignore")
# data manipulation

import numpy as np
import pandas as pd
# plot
import matplotlib.pyplot as plt
# model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

## Model Structure

In [None]:
# Wrap the train/validation splits in LightGBM datasets.
train_data=lgb.Dataset(X_train, label=y_train)
val_data= lgb.Dataset(X_val, label=y_val)

# The target is continuous, so train a regressor and monitor mean absolute
# error - the same quantity printed below. AUC is a binary-classification
# metric and gives no meaningful early-stopping signal here.
params = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 144,
    'learning_rate': 0.1,
    'max_depth':12,
    'max_bin':55,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'feature_fraction':0.9,
    'verbose':50,
    # Stop when the validation metric has not improved for 100 rounds. Kept
    # in params only: the equivalent lgb.train keyword was removed in
    # LightGBM 4.0.
    'early_stopping_rounds':100
    }

# The round budget is given once, through num_boost_round. Previously
# 'n_estimators': 1500 in params (an alias of the same setting) overrode the
# num_boost_round=3000 argument, so 1500 is the limit that actually applied.
lgb_model= lgb.train(params, train_data, num_boost_round=1500, valid_sets=[val_data])
y_pred=lgb_model.predict(X_val)
print(mean_absolute_error(y_val,y_pred))

### Prediction on Validation dataset

In [None]:
# Score the trained model on the held-out split, then release the validation
# arrays to free memory before the test set is processed.
y_pred = lgb_model.predict(X_val)
print(mean_absolute_error(y_val, y_pred))
del X_val, y_val

In [None]:
# Build the test feature matrix; feature_engineering returns None as the target for test data.
x_test, y_test, feature_names = feature_engineering(False)

In [None]:
# Predict the target for every row of the test feature matrix.
y_test_pred=lgb_model.predict(x_test)

### Saving submission.csv file

In [None]:
# Build the submission file: one row per test player Id.
# Only the Id column is read back; the predictions are in the same row order
# as the test CSV because feature_engineering(False) keeps one row per input
# row and only left-merges onto it.
test_ids = pd.read_csv(
    "../input/pubg-finish-placement-prediction/test_V2.csv", usecols=['Id']
)
if len(test_ids) != len(y_test_pred):
    raise ValueError(
        "prediction count ({}) does not match test row count ({})".format(
            len(y_test_pred), len(test_ids)
        )
    )

submission = pd.DataFrame({
    'Id': test_ids['Id'],
    # winPlacePerc is a placement fraction, so a regressor output outside
    # [0, 1] can only add error; clamp it.
    'winPlacePerc': np.clip(y_test_pred, 0.0, 1.0),
})
submission.to_csv('submission.csv', index=False)