In [None]:
#LOAD REQUIRED LIBRARIES
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of a dataframe in place to reduce memory usage.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range (bounds included) holds the column's min and max. Floating-point
    columns are cast to float32 when their range fits, otherwise to float64.
    Every other dtype (object, bool, unsigned, datetime, extension dtypes such
    as category or nullable types) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting. A before/after summary is
        printed as a side effect.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not plain NumPy dtypes and cannot be range-checked here.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            # Narrowest type first; inclusive bounds so that e.g. 127 still fits int8.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # float16 is deliberately not used: its precision is too coarse for these features.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting frame.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease_pct))
    return df

In [None]:
# Read each competition CSV and downcast its numeric columns straight away.
train = reduce_mem_usage(pd.read_csv("../input/pubg-finish-placement-prediction/train_V2.csv"))
test = reduce_mem_usage(pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv"))

In [None]:
# Preview the first five rows of the training set
train.head()

In [None]:
# Summary of the training set (column dtypes, non-null counts, memory usage)
train.info()

In [None]:
# Preview the last five rows of the training set
train.tail()

## Exploratory Data Analysis

In [None]:
# Number of distinct values in each identifier column (Id, groupId, matchId)
for i in ['Id','groupId','matchId']:
    print(f'unique [{i}] count:', train[i].nunique())

### Exploring Different Match Type
PUBG offers 3 different game modes:
- Solo - One can play alone (solo,solo-fpp,normal-solo,normal-solo-fpp)
- Duo - Play with a friend (duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp)
- Squad - Play in a team of up to 4 players (squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp)

In [None]:
# Compare the raw matchType distribution with the collapsed solo/duo/squad one.
# Each match is counted once (first row per matchId).
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

train.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[0])
ax[0].set_title('Matches per raw matchType')

def mapper(x):
    """Collapse a raw matchType string into 'solo', 'duo' or 'squad'.

    Anything containing 'solo' is solo; anything containing 'duo' or 'crash'
    is duo; everything else (including the flare modes) is squad.
    """
    if 'solo' in x:
        return 'solo'
    if 'duo' in x or 'crash' in x:
        return 'duo'
    return 'squad'

# Overwrites train['matchType']; every later cell sees only the three collapsed values.
train['matchType'] = train['matchType'].apply(mapper)
train.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[1])
ax[1].set_title('Matches per collapsed matchType')
plt.show()

## Player Analysis in match and group

In [None]:
def mergeList(list1,list2):
    """Return every (a, b) pair with a from list1 and b from list2, list1 varying slowest."""
    pairs = itertools.product(list1, list2)
    return [*pairs]
# Row counts per match and per group within a match, keyed by the (collapsed) matchType.
match = train.groupby(['matchType','matchId']).size().to_frame('players in match')
group = train.groupby(['matchType','matchId','groupId']).size().to_frame('players in group')
# describe() output is indexed by (column, statistic) pairs; mergeList builds those keys
# so only min/mean/max are kept. The two summaries are shown side by side per matchType.
pd.concat([match.groupby('matchType').describe()[mergeList(['players in match'],['min','mean','max'])], 
           group.groupby('matchType').describe()[mergeList(['players in group'],['min','mean','max'])]], axis=1)

In [None]:
# Five largest groups by row count; the index shows their (matchType, matchId, groupId)
print(group['players in group'].nlargest(5))

In [None]:
# Details of the players in one particular match and one of its groups
match_id, group_id = '3e029737889ce9', 'b8275198faa03b'
subset = train[train['matchId']==match_id]
sub_grp = subset[subset['groupId']==group_id]
separator = '-'*50

print(f"matchId =='{match_id}' & groupId =='{group_id}'")
print(separator)
# Match-level figures: rows, distinct groups, and the recorded numGroups/maxPlace values
print('players:',len(subset))
print('groups:',subset['groupId'].nunique())
print('numGroups:',subset['numGroups'].unique())
print('maxPlace:',subset['maxPlace'].unique())
print(separator)
# Figures for the selected group only
print('max-group players:',len(sub_grp))
print('max-group winPlacePerc:',sub_grp['winPlacePerc'].unique())
print(separator)
# All distinct target values that occur in this match, ascending
print('winPlacePerc:',subset['winPlacePerc'].sort_values().unique())

In [None]:
# Heatmap of pairwise correlations between the numeric features and winPlacePerc.
# Only numeric columns are correlated: the identifier and matchType columns hold
# strings, and recent pandas versions raise instead of silently skipping them.
f,ax = plt.subplots(figsize=(20, 15))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, fmt= '.1f',cmap="BrBG", ax=ax)
plt.show()

## Feature Engineering

In [None]:
# Mean, 99th percentile and maximum of the kills column
kill_mean = train['kills'].mean()
kill_p99 = train['kills'].quantile(0.99)
kill_max = train['kills'].max()
summary = "On an average a person kills {:.4f} players, 99% of people have {} kills or less, while the maximum kills ever recorded is {}."
print(summary.format(kill_mean, kill_p99, kill_max))

In [None]:
# Kill-count distribution, with everything above the 99th percentile pooled into one bar.
kill_counts = train['kills']
kills_cap = int(kill_counts.quantile(0.99))
overflow_label = '{}+'.format(kills_cap + 1)
# Relabel only the kills values (not whole rows), and work on a Series instead of a full copy.
kill_labels = kill_counts.astype(int).astype(str).where(kill_counts <= kills_cap, overflow_label)
# Explicit bar order: string sorting would misplace two-digit counts.
bar_order = [str(k) for k in range(kills_cap + 1)] + [overflow_label]

plt.figure(figsize=(15,8))
sns.countplot(x=kill_labels, order=bar_order)
plt.title("Kill Count")
plt.show()

In [None]:
# Distribution of damage dealt by players who finished with zero kills.
# After this cell `df` stays in the namespace as the zero-kill subset of train.
df = train.copy()
df = df[df['kills']==0]
plt.figure(figsize=(15,8))
plt.title("Damage Dealt by 0 killers")
sns.distplot(df['damageDealt'])
plt.show()

In [None]:
# Winners (winPlacePerc == 1) with zero kills, counted directly from train so the
# figure does not depend on whatever `df` another cell left behind.
zero_kill_wins = ((train['kills'] == 0) & (train['winPlacePerc'] == 1)).sum()
print("A total of {} players ({:.2f}%) have won without a single kill!".format(zero_kill_wins, 100*zero_kill_wins/len(train)))
# Winners who dealt no damage at all; df1 is kept as the zero-damage subset.
df1 = train[train['damageDealt'] == 0].copy()
zero_damage_wins = (df1['winPlacePerc'] == 1).sum()
print("A total of {} players ({:.2f}%) have won without dealing damage!".format(zero_damage_wins, 100*zero_damage_wins/len(train)))

## Effect of Killing on Winning Percentage

In [None]:
# Box plot of winPlacePerc per kill-count bucket.
kills = train.copy()
# The last bin is open-ended so that the '10+_kills' label really covers every count above 10
# (a fixed upper edge would turn larger counts into NaN and drop them from the plot).
kills['killsCategories'] = pd.cut(kills['kills'], [-1, 0, 2, 5, 10, np.inf], labels=['0_kills','1-2_kills', '3-5_kills', '6-10_kills', '10+_kills'])
plt.figure(figsize=(15,8))
sns.boxplot(x="killsCategories", y="winPlacePerc", data=kills)
plt.show()

In [None]:
# Joint distribution of winPlacePerc and kills over the whole training set
sns.jointplot(x="winPlacePerc", y="kills",  data=train, height=10, ratio=3, color="blue")
plt.show()

## Correlation of Walking Distance with Winning Percentage

In [None]:
# Players who never moved on foot. Count and share are both taken over the full
# training set, not over the zero-kill / zero-damage subsets left by earlier cells.
zero_walk = (train['walkDistance'] == 0).sum()
print("{} players ({:.2f}%) walked 0 meters. This means that either they die before even taking a step or they have just joined the game but are away from keyboard.".format(zero_walk, 100*zero_walk/len(train)))

In [None]:
# Distribution of walkDistance, restricted to rows below the 99th percentile
# so the long tail does not stretch the axis.
df = train.copy()
df = df[df['walkDistance'] < train['walkDistance'].quantile(0.99)]
plt.figure(figsize=(15,10))
plt.title("Walking Distance Distribution",fontsize=15)
sns.distplot(df['walkDistance'])
plt.show()

In [None]:
# Joint distribution of winPlacePerc and walkDistance over the whole training set
sns.jointplot(x="winPlacePerc", y="walkDistance",  data=train, height=10, ratio=3, color="blue")
plt.show()

## Analyze Riding Feature

In [None]:
# Distribution of rideDistance, restricted to rows below the 99th percentile.
# After this cell `df` stays in the namespace as that filtered subset.
df = train.copy()
df = df[df['rideDistance'] < train['rideDistance'].quantile(0.99)]
plt.figure(figsize=(15,8))
plt.title("Ride Distance Distribution",fontsize=15)
sns.distplot(df['rideDistance'])
plt.show()

In [None]:
# Point plot of winPlacePerc for each vehicleDestroys count.
# NOTE(review): this reads `df`, i.e. whatever frame the previously executed cell left
# behind (the rideDistance-filtered subset when run top to bottom) - confirm that is intended.
f,ax1 = plt.subplots(figsize =(15,8))
sns.pointplot(x='vehicleDestroys',y='winPlacePerc',data=df,color='lime',alpha=0.5)
plt.xlabel('Count of Vehicle Destroys',fontsize = 16,color='blue')
plt.ylabel('Win Percentage',fontsize = 16,color='blue')
plt.title('Vehicle Destroyed/ Win Ratio',fontsize = 20,color='blue')
plt.grid()
plt.show()

## Analyzing Healing & Boosting on Winning Percentage

In [None]:
# Keep rows below the 99th percentile of heals, then below the 99th percentile of boosts.
# The boosts threshold is computed on the already heals-filtered frame.
df = train.copy()
df = df[df['heals'] < df['heals'].quantile(0.99)]
df = df[df['boosts'] < df['boosts'].quantile(0.99)]

# Both point plots share one axes; the text labels act as a hand-made legend.
f,ax1 = plt.subplots(figsize =(15,8))
sns.pointplot(x='heals',y='winPlacePerc',data=df,color='lime',alpha=0.8)
sns.pointplot(x='boosts',y='winPlacePerc',data=df,color='blue',alpha=0.8)
plt.text(4,0.6,'Heals',color='lime',fontsize = 16,style = 'italic')
plt.text(4,0.55,'Boosts',color='blue',fontsize = 16,style = 'italic')
plt.xlabel('Number of heal/boost items',fontsize = 16,color='blue')
plt.ylabel('Win Percentage',fontsize = 16,color='blue')
plt.title('Heals vs Boosts',fontsize = 20,color='blue')
plt.grid()
plt.show()

In [None]:
# Joint distribution of winPlacePerc and heals over the whole training set
sns.jointplot(x="winPlacePerc", y="heals", data=train, height=10, ratio=3, color="lime")
plt.show()

In [None]:
# Joint distribution of winPlacePerc and boosts over the whole training set
sns.jointplot(x="winPlacePerc", y="boosts", data=train, height=10, ratio=3, color="lime")
plt.show()

## Effect of Weapons Acquired 

In [None]:
# Keep rows below the 99th percentile of weaponsAcquired.
# NOTE(review): the jointplot in the next cell reads `train`, not this filtered `df` - confirm
# whether the filter was meant to feed that plot.
df = train.copy()
df = df[df['weaponsAcquired'] < train['weaponsAcquired'].quantile(0.99)]

In [None]:
# Joint distribution of winPlacePerc and weaponsAcquired over the whole training set
sns.jointplot(x="winPlacePerc", y="weaponsAcquired", data=train, height=10, ratio=3, color="lime")
plt.show()

## Detect Outliers

In [None]:
# Show the rows whose target (winPlacePerc) is missing
train[train['winPlacePerc'].isnull()]

In [None]:
# Remove, in place, every row whose target (winPlacePerc) is missing, then report the new shape
train.dropna(subset=['winPlacePerc'], inplace=True)
train.shape

## Generate New Features and Remove Outliers

In [None]:
# Engineer a new feature totalDistance: the sum of the ride, walk and swim distances
train['totalDistance'] = train['rideDistance'] + train['walkDistance'] + train['swimDistance']

In [None]:
# Engineer headshot_rate feature --- headshots made per kill
train['headshot_rate'] = train['headshotKills'] / train['kills']
# Rows where the division gave NaN (e.g. 0 headshots / 0 kills) get a rate of 0
train['headshot_rate'] = train['headshot_rate'].fillna(0)

In [None]:
#Defining some functions for plotting graphs, we will be needing a lot of countplot and distplot
def show_countplot(column):
    """Draw a 15x8 count plot of ``train[column]``, titled with the column name, and show it."""
    plt.figure(figsize=(15,8))
    axes = sns.countplot(data=train, x=column)
    axes.set_title(column)
    plt.show()
    
def show_distplot(column):
    """Draw a 15x8 distribution plot (50 bins) of ``train[column]`` and show it."""
    plt.figure(figsize=(15, 8))
    values = train[column]
    # NOTE(review): distplot is flagged as deprecated by recent seaborn releases - confirm the installed version.
    sns.distplot(values, bins=50)
    plt.show()

In [None]:
# Is it even possible to get 40 or more kills while acquiring more than 55 weapons and covering a total distance of less than 100m?
train[(train['kills'] >= 40) & (train['weaponsAcquired'] > 55) & (train['totalDistance'] < 100.0)]

In [None]:
# Is it even possible to get 40 or more kills without using any heals?
train[(train['kills'] >= 40) & (train['heals'] == 0)]

In [None]:
# Drop 'fraudsters' with implausible kill counts (in place), using the same two filters shown above
train.drop(train[(train['kills'] >= 40) & (train['weaponsAcquired'] > 55) & (train['totalDistance'] < 100.0)].index, inplace=True)
train.drop(train[(train['kills'] >= 40) & (train['heals'] == 0)].index, inplace=True)

## 100% headshot kills

In [None]:
# Plot the distribution of headshot_rate (headshot kills per kill, 0 where undefined)
show_distplot('headshot_rate')

In [None]:
# Players with 10 or more kills, every one of them a headshot (headshot_rate == 1); first five rows
train[(train['headshot_rate'] == 1) & (train['kills'] >= 10)].head()

## Longest Kill

In [None]:
# Distribution of the longestKill column
show_distplot('longestKill')

In [None]:
# Drop longestKill 'fraudsters': rows with longestKill of 1000 or more (in place)
train.drop(train[train['longestKill'] >= 1000].index, inplace=True)

## Road Kill

In [None]:
# Distribution of the roadKills column
show_distplot('roadKills')

In [None]:
# Drop RoadKill 'fraudsters': rows with 10 or more roadKills (in place)
train.drop(train[train['roadKills'] >= 10].index, inplace=True)

## Walk Distance

In [None]:
# Distribution of the walkDistance column
show_distplot('walkDistance')

In [None]:
# Drop walkdistance 'fraudsters': walked 13000 or more with zero kills
train.drop(train[(train['walkDistance'] >= 13000) & (train['kills'] == 0)].index, inplace=True)
# Drop ridedistance 'fraudsters': rode 30000 or more with zero kills
train.drop(train[(train['rideDistance'] >= 30000) & (train['kills'] == 0)].index, inplace=True)
# Drop swimdistance 'fraudsters': swam 2000 or more, regardless of kills
train.drop(train[train['swimDistance'] >= 2000].index, inplace=True)

In [None]:
# How is it even possible that a player is able to ride and kill without walking even a single meter ?
# NOTE: neither of the two bare expressions below is the last statement of the cell,
# so the notebook does not display their results.
train[(train['walkDistance'] == 0) & (train['rideDistance'] > 0) & (train['kills'] > 0)]
# What was the player doing in the game when total distance travelled by him/her is 0? 
train[(train['totalDistance'] == 0)]
# Drop both groups of rows in place
train.drop(train[(train['walkDistance'] == 0) & (train['rideDistance'] > 0) & (train['kills'] > 0)].index, inplace = True)
train.drop(train[(train['totalDistance'] == 0)].index, inplace=True)

## Weapons Acquired

In [None]:
# First rows with 80 or more weapons acquired
train[train['weaponsAcquired'] >= 80].head()

In [None]:
# Remove outliers: rows with 80 or more weapons acquired (in place)
train.drop(train[train['weaponsAcquired'] >= 80].index, inplace=True)

## Heals

In [None]:
# Distribution of the heals column
show_distplot('heals')

In [None]:
# First rows with 40 or more healing items used
train[train['heals'] >= 40].head()

In [None]:
# Remove outliers: rows with 40 or more heals (in place)
train.drop(train[train['heals'] >= 40].index, inplace=True)

In [None]:
# Shape of the training set after the outlier removal above
train.shape

In [None]:
# Correlation heatmap of the cleaned data, now including the engineered features.
# Only numeric columns are correlated: the identifier and matchType columns hold
# strings, and recent pandas versions raise instead of silently skipping them.
f,ax = plt.subplots(figsize=(20, 15))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, fmt= '.1f',cmap="BrBG", ax=ax)
plt.show()

In [None]:
# Remove features judged to have (near) zero correlation with the target; columns are dropped in place
train.drop(['killPoints','matchDuration','maxPlace','numGroups','rankPoints','roadKills','teamKills','winPoints','vehicleDestroys','headshot_rate','swimDistance'], axis=1, inplace=True)

In [None]:
# Shape after dropping the low-correlation columns
train.shape

In [None]:
# Preview the remaining columns
train.head()

## Final Prediction

In [None]:
# Split rows by whether the target is present.
# NOTE(review): rows with a null winPlacePerc were dropped from train earlier in this
# notebook, so X_test is presumably empty here - confirm this cell is still needed.
X_train = train[train['winPlacePerc'].notnull()].reset_index(drop=True)
X_test = train[train['winPlacePerc'].isnull()].drop(['winPlacePerc'], axis=1).reset_index(drop=True)

# pop() removes the target column from X_train and returns it
Y_train = X_train.pop('winPlacePerc')
X_test_grp = X_test[['matchId','groupId']].copy()
train_matchId = X_train['matchId']

# drop the identifier columns (matchId, groupId, Id) from both feature frames
X_train.drop(['matchId','groupId','Id'], axis=1, inplace=True)
X_test.drop(['matchId','groupId','Id'], axis=1, inplace=True)

print(X_train.shape, X_test.shape)

## Categorical Variables

In [None]:
# Drop the identifier columns from train itself (in place); they are not used as model features
train.drop(columns=['groupId', 'matchId','Id'], inplace=True)

## Convert categorical Variables to float

In [None]:
# One hot encode matchType (already collapsed to solo/duo/squad earlier in the notebook)
train = pd.get_dummies(train, columns=['matchType'])
# Take a look at the encoding: every column whose name contains 'matchType'
matchType_encoding = train.filter(regex='matchType')
matchType_encoding

In [None]:
# Preview the frame after one-hot encoding
train.head()

## Sampling for ML Algorithm 

In [None]:
# Work on a random subset to keep model iteration fast.
# Capped at the number of available rows so sampling cannot fail on a smaller frame.
sample = min(500000, len(train))
# Fixed seed: the same rows are drawn on every Restart & Run All.
df_sample = train.sample(sample, random_state=42)

In [None]:
# Split sample into training data and target variable
df = df_sample.drop(columns = ['winPlacePerc']) # all columns except the target
y = df_sample['winPlacePerc'] # only the target variable

In [None]:
# Function for splitting training and validation data
def split_vals(a, n : int): 
    """Cut ``a`` at position ``n`` and return copies of the leading and trailing parts."""
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()
val_perc = 0.12 # fraction (not percent) of the sample held out for validation
n_valid = int(val_perc * sample) 
n_trn = len(df)-n_valid
# Split data positionally: the first n_trn rows train, the remainder validates
raw_train, raw_valid = split_vals(df_sample, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Check dimensions of samples
print('Sample train shape: ', X_train.shape, 
      'Sample target shape: ', y_train.shape, 
      'Sample validation shape: ', X_valid.shape)

In [None]:
# Metric used for the PUBG competition (Mean Absolute Error (MAE))
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Function to print the MAE (Mean Absolute Error) score

def print_score(m : RandomForestRegressor):
    """Print the MAE of ``m`` on the current global train and validation splits.

    Reads the notebook-level X_train/y_train and X_valid/y_valid. When the
    model exposes ``oob_score_`` that value is appended to the printed list.
    """
    train_mae = mean_absolute_error(m.predict(X_train), y_train)
    valid_mae = mean_absolute_error(m.predict(X_valid), y_valid)
    res = ['mae train: ', train_mae, 'mae val: ', valid_mae]
    # Score of the training dataset obtained using an out-of-bag estimate.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
# Train basic model
# (the sklearn imports already happen in the cell that defines print_score)
# Fixed random_state so the scores and feature importances are reproducible.
m1 = RandomForestRegressor(n_estimators=50, min_samples_leaf=3, max_features='sqrt', n_jobs=-1,
                           random_state=42)
m1.fit(X_train, y_train)
print_score(m1)

In [None]:
def rf_feat_importance(m, df):
    """Pair each column of ``df`` with ``m.feature_importances_``, most important first.

    Returns a DataFrame with columns 'cols' (feature name) and 'imp' (importance).
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

In [None]:
# What are the most predictive features according to our basic random forest model
fi = rf_feat_importance(m1, df); fi[:15]
# Plot a feature importance graph for the 15 most important features
plot1 = fi[:15].plot('cols', 'imp', figsize=(14,6), legend=False, kind = 'barh')
plot1

In [None]:
# Shapes of the current training features and target
print(X_train.shape, y_train.shape)

In [None]:
# Keep only significant features: those whose importance in the basic model exceeds 0.005
to_keep = fi[fi.imp>0.005].cols
print('Significant features: ', len(to_keep))
to_keep

In [None]:
# Make a DataFrame with only significant features and redo the positional train/validation split.
# y_train / y_valid are unchanged, so only the feature frames are re-split.
df_keep = df[to_keep].copy()
X_train, X_valid = split_vals(df_keep, n_trn)

In [None]:
# Train model on top features
# Fixed random_state so the reported scores are reproducible.
m2 = RandomForestRegressor(n_estimators=80, min_samples_leaf=3, max_features='sqrt', n_jobs=-1,
                           random_state=42)
m2.fit(X_train, y_train)
print_score(m2)

In [None]:
# Get feature importances of our top features from the second model and plot all of them
fi_to_keep = rf_feat_importance(m2, df_keep)
plot2 = fi_to_keep.plot('cols', 'imp', figsize=(14,6), legend=False, kind = 'barh')
plot2

In [None]:
# Prepare the full (unsampled) training set for the final model
val_perc_full = 0.20 # fraction (not percent) of rows held out for validation
n_valid_full = int(val_perc_full * len(train)) 
n_trn_full = len(train)-n_valid_full
df_full = train.drop(columns = ['winPlacePerc']) # all columns except target
y = train['winPlacePerc'] # target variable
df_full = df_full[to_keep] # Keep only relevant features
# Positional split: the last n_valid_full rows of train become the validation set.
# NOTE(review): train is not shuffled in this cell - confirm row order carries no structure.
X_train, X_valid = split_vals(df_full, n_trn_full)
y_train, y_valid = split_vals(y, n_trn_full)

# Check dimensions of data (the printed labels still say "Sample" although this is the full set)
print('Sample train shape: ', X_train.shape, 
      'Sample target shape: ', y_train.shape, 
      'Sample validation shape: ', X_valid.shape)

In [None]:
# Train final model
# You should get better results by increasing n_estimators
# and by playing around with the parameters
# Fixed random_state so the submission can be regenerated exactly.
m3 = RandomForestRegressor(n_estimators=50, min_samples_leaf=3, max_features=0.5,
                          n_jobs=-1, random_state=42)
m3.fit(X_train, y_train)
print_score(m3)

In [None]:
# Recreate the engineered features on the test data
# players_in_team: number of test rows sharing the same groupId, merged back onto each row.
# NOTE(review): no players_in_team column is built for train in this notebook - confirm it is needed.
agg = test.groupby(['groupId']).size().to_frame('players_in_team')
test = test.merge(agg, how='left', on=['groupId'])
# Same headshot_rate and totalDistance definitions as used for train
test['headshot_rate'] = test['headshotKills'] / test['kills']
test['headshot_rate'] = test['headshot_rate'].fillna(0)
test['totalDistance'] = test['rideDistance'] + test['walkDistance'] + test['swimDistance']

In [None]:
# Turn groupId and match Id into categorical types
test['groupId'] = test['groupId'].astype('category')
test['matchId'] = test['matchId'].astype('category')

# Get category coding for groupId and matchID
test['groupId_cat'] = test['groupId'].cat.codes
test['matchId_cat'] = test['matchId'].cat.codes

def _collapse_match_type(raw):
    """Collapse a raw matchType string into 'solo', 'duo' or 'squad' (same rule as for train)."""
    if 'solo' in raw:
        return 'solo'
    if 'duo' in raw or 'crash' in raw:
        return 'duo'
    return 'squad'

# train's matchType was collapsed before it was one-hot encoded. Without the same step here,
# matchType_solo/duo/squad in test would flag only the exact raw values 'solo'/'duo'/'squad'
# and the model would see differently-defined features at prediction time.
test['matchType'] = test['matchType'].apply(_collapse_match_type)
#onehot encoding
test = pd.get_dummies(test, columns=['matchType'])
# Take a look at the encoding
matchType_encoding1 = test.filter(regex='matchType')
matchType_encoding1.head()

In [None]:
# Keep only the features selected for the model (to_keep), in the same order used for training
test_pred = test[to_keep].copy()

# Fill NaN with 0 (temporary)
test_pred.fillna(0, inplace=True)
test_pred.head()

In [None]:
# Predict with the final model and clip the predictions into the [0, 1] range
predictions = np.clip(a = m3.predict(test_pred), a_min = 0.0, a_max = 1.0)
# One row per test Id with its predicted winPlacePerc
pred_df = pd.DataFrame({'Id' : test['Id'], 'winPlacePerc' : predictions})
pred_df

In [None]:
# Create submission file (Id, winPlacePerc) in the working directory, without the index column
pred_df.to_csv("submission.csv", index=False)