In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("train_sampled.csv")
train.columns = ['Id','groupId','matchId','assists','boosts','damageDealt','DBNOs','headshotKills','heals','killPlace','killPoints','kills','killStreaks','longestKill','matchDuration','matchType','maxPlace','numGroups','rankPoints','revives','rideDistance','roadKills','swimDistance','teamKills','vehicleDestroys','walkDistance','weaponsAcquired','winPoints','winPlacePerc']


In [3]:
train['playersJoined'] = train.groupby('matchId')['matchId'].transform('count')
train['healsandboosts'] = train['heals'] + train['boosts']
train['totaldistance'] = train['rideDistance'] + train['swimDistance'] + train['walkDistance']
train.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,playersJoined,healsandboosts,totaldistance
count,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,...,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0,741160.0
mean,0.233542,1.107063,130.817929,0.657853,0.227238,1.367286,47.572265,504.310532,0.926086,0.544822,...,4.458336,0.023926,0.008135,1155.690825,3.65737,605.620491,0.473064,16.532446,2.47435,1767.345685
std,0.587491,1.717225,170.40989,1.143437,0.598337,2.677346,27.47453,627.384403,1.55356,0.711089,...,30.060918,0.166768,0.09458,1185.722245,2.445136,739.575678,0.30758,3.76573,3.877863,2187.674725
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,155.0,2.0,0.0,0.2,14.0,0.0,158.0
50%,0.0,0.0,84.54,0.0,0.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,0.0,686.0,3.0,0.0,0.4583,16.0,1.0,789.885
75%,0.0,2.0,185.9,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,...,0.0,0.0,0.0,1979.0,5.0,1495.0,0.7407,19.0,4.0,2728.0
max,15.0,23.0,4014.0,38.0,21.0,80.0,101.0,2170.0,46.0,13.0,...,2148.0,10.0,5.0,13530.0,167.0,2013.0,1.0,32.0,81.0,32553.7


In [4]:
train['killsNorm'] = train['kills']*((100-train['playersJoined'])/100 + 1)
train['damageDealtNorm'] = train['damageDealt']*((100-train['playersJoined'])/100 + 1)
train['maxPlaceNorm'] = train['maxPlace']*((100-train['playersJoined'])/100 + 1)
train['matchDurationNorm'] = train['matchDuration']*((100-train['playersJoined'])/100+1)


In [5]:
train['killswithoutmoving'] = ((train['totaldistance']==0) & (train['kills'] > 0))


In [6]:
print(train[train['killswithoutmoving'] == True].shape)
train.drop(train[train['killswithoutmoving'] == True].index, inplace=True)
print(train[train['kills'] > 40].shape)
print(train[(train['headshotKills']/train['kills'] == 1) & (train['kills'] > 12)].shape)

print(train[(train['totaldistance'] == train['swimDistance'])&(train['winPlacePerc'] > 0.80)].shape)
train.drop(train[(train['totaldistance'] == train['swimDistance'])&(train['winPlacePerc'] > 0.80)].index, inplace=True)
train.drop(train[train['kills'] > 40].index, inplace=True)
train.drop(train[(train['headshotKills']/train['kills'] == 1) & (train['kills'] > 12)].index, inplace=True)

(230, 37)
(1, 37)
(1, 37)
(185, 37)


In [7]:
print(train[train['weaponsAcquired'] > 50].shape)
train.drop(train[train['weaponsAcquired'] > 50].index, inplace=True)
print(train[train['heals'] > 35].shape)
train.drop(train[train['heals'] > 35].index, inplace=True)


(22, 37)
(43, 37)


In [8]:
columns = ['killsNorm', 'damageDealtNorm', 'matchDurationNorm', 'assists', 'healsandboosts', 'DBNOs', 'killPlace', 'walkDistance', 'winPoints'
          ,'weaponsAcquired', 'killStreaks']
X = train[columns]
y = train['winPlacePerc']

train_X, val_X, train_y, val_y = train_test_split(X,y, test_size=0.2, random_state=1)
model = LinearRegression()
model.fit(train_X, train_y)
pred = model.predict(val_X)
print(np.sqrt(metrics.mean_squared_error(val_y, pred)))
print(model.score(val_X, val_y))

0.13221458061523045
0.8151348049170025


In [13]:
train['Solo'] = (train['numGroups'] > 50).astype(np.int8)
train['Duo'] = (train['numGroups'] <= 50 & train['numGroups']).astype(np.int8)
train['Squad'] = (train['numGroups']).astype(np.int8)

In [14]:
train.Duo.describe()

count    740678.000000
mean          0.060950
std           0.239238
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Duo, dtype: float64