In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of each file in it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
def reduce_mem_usage(df, allow_float16=False):
    """Downcast the columns of ``df`` to smaller dtypes and report the savings.

    The frame is modified in place and is also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Per-column rules:

    * ``object`` (and pandas string) columns are converted to ``category``.
    * NumPy signed / unsigned integer columns are narrowed to the smallest
      integer type of the same signedness whose range contains the column's
      minimum and maximum (bounds are inclusive, so 127 still fits ``int8``).
    * NumPy float columns are narrowed to ``float32`` when their minimum and
      maximum fit. ``float16`` is only considered when ``allow_float16`` is
      true, because it keeps roughly three significant digits and overflows
      to ``inf`` above 65504 in later arithmetic.
    * Every other dtype (bool, datetime, category, nullable extension
      dtypes, ...) is left untouched, which also makes a second call on an
      already reduced frame safe.

    A column is never widened to a larger dtype than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    allow_float16 : bool, default False
        Allow float columns to be stored as ``float16`` when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are candidates for narrowing;
        # bool, datetime, category and extension dtypes are skipped as-is.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size: once one is not smaller than the
            # current dtype there is nothing left to gain (and no upcasting).
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Comparisons with NaN (empty or all-NaN column) are False, so such
            # columns simply keep their dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [None]:
# Read the training CSV and shrink its column dtypes straight away to save memory.
train_data_filepath = '../input/pubg-finish-placement-prediction/train_V2.csv'
train_data = reduce_mem_usage(pd.read_csv(train_data_filepath))

In [None]:
# Read the test CSV and shrink its column dtypes straight away to save memory.
test_data_filepath = '../input/pubg-finish-placement-prediction/test_V2.csv'
test_data = reduce_mem_usage(pd.read_csv(test_data_filepath))

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first 5 rows of the test frame.
test_data.head(5)

In [None]:
# Summary statistics of the training frame's columns.
train_data.describe()

In [None]:
# Column dtypes after reduce_mem_usage has been applied to the frame.
train_data.dtypes

In [None]:
# One boolean per column: True when the column holds at least one missing value.
train_data.isna().any()

In [None]:
# Number of distinct values in the Id column.
train_data['Id'].nunique()

In [None]:
# Number of distinct values in the groupId column.
train_data['groupId'].nunique()

In [None]:
# Number of distinct values in the matchId column.
train_data['matchId'].nunique()

In [None]:
#Match Type : There are 3 Game Modes in the Game - Solo,Duo,Squad

In [None]:
# Show the raw matchType labels before they are collapsed into solo/duo/squad.
train_data["matchType"]

In [None]:
# For every matchType value, count the non-null entries of each other column.
train_data.groupby(["matchType"]).count()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# One matchType label per match (taken from the first row of each matchId),
# then a bar chart of how many matches carry each label.
type_per_match = train_data.groupby('matchId')['matchType'].first()
type_per_match.value_counts().plot.bar()

In [None]:
#Mapping

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so every in-place change made through `new_train_data` is also visible via `train_data`.
new_train_data = train_data
def mapthematch(data):
    """Collapse the detailed ``matchType`` labels into 'solo', 'duo' or 'squad'.

    A label containing 'solo' maps to 'solo'; otherwise a label containing
    'duo' or 'crash' maps to 'duo'; every remaining label maps to 'squad'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``matchType`` column of strings. The column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in (not a module-level
        frame), so the function also works for frames other than the
        training data.
    """
    def mapping(label):
        if 'solo' in label:
            return 'solo'
        if 'duo' in label or 'crash' in label:
            return 'duo'
        return 'squad'

    data['matchType'] = data['matchType'].apply(mapping)
    return data
# Collapse the match types, then chart how many matches fall into each group
# (one label per match, taken from the first row of each matchId).
data = mapthematch(new_train_data)

grouped_type_per_match = data.groupby('matchId')['matchType'].first()
grouped_type_per_match.value_counts().plot.bar()

In [None]:
#Finding a cheating match

In [None]:
# Rows whose target value (winPlacePerc) is missing.
data[data['winPlacePerc'].isnull()]

In [None]:
# Drop every row whose target is missing. Rows are selected by condition rather than
# by a hard-coded index label, so the cell can be re-run (or used on other data)
# without a KeyError. The drop stays in place because `data` shares its object
# with the frames defined earlier.
data.drop(data[data['winPlacePerc'].isnull()].index, inplace=True)
# Should now display an empty frame.
data[data['winPlacePerc'].isnull()]

In [None]:
# Histogram of the (already collapsed) matchType column, 20 bins.
data['matchType'].hist(bins=20)

In [None]:
# Histogram of the matchDuration column, 50 bins.
data['matchDuration'].hist(bins=50)

In [None]:
# Minimum match duration: smallest value in the matchDuration column.

data['matchDuration'].min()

In [None]:
# Maximum match duration: largest value in the matchDuration column.

data['matchDuration'].max()

In [None]:
# Normalizing the data

# Normalizing the kills column.
# The expression equals kills * (2 - kills / 100): the column is scaled by a factor
# derived from its own value, so the result stops growing at kills == 100.
# NOTE(review): similar public kernels scale by the number of players in the match
# rather than by the column itself — confirm that self-scaling is intended.

data['killsNormalization'] = data['kills']*((100-data['kills'])/100 +1)

In [None]:
# Normalizing the damageDealt column.
# The expression equals damageDealt * (2 - damageDealt / 100), which turns negative
# once damageDealt exceeds 200.
# NOTE(review): if damageDealt was stored as float16 by reduce_mem_usage, this product
# can leave the float16 range and become inf/NaN — verify the column's dtype.

data['damageDealtNormalization'] = data['damageDealt']*((100-data['damageDealt'])/100+1)

In [None]:
# Normalizing the maxPlace column.
# The expression equals maxPlace * (2 - maxPlace / 100).
# NOTE(review): the column is scaled by its own value — confirm this is intended.

data['maxPlaceNormalization'] = data['maxPlace']*((100-data['maxPlace'])/100+1)

In [None]:
# Normalizing the matchDuration column.
# The expression equals matchDuration * (2 - matchDuration / 100), which is negative
# for any duration above 200.
# NOTE(review): if durations are in the hundreds or thousands this yields large
# negative values — confirm the formula is what was intended.

data['matchDurationNormalization'] = data['matchDuration']*((100-data['matchDuration'])/100+1)

In [None]:
# Side-by-side view of each raw column and its "normalized" counterpart, keyed by Id.
comparison_columns = [
    'Id',
    'matchDuration', 'matchDurationNormalization',
    'maxPlace', 'maxPlaceNormalization',
    'kills', 'killsNormalization',
    'damageDealt', 'damageDealtNormalization',
]
new_normalized_column = data[comparison_columns]

In [None]:
# Display the raw-vs-normalized comparison frame.
new_normalized_column

In [None]:
# Total distance travelled: ride + walk + swim distance, added row by row.

ride_plus_walk = data['rideDistance'].add(data['walkDistance'])
data['totalDistancetravelled'] = ride_plus_walk.add(data['swimDistance'])
data['totalDistancetravelled']

In [None]:
# Head Shot Feature
# Share of a player's kills that were headshots. Rows with zero kills produce
# 0/0 = NaN in the division, so those are set to 0 instead of being left missing.

data['headshot_rate'] = (data['headshotKills'] / data['kills']).fillna(0)
data['headshot_rate']

In [None]:
# Display the working frame with the engineered columns added so far.
data

In [None]:
# Flag rows that record at least one kill although the total distance travelled is zero.
has_kills = data['kills'] > 0
never_moved = data['totalDistancetravelled'] == 0
data['killswithoutMovinganytime'] = has_kills & never_moved
data['killswithoutMovinganytime']

In [None]:
# (rows, columns) of the flagged subset; the boolean column is used directly as the mask.
data[data['killswithoutMovinganytime']].shape

In [None]:
# First five flagged rows.
data.loc[data['killswithoutMovinganytime']].head()

In [None]:
# Remove outliers: drop, in place, every row flagged as "kills without moving".
stationary_kill_rows = data.loc[data['killswithoutMovinganytime']].index
data.drop(index=stationary_kill_rows, inplace=True)

In [None]:
#Visualizing Longest Kill and try to find out what are the Outlier we have

In [None]:
import seaborn as sn

# Distribution of the longestKill column on a 14x8 figure.
plt.figure(figsize=(14,8))
# NOTE(review): distplot is reported as deprecated in recent seaborn releases —
# check the installed version and consider histplot/displot.
sn.distplot(data['longestKill'])
plt.show()

In [None]:
# Rows whose longest kill is 900 or more: show how many there are, then the first ten.
long_range_kills = data[data['longestKill'] >= 900]
display(long_range_kills.shape)
long_range_kills.head(10)

In [None]:
# Remove outliers: drop, in place, every row whose longest kill is 900 or more.
long_range_kill_rows = data.loc[data['longestKill'] >= 900].index
data.drop(index=long_range_kill_rows, inplace=True)

In [None]:
# (rows, columns) left after the outlier rows were dropped.
data.shape

In [None]:
# The target column that the models below are fitted on.
data['winPlacePerc']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# All column names currently available, including the engineered ones.
data.columns

In [None]:
# Feature matrix and target for the regressors.
# 'Id' is deliberately NOT a feature: it identifies a row instead of describing it,
# and (presumably being a non-numeric string/category here) it would make the
# scikit-learn estimators fail when they convert the features to floats.
# damageDealtNormalization was considered too, but it appeared to contain null
# values, so only the features below are used.
x = data[['killsNormalization','maxPlaceNormalization','matchDurationNormalization','totalDistancetravelled']]
y = data['winPlacePerc']

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
# Note that test_X / test_y come from the training data, not from test_data.
train_X , test_X , train_y , test_y = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Baseline: ordinary least-squares regression fitted on the training split.
model = LinearRegression()
model.fit(X=train_X, y=train_y)

In [None]:
# Score of the linear model on the held-out split, as returned by the estimator's score().
model.score(test_X,test_y)

In [None]:
# Linear-model predictions for the held-out split.
predicted_vals = model.predict(test_X)

In [None]:
# Mean absolute error of the linear model on the held-out split.
# Arguments follow scikit-learn's (y_true, y_pred) order; the stray bare
# `predicted_vals` expression was dropped because only a cell's last expression
# is displayed.
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error  # short alias, reused by a later cell
mae(test_y, predicted_vals)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Second model: k-nearest-neighbours regression using 30 neighbours, fitted on the training split.
neigh = KNeighborsRegressor(n_neighbors=30)
neigh.fit(X=train_X, y=train_y)

In [None]:
# Score of the k-NN model on the held-out split, as returned by the estimator's score().
neigh.score(test_X,test_y)

In [None]:
# k-NN predictions for the held-out split.
pred_vals = neigh.predict(test_X)

In [None]:
# Mean absolute error of the k-NN model on the held-out split,
# with the arguments in scikit-learn's (y_true, y_pred) order.
mae(test_y, pred_vals)

In [None]:
# Pair every held-out row's Id with its k-NN prediction.
# The Ids are looked up in `data` through the index labels of test_y; previously the
# 'Id' column was filled with test_y itself, i.e. with the true target values.
# NOTE: these are rows of the hold-out split of the training data, not of test_data.
output = pd.DataFrame({'Id' : data.loc[test_y.index, 'Id'],'winPlacePerc' : pred_vals})
output

In [None]:
# Error metrics of the k-NN predictions on the held-out split,
# with the arguments in scikit-learn's (y_true, y_pred) order.
from sklearn import metrics
print('Mean Absolute Error:' , metrics.mean_absolute_error(test_y, pred_vals))
print('Mean Squared Error:' , metrics.mean_squared_error(test_y, pred_vals))

In [None]:
# Write the `output` table to submission.csv without the index column.
# NOTE(review): `output` is built from the hold-out split of the training data
# (pred_vals = neigh.predict(test_X)), not from test_data, so this file does not
# cover the rows of test_V2.csv — confirm before submitting.
output.to_csv('submission.csv',index=False)