In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
# Print one path per line so the available data files can be checked at a glance.
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Locations of the competition's training and test CSV files.
train_data_loc = "../input/pubg-finish-placement-prediction/train_V2.csv"
test_data_loc = "../input/pubg-finish-placement-prediction/test_V2.csv"

# Read both files into DataFrames, training set first.
train_data, test_data = (
    pd.read_csv(csv_location) for csv_location in (train_data_loc, test_data_loc)
)

In [None]:
# Print pandas' structural summary of the training frame.
train_data.info()        # first look at how the features are stored

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# List every column name; each one is described in the text that follows.
train_data.columns

* **groupId** - Integer ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
* **matchId** - Integer ID to identify match. There are no matches that are in both the training and testing set.
* **assists** - Number of enemy players this player damaged that were killed by teammates.
* **boosts** - Number of boost items used.
* **damageDealt** - Total damage dealt. Note: Self inflicted damage is subtracted.
* **DBNOs** - Number of enemy players knocked.
* **headshotKills** - Number of enemy players killed with headshots.
* **heals** - Number of healing items used.
* **killPlace** - Ranking in match of number of enemy players killed.
* **killPoints** - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.)
* **kills** - Number of enemy players killed.
* **killStreaks** - Max number of enemy players killed in a short amount of time.
* **longestKill** - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
* **maxPlace** - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
* **numGroups** - Number of groups we have data for in the match.
* **revives** - Number of times this player revived teammates.
* **rideDistance** - Total distance traveled in vehicles measured in meters.
* **roadKills** - Number of kills while in a vehicle.
* **swimDistance** - Total distance traveled by swimming measured in meters.
* **teamKills** - Number of times this player killed a teammate.
* **vehicleDestroys** - Number of vehicles destroyed.
* **walkDistance** - Total distance traveled on foot measured in meters.
* **weaponsAcquired** - Number of weapons picked up.
* **winPoints** - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.)
* **winPlacePerc** - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [None]:
# Summary statistics for the training columns.
train_data.describe()

We have one missing value in 'winPlacePerc' 

In [None]:
# Count the missing values in each column.
train_data.isna().sum()

In [None]:
# The label ('winPlacePerc') is the column reported to have a single missing
# value, so drop only the rows where the label itself is missing. Restricting
# dropna to that column keeps this cell from silently discarding rows because
# of a NaN in some unrelated feature.
train_data = train_data.dropna(subset=['winPlacePerc'])

In [None]:
# Re-count missing values per column after the drop.
train_data.isna().sum()

In [None]:
# Summary statistics again, now that the row with the missing label is gone.
train_data.describe()

Now every column has the same number of values.

Now let's take a look at the categorical data

In [None]:
# Number of distinct matchType categories (missing values, if any, are counted too).
train_data['matchType'].nunique(dropna=False)

High-cardinality data
We have 16 categories, so applying one-hot encoding may create too many columns, which can lead to the 'curse of dimensionality'.
Let's see the values and the total count of each category.

In [None]:
# Pull out the categorical matchType column and check its shape.
cat_data = train_data['matchType']
cat_data.shape

In [None]:
# Shape the column would have after one-hot encoding (one column per category).
pd.get_dummies(cat_data).shape

So we were right: one-hot encoding adds 16 more columns to the dataset, which can degrade the model.

In [None]:
# Number of rows in each matchType category.
train_data['matchType'].value_counts()

Applying label encoding would also cause the learning algorithm to depend on the arbitrary number assigned to a label rather than on the pattern in the data. 
We can do 2 things
1. Replace each label with its number of occurrences - frequency encoding
2. Or we can take the top 10 categories and form one extra category for the remaining labels

In [None]:
# Let's perform frequency encoding.
matchtype_data = train_data['matchType']           # keep the original matchType labels for later use

In [None]:
matchtype_enc = train_data['matchType'].value_counts().to_dict()        # label -> number of occurrences, as a dictionary
# a plain dict is easy to hand to Series.map in the next step

In [None]:
# Replace each matchType label with its occurrence count (frequency encoding).
# The mapping is applied to the labels saved in `matchtype_data` rather than to
# the column being overwritten, so re-running this cell produces the same
# result instead of mapping already-encoded numbers to NaN.
train_data['matchType'] = matchtype_data.map(matchtype_enc)

In [None]:
# Check the first encoded matchType values.
train_data['matchType'].head()

So we have encoded the categorical values into numerical values

In [None]:
# Structural summary again, to see the column dtypes after encoding.
train_data.info()

After reading the description of the following columns, it is clear that they don't give any important information
1. Id
2. groupId
3. matchId

In [None]:
# Remove the identifier columns before modelling.
identifier_columns = ['Id', 'groupId', 'matchId']
train_data = train_data.drop(identifier_columns, axis=1)

In [None]:
#let's check if there is any correlation 
#train_data.corr()['winPlacePerc'][:]
#if the data is normally distributed

In [None]:
#train_data.corr(method = 'kendall')['winPlacePerc']
#if the data is not normally distributed

In [None]:
# Report the skewness of every remaining column, one "name : value" line each.
for column_name in train_data.columns:
    column_skew = train_data[column_name].skew()
    print(f'{column_name} : {column_skew}')

In [None]:
# Separate the prediction target from the feature columns.
Y = train_data['winPlacePerc']
X = train_data.drop('winPlacePerc', axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale each feature (column) to zero mean and unit variance.
# sklearn.preprocessing.normalize works per sample by default (axis=1): it
# rescales every ROW to unit length. With features on very different scales
# (frequency-encoded matchType counts next to small per-match counts), the
# largest column dominates each row and the others shrink towards zero.
# Column-wise standardisation puts the features on a comparable scale without
# mixing them together.
# fit_transform returns a NumPy array with the same shape as X, which is what
# the cells that follow expect.
X = StandardScaler().fit_transform(X)

In [None]:
# Wrap the feature array in a DataFrame with generic column names f0, f1, ...
# pandas supplies the default 0..n-1 row index on its own, so there is no need
# to build a Python list with one entry per row, and X can be passed directly
# instead of through a full slice.
df_X = pd.DataFrame(
    data=X,
    columns=['f' + str(i) for i in range(X.shape[1])],
)

In [None]:
# Confirm the features are now held in a DataFrame.
type(df_X)

In [None]:
# Y still carries train_data's row labels, which have a gap where the row with
# the missing label was dropped, while df_X is numbered 0..n-1. Renumbering the
# target the same way keeps features and labels on the same row when the two
# frames are combined by index (pd.concat with axis=1), and gives the
# train/test split pieces matching row labels.
df_Y = pd.DataFrame(Y).reset_index(drop=True)
type(df_Y)

In [None]:
# Preview the first rows of the feature frame.
df_X.head()

In [None]:
# Preview the first rows of the target frame.
df_Y.head()

In [None]:
# Put the target column next to the features. concat matches rows by index
# label, so the two frames line up only where their indexes agree.
data_norm = pd.concat([df_X, df_Y], axis='columns', sort=False)

In [None]:
# The target column was built from the Series named 'winPlacePerc', so that is
# the key to rename. The previous key (0) matched no column, which made the
# rename a silent no-op.
data_norm = data_norm.rename(columns={'winPlacePerc': "label"})
data_norm.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_X,
    df_Y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split.
reg = LinearRegression()
reg.fit(X_train, Y_train)

# winPlacePerc is a percentile between 0 and 1, but a linear model can predict
# values outside that range, so clamp the predictions to the valid interval.
Y_pred = np.clip(reg.predict(X_test), 0.0, 1.0)

# Mean absolute error on the held-out rows, so the fit is judged by a number
# rather than by eyeballing the raw arrays.
mae = np.abs(Y_test.values - Y_pred).mean()
print('Validation MAE: {:.4f}'.format(mae))

In [None]:
# Actual target values of the held-out rows.
Y_test

In [None]:
# Model predictions for the held-out rows.
Y_pred