### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Importing the dataset

In [2]:
data = pd.read_csv("pubg.csv")

In [3]:
# calling the data to do all the basic checks
data.head(8)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875
5,ff79c12f326506,289a6836a88d27,bac52627a12114,0,0,100.0,1,1,0,44,...,0,0.0,0,0.0,0,0,34.7,1,0,0.037
6,95959be0e21ca3,2c485a1ad3d0f1,a8274e903927a2,0,0,0.0,0,0,0,96,...,0,0.0,0,0.0,0,0,13.5,1,1497,0.0
7,311b84c6ff4390,eaba5fcb7fc1ae,292611730ca862,0,0,8.538,0,0,0,48,...,0,2004.0,0,0.0,0,0,1089.0,6,1500,0.7368


In this dataset all the variables except "winPlacePerc" is independent variable, and the column "winPlacePerc" is target variable.

### Domain Analysis

#### Independent Variables
Id<br>
groupId<br>
matchId<br>
assists<br>
boosts<br>
damageDealt<br>
DBNOs<br>
headshotKills<br>
heals<br>
killPlace<br>
killPoints<br>
kills<br>
killStreaks<br>
longestKill<br>
matchDuration<br>
matchType<br>
maxPlace<br>
maxPlace<br>
rankPoints<br>
revives<br>
rideDistance<br>
roadKills<br>
swimDistance<br>
teamKills<br>
vehicleDestroys<br>
walkDistance<br>
weaponsAcquired<br>
winPoints<br>
#### Dependent Variable
winPlacePerc

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [5]:
data.isnull().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64

### Data Preprocessing

In [6]:
data.matchType.unique()

array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)

In [7]:
## Manual encoding matchType feature
data.matchType=data.matchType.map({'squad-fpp':1, 'duo':2, 'solo-fpp':3, 'squad':4, 'duo-fpp':5, 'solo':6,
       'normal-squad-fpp':7, 'crashfpp':8, 'flaretpp':9, 'normal-solo-fpp':10,
       'flarefpp':11, 'normal-duo-fpp':12, 'normal-duo':13, 'normal-squad':14,
       'crashtpp':15, 'normal-solo':16})
data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [8]:
# Replace the nan values in winPlacePerc column with mean value
data.loc[data['winPlacePerc'].isnull()==True,'winPlacePerc']=np.mean(data.winPlacePerc.dropna(axis=0))

In [9]:
data.winPlacePerc.isnull().sum()

0

Now there are no null values left

### Exploratory Data Analysis

In [None]:
# Bivariate analysis
sns.histplot(x=data.heals,y=data.winPlacePerc,hue=data.heals)

In [10]:
data.drop(["Id","groupId","matchId"],axis=1,inplace=True)

In [11]:
data.head(8)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,0,0,0.0,0,0,0,60,1241,0,0,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,0,0,91.47,0,0,0,57,0,0,0,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1,0,68.0,0,0,0,47,0,0,0,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,0,0,32.9,0,0,0,75,0,0,0,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,0,0,100.0,0,0,0,45,0,1,1,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875
5,0,0,100.0,1,1,0,44,0,1,1,...,0,0.0,0,0.0,0,0,34.7,1,0,0.037
6,0,0,0.0,0,0,0,96,1262,0,0,...,0,0.0,0,0.0,0,0,13.5,1,1497,0.0
7,0,0,8.538,0,0,0,48,1000,0,0,...,0,2004.0,0,0.0,0,0,1089.0,6,1500,0.7368


### Scaling the Indipendent variables

In [12]:
data.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'winPlacePerc'],
      dtype='object')

In [13]:
from sklearn.preprocessing import MinMaxScaler
scale=MinMaxScaler()
data[['assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints']]=scale.fit_transform(data[['assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints']])

In [14]:
data.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.571889,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.009496,0.004237,0.728266,0.4444
1,0.0,0.0,0.013826,0.0,0.0,0.0,0.56,0.0,0.0,0.0,...,0.0,1.10538e-07,0.0,0.002888,0.0,0.0,0.055625,0.021186,0.0,0.64
2,0.045455,0.0,0.010278,0.0,0.0,0.0,0.46,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006276,0.008475,0.0,0.7755
3,0.0,0.0,0.004973,0.0,0.0,0.0,0.74,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.007863,0.012712,0.0,0.1667
4,0.0,0.0,0.015115,0.0,0.0,0.0,0.44,0.0,0.013889,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00193,0.008475,0.0,0.1875


### Model Creation

In [15]:
# Defining x and y
X=data.loc[:,['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints']]
y=data.winPlacePerc

In [16]:
X

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.59,0.571889,0.000000,0.00,...,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.009496,0.004237,0.728266
1,0.000000,0.000000,0.013826,0.000000,0.000000,0.0000,0.56,0.000000,0.000000,0.00,...,0.251227,0.000000,1.105380e-07,0.0,0.002888,0.0,0.0,0.055625,0.021186,0.000000
2,0.045455,0.000000,0.010278,0.000000,0.000000,0.0000,0.46,0.000000,0.000000,0.00,...,0.252411,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.006276,0.008475,0.000000
3,0.000000,0.000000,0.004973,0.000000,0.000000,0.0000,0.74,0.000000,0.000000,0.00,...,0.238369,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.007863,0.012712,0.000000
4,0.000000,0.000000,0.015115,0.000000,0.000000,0.0000,0.44,0.000000,0.013889,0.05,...,0.264084,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.001930,0.008475,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.73,0.474194,0.000000,0.00,...,0.000000,0.000000,3.173667e-02,0.0,0.000000,0.0,0.0,0.039527,0.012712,0.748634
4446962,0.000000,0.030303,0.006673,0.000000,0.000000,0.0000,0.68,0.000000,0.000000,0.00,...,0.254103,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.003169,0.025424,0.000000
4446963,0.000000,0.000000,0.008927,0.000000,0.000000,0.0000,0.65,0.000000,0.000000,0.00,...,0.253933,0.000000,0.000000e+00,0.0,0.000571,0.0,0.0,0.030593,0.016949,0.000000
4446964,0.000000,0.121212,0.027267,0.018868,0.015625,0.0250,0.10,0.000000,0.027778,0.05,...,0.240061,0.051282,0.000000e+00,0.0,0.000000,0.0,0.0,0.106594,0.033898,0.000000


In [17]:
y

0          0.4444
1          0.6400
2          0.7755
3          0.1667
4          0.1875
            ...  
4446961    0.1786
4446962    0.2935
4446963    0.4815
4446964    0.8000
4446965    0.5464
Name: winPlacePerc, Length: 4446966, dtype: float64

In [18]:
# Traning and testing data creation
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3)

In [19]:
# Using Linear Regression
from sklearn.linear_model import LinearRegression
LR=LinearRegression() # object creation
LR.fit(X_train,y_train) # training of linear regression
y_predict=LR.predict(X_test)

In [20]:
y_test # Actual values

906733     0.3297
258366     0.8191
2129977    0.9615
1145429    0.6129
1748152    0.4255
            ...  
1492072    0.1263
4126802    0.6538
2752871    0.5714
789673     0.3636
2824956    0.8889
Name: winPlacePerc, Length: 889394, dtype: float64

In [21]:
y_predict # Values predicted by model

array([0.3638696 , 0.81503968, 0.89112283, ..., 0.5889755 , 0.34274272,
       0.96843675])

In [22]:
# Model Evalution
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
r2score=r2_score(y_test,y_predict)
r2score

0.8304160805045407

In [23]:
X_train

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
1581174,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.92,0.491705,0.000000,0.00,...,0.000169,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.750621
630708,0.000000,0.030303,0.017624,0.037736,0.015625,0.0125,0.22,0.000000,0.013889,0.05,...,0.251057,0.000000,0.000000,0.0,0.0,0.0,0.0,0.093561,0.016949,0.000000
3142210,0.045455,0.030303,0.047703,0.037736,0.000000,0.0125,0.17,0.000000,0.027778,0.05,...,0.254779,0.025641,0.000000,0.0,0.0,0.0,0.0,0.018584,0.025424,0.000000
493039,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.42,0.000000,0.000000,0.00,...,0.254779,0.000000,0.098968,0.0,0.0,0.0,0.0,0.062529,0.016949,0.000000
2879459,0.045455,0.000000,0.013006,0.018868,0.000000,0.0000,0.33,0.610599,0.013889,0.05,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.042320,0.025424,0.706408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4088597,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.93,0.000000,0.000000,0.00,...,0.257486,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000998,0.004237,0.000000
1008584,0.000000,0.000000,0.039480,0.018868,0.000000,0.0000,0.58,0.000000,0.000000,0.00,...,0.260870,0.000000,0.000000,0.0,0.0,0.0,0.0,0.060667,0.012712,0.000000
452227,0.000000,0.060606,0.032663,0.018868,0.015625,0.0500,0.33,0.000000,0.013889,0.05,...,0.238877,0.000000,0.000000,0.0,0.0,0.0,0.0,0.047362,0.012712,0.000000
1771160,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.74,0.483410,0.000000,0.00,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.013926,0.008475,0.719324


##### Data Preprocessing for Logistic Regression

In [24]:
y

0          0.4444
1          0.6400
2          0.7755
3          0.1667
4          0.1875
            ...  
4446961    0.1786
4446962    0.2935
4446963    0.4815
4446964    0.8000
4446965    0.5464
Name: winPlacePerc, Length: 4446966, dtype: float64

In [25]:
for i in range(len(y)):
    if y[i]>0.5:
        y[i]=1
    else:
        y[i]=0

In [26]:
for i in range(len(y)):
    y[i]=int(y[i])

In [27]:
type(y)

pandas.core.series.Series

In [28]:
# Traning and testing data creation
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=8)

In [29]:
# Using logistic regression
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [30]:
# Prediction
y_pred=clf.predict(X_test)
y_pred

array([1., 0., 0., ..., 0., 1., 0.])

In [31]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,classification_report,f1_score
acc=accuracy_score(y_test,y_pred)
acc*100

91.29969394891353

### Conclusion

In this Machine learning model I have used Linear Regression and Logistic Regression the test accuracy of Linear Regression was 83%<br>
Whereas using Logistic Regression the test accuracy was 91.29% Hence there is difference in both and we can conclude that Logistic Regression performs better for this Machine Learning model