In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, kstest
from scipy.stats import kurtosis, skew
from scipy import stats

In [None]:
# Load both competition files in one pass; the first path becomes `train`, the second `test`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/pubg-finish-placement-prediction/train_V2.csv',
        '../input/pubg-finish-placement-prediction/test_V2.csv',
    )
)

In [None]:
# Lift the column-count display limit so wide frames are shown without "..." truncation,
# then preview the first rows of the training data.
pd.set_option('display.max_columns', None)
train.head(n=5)

## Descriptive analysis

In [None]:
# Only the kills column is needed for this plot, so only that column is copied.
data = train[['kills']].copy()

# Everything above the 99th percentile is pooled into a single overflow bucket.
kills_cap = int(data['kills'].quantile(0.99))
overflow_label = '{}+'.format(kills_cap + 1)
in_tail = data['kills'] > kills_cap

# Relabel the kills column only. The previous row-wide assignment
# (data.loc[mask] = '8+') overwrote every column of the matching rows.
data['kills'] = data['kills'].astype(str)
data.loc[in_tail, 'kills'] = overflow_label

# Explicit bar order keeps the buckets in numeric order whatever the cap turns out to be.
bar_order = [str(k) for k in range(kills_cap + 1)] + [overflow_label]

plt.figure(figsize=(15,10))
sns.countplot(x=data['kills'], order=bar_order)
plt.title("Kill Count",fontsize=15)
plt.show()

In [None]:
# Restrict the plot to the six standard queue types listed here.
main_match_types = ['duo','duo-fpp','solo','solo-fpp','squad','squad-fpp']
data_types = train[train['matchType'].isin(main_match_types)]

plt.figure(figsize=(15,10))
# Pass the column as `x=` (a positional Series is not treated as the x variable by
# current seaborn) and fix the bar order via `order=` instead of sorting every row.
sns.countplot(x=data_types['matchType'], order=main_match_types)
plt.title("Match Types",fontsize=15)
plt.show()

In [None]:
kills = train['kills']

# Quartiles first, because the IQR feeds two of the printed figures.
lower_quartile, upper_quartile = kills.quantile(0.25), kills.quantile(0.75)
IQR = upper_quartile - lower_quartile
standard_error = kills.std() / np.sqrt(kills.count())

# Location and spread of the kill count, printed in a fixed order.
summary_rows = [
    ('Mode:', kills.mode()[0]),
    ('Median:', kills.median()),
    ('Mean:', kills.mean()),
    ('Range:', kills.max() - kills.min()),
    ('S.E. mean:', standard_error),
    ('IQR:', IQR),
    ('IQR deviation:', IQR / 2),
]
for caption, value in summary_rows:
    print(caption, value)

# The decile ratio is computed on players with at least one kill.
kills_non_zero = kills[kills > 0]
upper_decile, lower_decile = (kills_non_zero.quantile(q) for q in (0.9, 0.1))
print('Non-zero decile ratio:', upper_decile / lower_decile)

In [None]:
# `walkdistance` is reused by later cells (normality test, scatter plot).
walkdistance = train['walkDistance']

fig, ax = plt.subplots(figsize=(15,10))
ax.set_title("Walking Distance Distribution", fontsize=15)
sns.distplot(walkdistance, ax=ax)
plt.show()

In [None]:
# Fit a normal distribution to the sample (norm.fit returns loc, scale) and run a
# one-sample Kolmogorov-Smirnov test of the sample against that fitted distribution.
fitted_normal = norm(*norm.fit(walkdistance))
kstest(walkdistance, fitted_normal.cdf)

p-value < 0.05 => not normally distributed

In [None]:
# scipy's kurtosis() defaults to Fisher's definition, where a normal distribution scores 0.
# fisher=False switches to Pearson's definition (normal distribution = 3), which is the
# scale on which the value is compared against 3.
print('kurtosis of distribution: {}'.format(kurtosis(walkdistance, fisher=False)))
print('skewness of distribution: {}'.format(skew(walkdistance)))

Kurtosis < 3 => platykurtic <br>
Skewness > 1 => positively skewed

In [None]:
# `damageDealt` is reused by later cells (normality test, shape statistics).
damageDealt = train['damageDealt']

fig, ax = plt.subplots(figsize=(15,10))
ax.set_title("Damage Dealt", fontsize=15)
sns.distplot(damageDealt, ax=ax)
plt.show()

In [None]:
# Fit a normal distribution to the sample (norm.fit returns loc, scale) and run a
# one-sample Kolmogorov-Smirnov test of the sample against that fitted distribution.
fitted_normal = norm(*norm.fit(damageDealt))
kstest(damageDealt, fitted_normal.cdf)

p-value < 0.05 => not normally distributed

In [None]:
# scipy's kurtosis() defaults to Fisher's definition, where a normal distribution scores 0.
# fisher=False switches to Pearson's definition (normal distribution = 3), which is the
# scale on which the value is compared against 3.
print('kurtosis of distribution: {}'.format(kurtosis(damageDealt, fisher=False)))
print('skewness of distribution: {}'.format(skew(damageDealt)))

Kurtosis > 3 => leptokurtic <br>
Skewness > 1 => positively skewed

## Relations

In [None]:
# Correlate numeric columns only: the frame also holds string columns (ids, match type),
# and DataFrame.corr() raises on those in pandas 2.x instead of silently dropping them.
numeric_corr = train.select_dtypes(include='number').corr()

f,ax = plt.subplots(figsize=(15, 15))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt= '.1f',ax=ax)
plt.show()

In [None]:
killsPlace = train['killPlace']

plt.figure(figsize=(15,10))
# The x data is the kill-place column, matching the axis label and title.
# Previously damageDealt was plotted here and killsPlace was never used.
plt.scatter(killsPlace, walkdistance, c = 'green', edgecolor = 'none', marker = '.')
plt.xlabel('kills place')
plt.ylabel('walk distance')
plt.title('Correlation between kills place and walk distance')
plt.show()

In [None]:
weaponsAcquired = train['weaponsAcquired']

# One dot per player: weapons picked up (x) against kills (y).
fig, ax = plt.subplots(figsize=(15,10))
ax.scatter(weaponsAcquired, kills, c='blue', edgecolor='none', marker='.')
ax.set_xlabel('weaponsAcquired')
ax.set_ylabel('kills')
ax.set_title('Correlation between weapons acquired and kills')
plt.show()

## Tests

In [None]:
# Split players into those with at least one kill and those with exactly zero kills.
kill_counts = train['kills']
killers = train.loc[kill_counts.gt(0)]
non_killers = train.loc[kill_counts.eq(0)]

We use the Mann–Whitney U test because the variables are not normally distributed and the two groups (killers and non-killers) are independent samples. <br>
H0: the variable has the same distribution in the killers and non-killers groups. <br>
H1: the distributions differ (values in one group tend to be larger than in the other).


In [None]:
# The alternative is stated explicitly: H1 is "the groups differ" (two-sided), and the
# default value of `alternative` is not the same across SciPy versions.
stats.mannwhitneyu(killers['walkDistance'].dropna(),
                   non_killers['walkDistance'].dropna(),
                   alternative='two-sided')

p-value < 0.05 => the walk-distance distributions of killers and non-killers differ

In [None]:
# The alternative is stated explicitly: H1 is "the groups differ" (two-sided), and the
# default value of `alternative` is not the same across SciPy versions.
stats.mannwhitneyu(killers['matchDuration'].dropna(),
                   non_killers['matchDuration'].dropna(),
                   alternative='two-sided')

p-value < 0.05 => the match-duration distributions of killers and non-killers differ

## Cluster Analysis

In [None]:
def plot_dendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted clustering model and draw its dendrogram.

    Parameters
    ----------
    model
        Object exposing ``children_`` (one row of two child indices per merge),
        ``distances_`` (one value per merge) and ``labels_`` (one entry per sample).
        Presumably a fitted ``AgglomerativeClustering`` -- confirm at the call site.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    dict
        The value returned by ``dendrogram`` (previously discarded).
    """
    # Imported locally so the function does not rely on a later cell importing the name.
    from scipy.cluster.hierarchy import dendrogram

    children = np.asarray(model.children_)
    n_samples = len(model.labels_)

    # counts[i] = number of original samples underneath the node created by merge i.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to the node created by an earlier merge.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Columns: child a, child b, merge distance, samples under the merged node.
    linkage_matrix = np.column_stack([children, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
from sklearn import preprocessing

# Clustering features, limited to the first 10000 rows.
columns = train[['kills','walkDistance','swimDistance','weaponsAcquired','assists']][:10000]

# Standardise each feature, then replace any remaining NaN with 0.
scaler = preprocessing.StandardScaler()
columns_std = pd.DataFrame(scaler.fit_transform(columns.values)).fillna(0)
columns_std

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

# distance_threshold=0 with n_clusters=None builds the complete merge tree.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(columns_std)

plt.title('Hierarchical Clustering Dendrogram')
# Show only the top of the tree (level truncation with p=2).
plot_dendrogram(model, truncate_mode='level', p=2)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
# Flat 4-cluster assignment. Despite its name, `model_4` ends up holding the array of
# cluster labels returned by fit_predict, not the estimator.
model_4 = AgglomerativeClustering(n_clusters=4).fit_predict(columns_std)
model_4

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first three principal components.
pca = PCA(n_components=3)
columns_pca = pd.DataFrame(pca.fit_transform(columns_std))
columns_pca

In [None]:
# Attach each row's cluster id to its PCA coordinates, then show the cluster sizes.
columns_pca = columns_pca.assign(label=model_4)
columns_pca['label'].value_counts()

In [None]:
u_labels = np.unique(model_4)

# Kept so the '3d' projection is registered on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(15,10))
# add_subplot attaches the 3D axes to the figure; constructing Axes3D(fig) directly
# no longer does so on recent matplotlib and leaves the figure empty.
ax = fig.add_subplot(111, projection='3d')

# One scatter series per cluster, drawn in the space of the three PCA components.
for i in u_labels:
    cluster_points = columns_pca[columns_pca['label'] == i]
    ax.scatter(cluster_points[0], cluster_points[1], cluster_points[2],
               label='Group {}'.format(i))  # labelled so the legend has entries

ax.legend()
plt.show()



In [None]:
# Summary statistics of the original (unscaled) features, one table per cluster.
for group_id in u_labels:
    in_group = columns_pca['label'] == group_id
    print('Group # ', group_id)
    print(columns[in_group].describe())
    print('\n')

Group 0: Killers & looters <br>
Group 1: Assistants <br>
Group 2: Swimmers <br>
Group 3: Losers <br>

## Regression model

In [None]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Rows without a target cannot be used for training. A new frame is built rather than
# dropping in place, so `train` is left untouched and this cell can be re-run safely.
train_labeled = train.dropna(subset=["winPlacePerc"])

# Size of the modelling subset (the old variable name said 10k, but it held 100000 rows).
N_MODEL_ROWS = 100000
train_subset = train_labeled[:N_MODEL_ROWS]

# Identifier and categorical columns are removed together with the target itself.
X = train_subset.drop(["Id","groupId","matchId","matchType","winPlacePerc"], axis=1)
y = train_subset["winPlacePerc"]

col_names = X.columns
# NOTE(review): Normalizer rescales each ROW to unit norm; it is not per-feature scaling.
# Confirm this is the intended preprocessing.
transformer = Normalizer().fit(X)
X = transformer.transform(X)

In [None]:
# Rebuild a labelled frame from the transformed array. Reusing y's index keeps the
# features and the target aligned by row label, not only by position.
X = pd.DataFrame(X, columns=col_names, index=y.index)

In [None]:
# Fixed seed so the 80/20 split, and every number derived from it, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

D_train = xgb.DMatrix(X_train, label=Y_train)
D_test = xgb.DMatrix(X_test, label=Y_test)

In [None]:
param = {
    'eta': 0.15, 
    'max_depth': 5,  
    'num_class': 2} 

steps = 20  # The number of training iterations
model = xgb.train(param, D_train, steps)

In [None]:
fig, ax1 = plt.subplots(figsize=(8,15))
xgb.plot_importance(model, ax=ax1)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("MSE = {}".format(mean_squared_error(Y_test, best_preds)))