In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import cufflinks as cf
import plotly

import os
# Show the contents of the ../input directory so the data file names used
# by the loading cells below can be checked against what is actually there.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV into `train`, print its (rows, columns) shape, and
# leave the first rows as the cell's displayed output.
train = pd.read_csv('../input/train_V2.csv')
print(train.shape)
train.head()

In [None]:
# Print the column names, dtypes, and memory usage summary of the training frame.
train.info()

In [None]:
# Summary statistics of the prediction target column.
train['winPlacePerc'].describe()

In [None]:
# Read the test CSV into `test`, print its (rows, columns) shape, and
# leave the first rows as the cell's displayed output.
test = pd.read_csv('../input/test_V2.csv')
print(test.shape)
test.head()

# EDA

### Check for missing data

In [None]:
# Total number of missing cells in the training frame
# (first sum is per column, second sum adds the columns together).
train.isnull().sum().sum()

In [None]:
# Total number of missing cells in the test frame
# (first sum is per column, second sum adds the columns together).
test.isnull().sum().sum()

In [None]:
# Fill any missing target values with 1.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `train.winPlacePerc`: an in-place fill on a
# column selected from the frame is chained assignment, which pandas warns
# about and which leaves `train` untouched when Copy-on-Write is enabled.
# NOTE(review): filling with 1 treats the affected rows as first place;
# confirm that is intended rather than dropping those rows.
train['winPlacePerc'] = train['winPlacePerc'].fillna(1)

# Sanity check: this selection should now be empty.
train.loc[train['winPlacePerc'].isnull()]

In [None]:

# Combine the three distance columns into a single `distance` feature and the
# headshot/road kill counts into a single `skill` feature, then drop the
# source columns.
distance_cols = ['rideDistance', 'walkDistance', 'swimDistance']
skill_cols = ['headshotKills', 'roadKills']
source_cols = distance_cols + skill_cols

# Guard so the cell can be re-run: after the first run the source columns are
# gone, and repeating the arithmetic/drop would raise a KeyError.
if set(source_cols).issubset(train.columns):
    train["distance"] = train["rideDistance"] + train["walkDistance"] + train["swimDistance"]
    train["skill"] = train["headshotKills"] + train["roadKills"]
    # Rebind instead of dropping in place; the resulting frame is the same.
    train = train.drop(source_cols, axis=1)

print(train.shape)
train.head()

In [None]:
# Apply to the test frame the same feature engineering used for training:
# one combined `distance` feature, one combined `skill` feature, and the
# source columns dropped.
distance_cols = ['rideDistance', 'walkDistance', 'swimDistance']
skill_cols = ['headshotKills', 'roadKills']
source_cols = distance_cols + skill_cols

# Guard so the cell can be re-run: after the first run the source columns are
# gone, and repeating the arithmetic/drop would raise a KeyError.
if set(source_cols).issubset(test.columns):
    test["distance"] = test["rideDistance"] + test["walkDistance"] + test["swimDistance"]
    test["skill"] = test["headshotKills"] + test["roadKills"]
    # Rebind instead of dropping in place; the resulting frame is the same.
    test = test.drop(source_cols, axis=1)

print(test.shape)
test.head()

### Check the correlation

In [None]:
# Pairwise correlation of the numeric columns only. DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (the `numeric_only` default changed), and
# select_dtypes() gives the same numeric-only behaviour on older versions too.
corrmat = train.select_dtypes(include=[np.number]).corr()

# Order the columns by their correlation with the target, strongest first.
# nlargest(26, ...) keeps at most 26 rows of the correlation matrix.
cols = corrmat.nlargest(26, 'winPlacePerc').index

# Reuse the matrix already computed instead of running np.corrcoef over the
# full data a second time; this also keeps pandas' pairwise handling of
# missing values rather than producing NaN rows.
cm = corrmat.loc[cols, cols].values

# Annotated heatmap of the reordered correlation matrix.
sns.set(font_scale=1.25)
f, ax = plt.subplots(figsize=(15, 12))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()

## Get Train and test

In [None]:
# Columns fed to the model. The order of this list is the column order of the
# feature matrices selected with it, so it is kept unchanged.
predictors = [
    "kills", "maxPlace", "numGroups",
    "distance", "boosts", "heals",
    "revives", "killStreaks", "weaponsAcquired",
    "winPoints", "skill", "assists",
    "damageDealt", "DBNOs", "killPlace",
    "killPoints", "vehicleDestroys", "longestKill",
]
print(len(predictors))

In [None]:
# Training feature matrix: every row, only the columns named in `predictors`.
X_train = train.loc[:, predictors]
X_train.head()

In [None]:
# Training target: the `winPlacePerc` column of the training frame.
y_train = train.loc[:, 'winPlacePerc']
y_train.head()

## Build Model

In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [None]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one place
# so they can be read and tuned without touching the constructor call.
lgb_params = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'n_estimators': 720,
    'max_bin': 20,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.8,
    'metric': 'mse',
}

# Build the regressor and fit it on the full training set; the fitted
# estimator returned by fit() is the cell's displayed output.
lgb_reg = lgb.LGBMRegressor(**lgb_params)
lgb_reg.fit(X_train, y_train)

In [None]:
# Test feature matrix: every row, only the columns named in `predictors`,
# in the same order used for training.
X_test = test.loc[:, predictors]
X_test.head()

### Predict model

In [None]:
# Predict the target for every test row with the fitted model; the bare
# `y_pred` on the last line displays the predictions as the cell output.
y_pred = lgb_reg.predict(X_test)
y_pred

In [None]:
# Number of predictions that exceed 1.
int(np.count_nonzero(y_pred > 1))

In [None]:
# Constrain predictions to the closed interval [0, 1].
# Previously only values above 1 were capped; an unconstrained regression can
# just as well produce values below 0, so both bounds are enforced here.
# NOTE(review): assumes the target's valid range is [0, 1] — the upper cap of 1
# comes from the original cell; confirm the lower bound of 0.
y_pred = np.clip(y_pred, 0, 1)

In [None]:
# Re-count the predictions above 1 to confirm the capping took effect (expect 0).
int(np.count_nonzero(y_pred > 1))

In [None]:
# Cross-validate the regressor on 10 random train/validation splits.
# A fixed random_state makes the splits, and therefore the printed scores,
# reproducible across kernel restarts; without it every run shuffles differently.
ss = ShuffleSplit(n_splits=10, random_state=42)

# No `scoring` argument is passed, so each score is the estimator's own
# default score for that split.
scores = cross_val_score(lgb_reg, X_train, y_train, cv=ss)
print(scores)

In [None]:
# Mean of the cross-validation scores.
# NOTE(review): despite the variable name this is not a classification
# accuracy. The cross_val_score call that produced `scores` passes no
# `scoring` argument, so the values are the regressor's default score
# (R^2 for scikit-learn-style regressors) — confirm before reporting it.
accuracy = scores.mean()
print(accuracy)

## Feature importance

In [None]:
# Bar chart of the fitted model's feature importances, limited to the top 20.
importance_ax = lgb.plot_importance(
    lgb_reg,
    max_num_features=20,
    figsize=(12, 10),
    xlabel='Features Importance',
    ylabel='Features',
)
# Set the title on the axes returned by plot_importance (the current axes).
importance_ax.set_title('Feature importance')

### Submit the file

In [None]:
# Assemble the submission frame: the test row identifiers first, then the
# predicted target, in that column order.
test_id = test["Id"]
submit = pd.DataFrame({'Id': test_id}).assign(winPlacePerc=y_pred)
print(submit.head())


In [None]:
# Write the submission frame to submission.csv in the current directory,
# without the DataFrame index column.
submit.to_csv("submission.csv", index = False)