In [None]:
import numpy as np 
import pandas as pd 
import os
print(os.listdir("../input"))

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew 

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, SGDRegressor,Ridge
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split,GridSearchCV
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
import lightgbm as lgb

In [None]:
# Load the training split; it carries the target column `winPlacePerc`
# in addition to the feature columns.
train = pd.read_csv('../input/train.csv')
train.head()

In [None]:
# Peek at the last two rows of the training data.
train.tail(2)

In [None]:
# (number of rows, number of columns) of the training data.
train.shape

In [None]:
# Load the test split from the same input directory as the training data.
test = pd.read_csv('../input/test.csv')
test.head()

In [None]:
# (number of rows, number of columns) of the test data.
test.shape

#### target

In [None]:
# Columns present in train but absent from test, in train's column order.
train.columns[~train.columns.isin(test.columns)].tolist()

In [None]:
# First few values of the target column.
train['winPlacePerc'].head()

In [None]:
# Range check on the target: (largest, smallest) observed value.
(train['winPlacePerc'].max(), train['winPlacePerc'].min())

In [None]:
# Separate the target from the predictors.
Y_train = train['winPlacePerc']
# `columns=` instead of a positional axis: pandas 2.x no longer accepts
# `drop(label, 1)`.
X_train = train.drop(columns='winPlacePerc')

# Stack train on top of test so that EDA and feature engineering see both.
# `ignore_index=True` gives every row a unique label. Without it the train and
# test parts share the labels 0..n-1, and any later label-based write such as
# `data_all.loc[subset.index, ...] = value` would hit rows of BOTH parts.
# The first len(train) rows are still the training rows, in their original order.
data_all = pd.concat([X_train, test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the stacked train + test feature frame.
data_all.shape

## EDA

### missing values

In [None]:
# Per-column count of missing values; show the two columns with the most.
(
    data_all
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(2)
)

#### Great! There are no missing values!

### relationship between features and our target

In [None]:
# Heatmap of the k columns most positively correlated with the target.
k = 25
corrmat = train.corr()

# Pick the k columns with the largest correlation to `winPlacePerc`
# (the target itself comes first, with correlation 1).
cols = corrmat['winPlacePerc'].nlargest(k).index

# Correlation matrix restricted to those columns (rows of the transposed
# array are the variables).
cm = np.corrcoef(train[cols].values.T)

plt.subplots(figsize=(20, 20))
sns.set(font_scale=1.25)

hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)

## different game modes
#### As other kernels have noticed, there are different modes in this game (roughly: solo, duo, squad, zombie and DIY?)

#### First, the number of teams (numGroups) should tell us something about the modes

In [None]:
# Distribution of the number of groups per row, over train + test together.
plt.figure(figsize=(12,6))
# A one-column DataFrame (double brackets) is passed to seaborn here.
sns.distplot(data_all[['numGroups']])
# Disabled alternative: the same plot for `maxPlace` (drawn in a later cell).
# sns.distplot(data_all[['maxPlace']])

#### As we can see from the plot, the number of teams is not always 25 for squad, 50 for duo and 100 for solo. There are probably three reasons:
#### 1. Games do not always start with 100 players
#### 2. Some crazy people join a squad (or duo) game alone, so the number of teams is sometimes more than 50 for duos and more than 25 for squads
#### 3. Zombie mode and custom games made by players (DIY)

#### Therefore, the total player number of the game (maxPlace) may help us

In [None]:
# Distribution of `maxPlace` over train + test, with red guide lines at the
# two reference values 50 and 25.
plt.figure(figsize=(12, 6))
sns.distplot(data_all[['maxPlace']])
for reference_value in (50, 25):
    plt.axvline(reference_value, color='r')

#### Things become much more obvious in this plot. By the way, we can see that most of the games did not start with 100 players
#### I also noticed that maxPlace is different from numGroups, which confuses me, as I thought maxPlace should be the number of groups......
#### Anyway, I'll just leave it here

#### The most important thing I will do next is to figure out how to split the games by mode
#### As we can see from the plot, the mode of most games is clear, but for those in the valleys things become tricky (for example, those with maxPlace between 60 and 80)
#### I will use matchId to help me, as a solo game should not have a team with 2 players, and a duo game should not have a team with 4

In [None]:
# Size of every team: number of player rows (`Id`) per (matchId, groupId).
# Only `Id` is counted; counting every column of the frame just to read one
# of them is wasted work on a table this size.
team_sizes = data_all.groupby(['matchId', 'groupId'])['Id'].count()

# Largest team in each match, as a one-column frame ('Id') indexed by matchId.
most_players_perteam = team_sizes.groupby(level='matchId').max().to_frame()
most_players_perteam.sort_values(by='Id', ascending=False)

In [None]:
# Classify each match by the size of its largest team:
# exactly 1 -> solo, exactly 2 -> duo, exactly 4 -> squad; the rest is unclassified.
largest_team = most_players_perteam['Id']
solo_games_matchId = most_players_perteam[largest_team == 1].index
duo_games_matchId = most_players_perteam[largest_team == 2].index
squa_games_matchId = most_players_perteam[largest_team == 4].index

# Row masks: which rows of data_all belong to a match of each class.
in_solo = data_all['matchId'].isin(solo_games_matchId)
in_duo = data_all['matchId'].isin(duo_games_matchId)
in_squa = data_all['matchId'].isin(squa_games_matchId)

solo_games = data_all[in_solo]
duo_games = data_all[in_duo]
squa_games = data_all[in_squa]

# Everything that matched none of the three classes.
unknow_games = data_all[~(in_solo | in_duo | in_squa)]

# One wide panel for all rows on top, then a 2x2 grid with one panel per class.
# Each panel overlays the numGroups and maxPlace distributions.
plt.figure(figsize=(16, 15))
panels = [
    (311, data_all, 'all'),
    (323, solo_games, 'Solo'),
    (324, duo_games, 'Duo'),
    (325, squa_games, 'Squa'),
    (326, unknow_games, 'Unknow'),
]
for position, games, title in panels:
    plt.subplot(position)
    sns.distplot(games[['numGroups']])
    sns.distplot(games[['maxPlace']])
    plt.title(title)

## What?????????????????????
#### The results seem ridiculous!

#### Let's see some of those games  
#### matchId 7: it has exactly one group with 4 players, and all other groups have fewer than 4

In [None]:
# Rows of one specific group (groupId 2612504) in match 7.
squa_games.loc[squa_games['matchId'].eq(7) & squa_games['groupId'].eq(2612504)]

#### another one

In [None]:
# Rows of three specific groups in match 190, ordered by group.
groups_of_interest = [401596, 401598, 401582]
squa_games.loc[
    squa_games['matchId'].eq(190) & squa_games['groupId'].isin(groups_of_interest)
].sort_values(by='groupId')

#### The data shows they are real players! So it seems that even when you join a game in duo mode (or solo?), you may still meet a team with 4 players? Or maybe the two of you joined in squad mode and were placed into a game where most of the teams have 2 players? I am not sure, as I'm not a player of this game.....

#### Let's see what happens if we replace the max with the median or the mode

#### median

In [None]:
# Size of every team: number of player rows (`Id`) per (matchId, groupId).
# Only `Id` is counted instead of every column of the frame.
team_sizes = data_all.groupby(['matchId', 'groupId'])['Id'].count()

# Median team size in each match, as a one-column frame ('Id') indexed by
# matchId. The median can be fractional (e.g. 3.5) when a match has an even
# number of teams.
median_players_perteam = team_sizes.groupby(level='matchId').median().to_frame()

In [None]:
# Classify each match by its median team size:
# exactly 1 -> solo, exactly 2 -> duo, anything in [3, 4] -> squad;
# the rest (including fractional medians such as 1.5 or 2.5) is unclassified.
median_team = median_players_perteam['Id']
solo_games_matchId = median_players_perteam[median_team == 1].index
duo_games_matchId = median_players_perteam[median_team == 2].index
squa_games_matchId = median_players_perteam[median_team.between(3, 4)].index

# Row masks: which rows of data_all belong to a match of each class.
in_solo = data_all['matchId'].isin(solo_games_matchId)
in_duo = data_all['matchId'].isin(duo_games_matchId)
in_squa = data_all['matchId'].isin(squa_games_matchId)

solo_games = data_all[in_solo]
duo_games = data_all[in_duo]
squa_games = data_all[in_squa]

# Everything that matched none of the three classes.
unknow_games = data_all[~(in_solo | in_duo | in_squa)]

# One wide panel for all rows on top, then a 2x2 grid with one panel per class.
# Each panel overlays the numGroups and maxPlace distributions.
plt.figure(figsize=(16, 15))
panels = [
    (311, data_all, 'all'),
    (323, solo_games, 'Solo'),
    (324, duo_games, 'Duo'),
    (325, squa_games, 'Squa'),
    (326, unknow_games, 'Unknow'),
]
for position, games, title in panels:
    plt.subplot(position)
    sns.distplot(games[['numGroups']])
    sns.distplot(games[['maxPlace']])
    plt.title(title)

#### mode

In [None]:
# Size of every team: number of player rows (`Id`) per (matchId, groupId).
# Only `Id` is counted instead of every column of the frame.
team_sizes = data_all.groupby(['matchId', 'groupId'])['Id'].count()

# Most frequent team size in each match, as a one-column frame ('Id') indexed
# by matchId. When several sizes tie for most frequent, their mean is used,
# so the value can be fractional.
mode_players_perteam = (
    team_sizes
    .groupby(level='matchId')
    .agg(lambda sizes: np.mean(sizes.mode()))
    .to_frame()
)

In [None]:
# Classify each match by its most frequent team size:
# exactly 1 -> solo, exactly 2 -> duo, anything in [3, 4] -> squad;
# the rest is unclassified.
typical_team = mode_players_perteam['Id']
solo_games_matchId = mode_players_perteam[typical_team == 1].index
duo_games_matchId = mode_players_perteam[typical_team == 2].index
squa_games_matchId = mode_players_perteam[typical_team.between(3, 4)].index

# Row masks: which rows of data_all belong to a match of each class.
in_solo = data_all['matchId'].isin(solo_games_matchId)
in_duo = data_all['matchId'].isin(duo_games_matchId)
in_squa = data_all['matchId'].isin(squa_games_matchId)

solo_games = data_all[in_solo]
duo_games = data_all[in_duo]
squa_games = data_all[in_squa]

# Everything that matched none of the three classes.
unknow_games = data_all[~(in_solo | in_duo | in_squa)]

# One wide panel for all rows on top, then a 2x2 grid with one panel per class.
# Each panel overlays the numGroups and maxPlace distributions.
plt.figure(figsize=(16, 15))
panels = [
    (311, data_all, 'all'),
    (323, solo_games, 'Solo'),
    (324, duo_games, 'Duo'),
    (325, squa_games, 'Squa'),
    (326, unknow_games, 'Unknow'),
]
for position, games, title in panels:
    plt.subplot(position)
    sns.distplot(games[['numGroups']])
    sns.distplot(games[['maxPlace']])
    plt.title(title)

#### Therefore, the median and the mode work better if you want to find out which mode a game is
#### But there are still two more kinds of modes in the data: zombie and DIY
#### For zombie games, it seems many players are put into one team (as zombies), so we may be able to split them off using the maximum team size
#### For DIY games, things become complicated and I still don't know how to deal with them~

#### (sorry for my poor English...)

#### Here, I will simply put the games into 4 kinds: Solo, Duo, Squad and Others (as Zombie is part of the DIY mode)

## Feature Engineering

#### copy data_all for feature use

In [None]:
# Keep an untouched copy of the stacked frame; the cells below add a column to
# `data_all` and drop several others.
data_all_whole = data_all.copy()

### Based on the exploration above, we roughly split the games into 4 modes

In [None]:
# Label every row with the game mode of its match.
#
# Rows are selected with boolean masks on `matchId`, not with
# `.loc[subset.index]`: data_all is train stacked on test, so unless the index
# was reset the same row label occurs in both parts, and a label-based write
# would also relabel the unrelated row that happens to share the label.
mode_match_ids = {
    'Solo': solo_games_matchId,
    'Duo': duo_games_matchId,
    'Squa': squa_games_matchId,
}

# Matches that fall in none of the three classes keep the default label.
data_all['modes'] = 'Unknow'
for mode_label, match_ids in mode_match_ids.items():
    # `.values` makes the mask purely positional (no index alignment).
    in_mode = data_all['matchId'].isin(match_ids).values
    data_all.loc[in_mode, 'modes'] = mode_label

In [None]:
# Sanity check: which mode labels were actually assigned.
data_all['modes'].unique()

In [None]:
# Remove the identifier columns and the two group-count columns before encoding.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects,
# and rebinding the name avoids mutating the frame in place.
data_all = data_all.drop(columns=['Id', 'groupId', 'matchId', 'numGroups', 'maxPlace'])

In [None]:
# One-hot encode the remaining non-numeric columns (here the `modes` label
# added above), then show the resulting (rows, columns).
data_all = pd.get_dummies(data_all)
data_all.shape

## Modelling

In [None]:
# Undo the stacking: the first `ntrain` rows of data_all are the training rows,
# everything after that is the test set. Slicing is positional.
ntrain = len(train)
X_train = data_all.iloc[:ntrain]
test = data_all.iloc[ntrain:]

In [None]:
# Validation function
n_folds = 5
def rmsle_cv(model):
    """Cross-validate `model` on (X_train, Y_train) and return the per-fold MAE.

    The function keeps its historical name, but the metric is the mean
    absolute error (scoring="neg_mean_absolute_error"), not an RMSE/RMSLE.

    Parameters
    ----------
    model : scikit-learn compatible regressor
        Unfitted estimator; it is cloned and fitted once per fold by
        `cross_val_score`.

    Returns
    -------
    numpy.ndarray of shape (n_folds,)
        Mean absolute error on each held-out fold (lower is better).
    """
    # Pass the splitter itself. Calling `.get_n_splits()` on it returns the
    # plain integer 5, which makes cross_val_score fall back to an unshuffled
    # K-fold and silently ignores `shuffle` and `random_state`.
    kf = KFold(n_folds, shuffle=True, random_state=2)

    neg_mae = cross_val_score(model, X_train, Y_train,
                              scoring="neg_mean_absolute_error", cv=kf)
    # scikit-learn reports errors negated ("greater is better"); flip the sign.
    # No square root: sqrt only makes sense for a squared-error score.
    return -neg_mae

In [None]:
# Linear baselines. Ridge and BayesianRidge use the library defaults; the two
# L1-regularised models get an explicit penalty strength and iteration cap.
lasso = Lasso(
    alpha=0.01,
    max_iter=1000,
)
ENet = ElasticNet(
    alpha=0.001,
    max_iter=1000,
)
ridge = Ridge()
BayesRR = BayesianRidge()

# Kernel ridge regression with a degree-2 polynomial kernel.
KRR = KernelRidge(
    alpha=0.6,
    kernel='polynomial',
    degree=2,
    coef0=2.5,
)

In [None]:
# Cross-validate each linear baseline and print "mean (std)" of its fold scores.
# KRR is deliberately not in the list: its evaluation was already disabled
# (commented out) in the previous version of this cell.
linear_baselines = [
    ("Lasso", lasso),
    ("ElasticNet", ENet),
    ("BayesRR", BayesRR),
    ("ridge", ridge),
]
for label, estimator in linear_baselines:
    score = rmsle_cv(estimator)
    print("\n{} score: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()))

In [None]:
# LightGBM regressor. The hyper-parameters are collected in one dict so they
# can be read (and tuned) at a glance, then passed through unchanged.
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)

In [None]:
# Cross-validate the LightGBM model with the same validation function as the
# linear baselines and print "mean (std)" of its fold scores.
score = rmsle_cv(model_lgb)
print("\nmodel_lgb score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))