In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**1. Read data**

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * String/object columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max (bounds are
      inclusive). Columns that fit no signed type are left unchanged.
    * Float columns are cast to float16 or float32 when their range fits,
      otherwise to float64. Note that float16 keeps only about 3 significant
      decimal digits, so values are rounded.
    * Any other dtype (bool, datetime, existing categoricals, pandas extension
      dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; min()/max() or
        # the range comparison are not meaningful for the remaining dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN or infinite columns, where the
                # comparisons above are False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and reduce its memory usage.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # keep_date_col only has an effect when parse_dates combines several
    # columns into one, which parse_dates=True never does; the argument is
    # deprecated in recent pandas, so it is no longer passed.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load both competition files through import_data, which also downcasts the
# column dtypes. The label printed before each load identifies which file the
# memory report printed by reduce_mem_usage belongs to.
print('train')
train = import_data('../input/train_V2.csv')
print('test')
test = import_data('../input/test_V2.csv')

Importing Libraries 

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
# Set a few plotting defaults
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18
plt.rcParams['patch.edgecolor'] = 'k'
from scipy.stats import spearmanr
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from IPython.display import display

# 1. a ) **Exploratory data Analysis.**

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

All seem to be float 64 or int

In [None]:
train.isnull().sum().sum()

In [None]:
test.isnull().sum()

Looks like data needs no cleaning

We combine the train and test sets into one data frame. They can be separated again later, either by the null values of `winPlacePerc` (present only in the train set) or by the index keys 'Train' and 'Test'.

In [None]:
# Stack train and test into a single frame. `keys` adds an outer index level
# ('Train' / 'Test'), so each part can be selected again with
# final.loc['Train'] or final.loc['Test'].
final=pd.concat([train,test],keys=['Train','Test'])

**Starting with visual analysis of data**

We first start by creating buckets of percentiles. 

In [None]:
final.loc['Train'].winPlacePerc.describe()

In [None]:
def find_bucket(x):
    """Return the decile-bucket label for a placement percentile ``x``.

    Buckets are numbered from the top down: "1" covers (0.9, 1.0], "2" covers
    (0.8, 0.9], and so on down to "10" for (0, 0.1]. Anything that falls in
    none of these intervals (zero, negatives, values above 1, NaN) gets '0'.
    """
    # (exclusive lower bound, inclusive upper bound, label), checked in order.
    bands = (
        (0.9, 1.0, "1"),
        (0.8, 0.9, "2"),
        (0.7, 0.8, "3"),
        (0.6, 0.7, "4"),
        (0.5, 0.6, "5"),
        (0.4, 0.5, "6"),
        (0.3, 0.4, "7"),
        (0.2, 0.3, "8"),
        (0.1, 0.2, "9"),
        (0, 0.1, "10"),
    )
    for lower, upper, label in bands:
        if x > lower and x <= upper:
            return label
    return '0'
    

In [None]:
# Bucket label for every training row, derived from its winPlacePerc value.
seri=final.loc['Train'].winPlacePerc.apply(find_bucket)

In [None]:
# `seri` carries only the inner row labels of the 'Train' part, while `final`
# is indexed by ('Train'/'Test', row) pairs, so assigning it directly relies on
# label alignment between two differently shaped indexes. Writing through
# final.loc['Train'] does not help either: that expression is a temporary
# object, so the assignment never reaches `final`.
# Instead, place the bucket labels positionally on the training rows and leave
# the test rows as NaN.
is_train_row = final.index.get_level_values(0) == 'Train'
bucket_col = pd.Series(np.nan, index=final.index, dtype=object)
bucket_col[is_train_row] = seri.values
final['Buckets'] = bucket_col


In [None]:
final.head()

In [None]:
cols=[i for i in range(0,11)]
bucket_mapping={"1":"0.9-1.0","2":"0.8-0.9","3":"0.7-0.8",
                "4":"0.6-0.7","5":"0.5-0.6","6":"0.4-0.5",
                "7":"0.3-0.4","8":"0.2-0.3","9":"0.1-0.2",
                "10":"0.0-0.1","0":"O (ZERO)"}

# value_counts() returns the buckets ordered by frequency, but the tick labels
# below are in bucket_mapping order. Reindex the counts to that same order so
# every bar sits under its own label; fill_value=0 keeps one bar per bucket
# even when a bucket has no rows, so the number of ticks always matches.
bucket_counts=final.loc['Train']["Buckets"].value_counts().reindex(list(bucket_mapping.keys()),fill_value=0)
bucket_counts.plot.bar(figsize=(8,6),edgecolor='k',linewidth=2)
plt.xticks(cols,list(bucket_mapping.values()), rotation=60)
plt.ylabel("Number of people")
plt.xlabel("Bucket category")
plt.title("Number of people V/S Percentile they are in")

In [None]:
final_2=final.copy()

In [None]:
# Columns to remove from final_2 before the per-group feature work.
rem_grp_by_groupId=['Buckets']
# Keep the identifier and bucket columns of the training part aside.
id_holder_train=final_2.loc['Train'].Id
matchid_holder_train=final_2.loc['Train'].matchId
Bucket_holder_train=final_2.loc['Train'].Buckets

# Same for the test part; id_holder_test is later written into the submission.
id_holder_test=final_2.loc['Test'].Id
matchid_holder_test=final_2.loc['Test'].matchId
Bucket_holder_test=final_2.loc['Test'].Buckets
# Buckets was only needed for the plot above; Id and matchId stay in final_2.
final_2=final_2.drop(rem_grp_by_groupId,axis=1)


In [None]:
final_2.loc['Test'].shape#1867913

In [None]:
# Take explicit copies: the following cells add columns to train and test, and
# writing into a selection of final_2 triggers pandas' SettingWithCopyWarning
# and leaves it unclear which object is actually modified.
train=final_2.loc['Train'].copy()
test=final_2.loc['Test'].copy()

In [None]:

def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, computed in float32.

    A non-zero value divided by zero gives +/-inf, which fillna() does not
    touch and which would later poison any mean/std taken over the column.
    Those entries are mapped to NaN, the same result 0/0 already produces.
    """
    ratio = numerator.astype(np.float32) / denominator.astype(np.float32)
    return ratio.replace([np.inf, -np.inf], np.nan)


def _add_ratio_features(df):
    """Add the kill-share and distance-share columns to ``df`` in place."""
    kills = df['kills']
    # Sum the distances in float32: the columns may have been downcast to a
    # narrower float type when loaded, and their sum can exceed that range.
    total_distance = (df['swimDistance'].astype(np.float32)
                      + df['rideDistance'].astype(np.float32)
                      + df['walkDistance'].astype(np.float32))

    # Share of kills that were headshots / assisted / road kills.
    df['perc_kill_headshot'] = _safe_ratio(df['headshotKills'], kills)
    df['perc_kill_assist'] = _safe_ratio(df['assists'], kills)
    df['perc_kill_road'] = _safe_ratio(df['roadKills'], kills)

    # Share of the total travelled distance covered by each mode of movement.
    df['perc_dist_ride'] = _safe_ratio(df['rideDistance'], total_distance)
    df['perc_dist_swim'] = _safe_ratio(df['swimDistance'], total_distance)
    df['perc_dist_walk'] = _safe_ratio(df['walkDistance'], total_distance)
    return df


# Apply the identical feature definitions to both frames. Rows with zero kills
# or zero distance end up as NaN here.
for _frame in (train, test):
    _add_ratio_features(_frame)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encodes every dataframe column of dtype 'category'.
def dummyEncode(df):
    """Replace each 'category' column of ``df`` with integer labels.

    Columns are encoded one at a time with scikit-learn's LabelEncoder and
    written back into ``df``, which is modified in place and returned. A
    column that cannot be encoded is reported and left as it was, so the
    remaining columns are still processed.

    Note that the encoder is fitted separately on each call, so the integer
    assigned to a given category is only comparable between two frames when
    both contain the same set of category values.
    """
    columnsToEncode = list(df.select_dtypes(include=['category']))
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C / kernel
            # interrupts still stop the run, and report why encoding failed.
            print('Error encoding '+feature+': '+repr(exc))
    return df
train=dummyEncode(train)
test=dummyEncode(test)

In [None]:
train=train.fillna(0)
test=test.fillna(0)

In [None]:
train.head()
#train=train.fillna(0)
#test=test.fillna(0)


In [None]:
def aggregatorF(final_2,groups):
    """Aggregate every non-key column of ``final_2`` per group.

    Parameters
    ----------
    final_2 : pandas.DataFrame
        Frame to aggregate.
    groups : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, with two-level columns: the source column name on
        the first level and the statistic ('min', 'max', 'size') on the second.
    """
    return final_2.groupby(groups).agg(['min', 'max', 'size'])

In [None]:
gby_gid=['groupId']
aag_train=aggregatorF(train,gby_gid)


In [None]:
# Flatten the two-level (column, statistic) header into 'column-statistic'
# names. The names are built from the column labels themselves, in their
# actual order, so each name is guaranteed to describe the data beneath it;
# rebuilding them from columns.levels instead depends on the level values
# being stored in the same order as the columns.
new_cols=[f'{c}-{stat}' for c, stat in aag_train.columns]
aag_train.columns=new_cols
aag_train.head().describe()

In [None]:
aag_train=aag_train.reset_index()

In [None]:
train2=train.merge(aag_train,on='groupId',how='left')

In [None]:
gby_gid=['groupId']
aag_test=aggregatorF(test,gby_gid)

In [None]:
# Flatten the two-level (column, statistic) header into 'column-statistic'
# names. The names are built from the column labels themselves, in their
# actual order, so each name is guaranteed to describe the data beneath it;
# rebuilding them from columns.levels instead depends on the level values
# being stored in the same order as the columns.
new_cols=[f'{c}-{stat}' for c, stat in aag_test.columns]
aag_test.columns=new_cols
aag_test.head().describe()

In [None]:
aag_test=aag_test.reset_index()

In [None]:
test2=test.merge(aag_test,on='groupId',how='left')

In [None]:
train2.head()

In [None]:
# Group statistics of the target and of the identifier columns are not
# features: build their 'column-statistic' names and drop them from both frames.
to_remove=[f'{col}-{stat}'
           for col in ('winPlacePerc','Id','matchId')
           for stat in ('size','min','max')]
train2=train2.drop(columns=to_remove)
test2=test2.drop(columns=to_remove)

In [None]:
train2.groupId


In [None]:
# The triple-quoted block below is a bare string literal, i.e. disabled code
# with no effect at run time; it repeats the holder assignments made in an
# earlier cell.
'''id_holder_train=final_2.loc['Train'].Id
matchid_holder_train=final_2.loc['Train'].matchId
Bucket_holder_train=final_2.loc['Train'].Buckets

id_holder_test=final_2.loc['Test'].Id
matchid_holder_test=final_2.loc['Test'].matchId
Bucket_holder_test=final_2.loc['Test'].Buckets'''
# Keep the group ids aside, then remove all identifier columns so that only
# feature columns (and the target) remain in train2 / test2.
groupId_holder_train=train2.groupId
groupId_holder_test=test2.groupId
train2=train2.drop(['Id','matchId','groupId'],axis=1)
test2=test2.drop(['Id','matchId','groupId'],axis=1)

Starting with z scores now

In [None]:
train2=train2.fillna(0)
test2=test2.fillna(0)

In [None]:
train2.head()

**Selectors and corr mapping**

In [None]:


ind=["DBNOs","assists","boosts","damageDealt","headshotKills","heals","killPoints","killStreaks","kills","rideDistance","roadKills","swimDistance","vehicleDestroys","walkDistance",
     "weaponsAcquired","winPoints","perc_dist_ride","perc_dist_swim","perc_dist_walk",'longestKill','perc_kill_headshot','perc_kill_road','perc_kill_assist']


ind2=["DBNOs","assists","boosts","damageDealt","headshotKills","heals","killPoints","killStreaks","kills","rideDistance","roadKills","swimDistance","vehicleDestroys","walkDistance",
     "weaponsAcquired","winPoints","perc_dist_ride","killPlace","revives","teamKills","numGroups","maxPlace","perc_dist_swim","perc_dist_walk",'longestKill','perc_kill_headshot','perc_kill_road','perc_kill_assist']

match=["killPlace","matchId","revives","teamKills","numGroups","maxPlace"]


# Per-column mean and standard deviation. The test statistics are still
# recorded (useful for comparing the two distributions) but are NOT used for
# scaling: both frames are standardised with the TRAIN statistics, so a given
# raw value maps to the same z-score in train and test, which is what the
# model fitted on train expects.
dict_means_train={}
dict_std_train={}
dict_means_test={}
dict_std_test={}
for i in ind2:
    print(i)
    # Work in float64 and treat +/-inf as missing: the stored columns may use
    # a narrow float type in which the sums behind mean/std are not reliable,
    # and a single infinite value would turn the mean into inf.
    train_col=train2[i].astype(np.float64).replace([np.inf,-np.inf],np.nan)
    test_col=test2[i].astype(np.float64).replace([np.inf,-np.inf],np.nan)

    dict_means_train[i]=train_col.mean()
    dict_std_train[i]=train_col.std()
    dict_means_test[i]=test_col.mean()
    dict_std_test[i]=test_col.std()

    # A constant column has zero spread; dividing by 1 leaves it at 0 after
    # centring instead of producing NaN/inf. The comparison is also False for
    # a NaN standard deviation.
    scale=dict_std_train[i] if dict_std_train[i]>0 else 1.0
    train2[i]=((train_col-dict_means_train[i])/scale).astype(np.float32)
    test2[i]=((test_col-dict_means_train[i])/scale).astype(np.float32)
train2.head()
    

In [None]:
train2=train2.fillna(0)
test2=test2.fillna(0)

Now we will find correlation and eliminate correlation>0.95

In [None]:
train_corr_matrix=train2.corr()
test_corr_matrix=test2.corr()

In [None]:
# Keep only the strict upper triangle of the correlation matrix so that every
# pair of columns is inspected once and the diagonal is ignored. The built-in
# bool is used for the mask because the np.bool alias no longer exists in
# current NumPy releases.
upper=train_corr_matrix.where(np.triu(np.ones(train_corr_matrix.shape),k=1).astype(bool))
# A column is dropped when it correlates above 0.95 with any column that
# precedes it, so the first column of each highly correlated pair survives.
to_drop=[column for column in upper.columns if any(upper[column]>0.95)]
print("features before correlation check in train {}".format(len(train2.columns)))
train2=train2.drop(to_drop,axis=1)
print("features after correlation check in train{}".format(len(train2.columns)))

In [None]:
test2=test2.drop(to_drop,axis=1)

**Now we delete rows with values >3 or <-3 from the train data only**

In [None]:
train3=train2.copy()
train3.head()

In [None]:
#train2=train3.copy()

In [None]:
index1=[]
index2=[]
ind2=["DBNOs","assists","boosts","damageDealt","headshotKills","heals","killPoints","killStreaks","kills","rideDistance","roadKills","swimDistance","vehicleDestroys","walkDistance",
     "weaponsAcquired","winPoints","perc_dist_ride","killPlace","revives","teamKills","maxPlace","perc_dist_swim","perc_dist_walk",'longestKill','perc_kill_headshot','perc_kill_road','perc_kill_assist']

# The correlation filter removes a data-dependent set of columns, so only
# test the listed columns that are still present and report the rest.
outlier_cols=[c for c in ind2 if c in train2.columns]
skipped_cols=[c for c in ind2 if c not in train2.columns]
if skipped_cols:
    print("Skipped (no longer in train2): {}".format(skipped_cols))

# Remove every row with a z-score above 3 in any of the columns. A single
# combined mask gives the same rows as dropping column by column, without
# copying the whole frame once per column.
train2=train2.loc[~(train2[outlier_cols]>3).any(axis=1)]

print("Completed for >3")


In [None]:
# Lower tail: remove rows whose z-score is below -3. The comparison must be
# `< -3`; testing `> 3` here would repeat the upper-tail filter and leave the
# lower tail untouched.
for i in ['killPoints','winPoints']:
    print(i)
    train2=train2.drop(train2[train2[i]<-3].index,axis=0)
print('completed for <-3')

In [None]:
print('outliers detected and erased : {}'.format(train3.shape[0]-train2.shape[0]))

In [None]:
# Hold out 20% of the rows for validation and fit on the remaining 80%.
# The validation rows must not be part of the fitting data, otherwise the
# early-stopping metric is measured on rows the model has already seen.
# random_state makes the split reproducible across runs.
train_val_x = train2.sample(frac=0.2, random_state=0)
train_fit = train2.drop(train_val_x.index)

train_y=train_fit.winPlacePerc
train_x=train_fit.drop(['winPlacePerc'],axis=1)
# train_val_x deliberately still contains winPlacePerc at this point: a later
# cell reads the target from it and then drops the column.
train_val_y = train_val_x.winPlacePerc

In [None]:
# Hyper-parameters for the LightGBM regressor.
# 'class_weight' is intentionally absent: it is a classification option and
# has no meaning for a continuous target.
# NOTE(review): per the LightGBM parameter docs, 'subsample' (bagging_fraction)
# only takes effect when 'subsample_freq' is set to a non-zero value — confirm
# whether row subsampling was intended.
params = {'boosting_type': 'gbdt',
          'colsample_bytree': 1,
          'learning_rate': 0.028,
          'min_child_samples': 10,
          'num_leaves': 36,
          'reg_alpha': 0.76,
          'reg_lambda': 0.43,
          'subsample_for_bin': 40000,
          'subsample': 0.54}

model = lgb.LGBMRegressor(**params,n_jobs = -1, n_estimators = 200,random_state = 0)

In [None]:
train_val_y=train_val_x.winPlacePerc
train_val_x=train_val_x.drop(['winPlacePerc'],axis=1)

In [None]:
# Fit on (train_x, train_y) while evaluating on the (train_val_x, train_val_y)
# pair, which is reported under the name 'eval_sets'.
# NOTE(review): passing early_stopping_rounds / verbose directly to fit() is
# only accepted by older LightGBM releases; newer ones expect callbacks
# instead — confirm against the installed version.
model.fit(train_x, train_y, early_stopping_rounds = 100,eval_set = [ (train_val_x,train_val_y)],eval_names = ['eval_sets'],verbose = 200)

In [None]:
test2=test2.drop(['winPlacePerc'],axis=1)


In [None]:
predictions=model.predict(test2)

In [None]:
submission = import_data('../input/sample_submission_V2.csv')
# Assign by position (.values) rather than by index label: label alignment
# would silently insert NaN ids if the two indexes ever differed, whereas a
# positional assignment fails loudly on a length mismatch.
submission['Id']=id_holder_test.values
# The notebook treats winPlacePerc as a value between 0 and 1 (see the bucket
# bounds), but a regressor's raw output is unbounded, so clip it to that range.
submission['winPlacePerc']=np.clip(predictions,0,1)

In [None]:
submission.to_csv('submission_lgbm_fe.csv', index=False)