In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the entries in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Utilities from kaggle kernels
# Instead of data = pd.read_csv("../input/train_V2.csv")
# We use : data = read_fast("../input/train_V2.csv")
import random
import time

def reduce_mem_usage_func(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Memory saving function credit to
    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    Signed and unsigned integer columns are narrowed within their own family
    (int8..int64 / uint8..uint64) to the smallest type whose inclusive range
    contains the column's min and max. Float columns become float16 or
    float32 when their min and max lie strictly inside that type's finite
    range, and float64 otherwise. Columns of any other dtype (object, bool,
    datetime, category, pandas extension dtypes, ...) are left untouched.

    Note: float16 keeps only about three significant decimal digits, so
    downcast float columns can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first, keyed by NumPy dtype kind.
    integer_families = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are downcast. Everything
        # else (bool, datetime, strings, extension dtypes) would either be
        # silently turned into floats or fail on the min/max comparison.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Inclusive bounds: a column whose max is exactly 127 fits int8.
            # An empty column has NaN min/max, matches nothing and is kept.
            for candidate in integer_families[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in narrow_floats:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

def get_sampled_data(filename, sample_percent):
    """Read a random ``sample_percent`` percent of the data rows of a CSV file.

    The file is scanned once to count its lines, a random set of data-row
    numbers is chosen to skip, and the remaining rows are parsed with
    ``pd.read_csv``. The header (line 0) is never skipped.

    Parameters
    ----------
    filename : str
        Path of the CSV file; its first line is treated as the header.
    sample_percent : int or float
        Percentage (0-100) of data rows to keep.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in file order.

    Raises
    ------
    ValueError
        If ``sample_percent`` is outside the range 0-100.
    """
    if not 0 <= sample_percent <= 100:
        raise ValueError('sample_percent must be between 0 and 100, got %s' % sample_percent)

    # Count records with a context manager so the handle is always closed.
    with open(filename) as handle:
        n = sum(1 for _ in handle) - 1  # number of records in file (excludes header)
    print('total records in dataset: %s' % n)

    sample_size = int((sample_percent * n) / 100)
    print('will select %s percent (%s) sample records randomly' % (sample_percent, sample_size))

    # Row numbers start at 1, so the 0-indexed header is never in the skip list.
    skip = sorted(random.sample(range(1, n + 1), n - sample_size))
    return pd.read_csv(filename, skiprows=skip)


def read_fast(filename, sample=True, sample_percent=20, reduce_mem_usage=True):
    """Load a CSV into a DataFrame and print how long the load took.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    sample : bool, default True
        When true, read only a random subset of rows via ``get_sampled_data``;
        otherwise read the whole file with ``pd.read_csv``.
    sample_percent : int, default 20
        Percentage of rows to keep when ``sample`` is true.
    reduce_mem_usage : bool, default True
        When true, pass the frame through ``reduce_mem_usage_func``.

    Returns
    -------
    pandas.DataFrame
        The loaded (and optionally sampled / downcast) frame.
    """
    began = time.time()

    if sample:
        frame = get_sampled_data(filename, sample_percent)
    else:
        frame = pd.read_csv(filename)

    if reduce_mem_usage:
        frame = reduce_mem_usage_func(frame)

    # Break the whole-second duration into HH:MM:SS parts.
    total_seconds = int(time.time() - began)
    hours, leftover = divmod(total_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    print('Time to get data frame: {:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds))
    return frame


def get_datasets(input_path, num, sample_percent=20, sample=True):
    """Return a list of DataFrames loaded from ``input_path`` with ``read_fast``.

    With ``sample`` true the list holds ``num`` independently drawn samples of
    ``sample_percent`` percent each; with ``sample`` false it holds a single
    frame containing the whole file and ``num`` is not used.
    """
    if not sample:
        return [read_fast(input_path, sample = False)]

    collected = []
    for position in range(num):
        # Progress messages are numbered from 1.
        print('generating sampled dataset for num : %s' % (position + 1))
        collected.append(read_fast(input_path, sample_percent=sample_percent))
    return collected
    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
% matplotlib inline
import  plotly.plotly as py

In [None]:
# Seed Python's RNG so the 30 % training sample is the same on every run.
random.seed(42)
data = read_fast("../input/train_V2.csv",sample_percent=30 )
data.head(10)

In [None]:
# Load the complete test file (sample=False disables the random row sampling).
test_data = read_fast("../input/test_V2.csv", sample = False)

Feature Engineering 

In [None]:
def _widen(series, floor):
    """Return ``series`` promoted to at least dtype ``floor`` so narrow int8/float16 arithmetic cannot overflow."""
    return series.astype(np.result_type(series.dtype, floor))


def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 wherever the denominator is 0."""
    top = _widen(numerator, np.float32)
    bottom = _widen(denominator, np.float32)
    # Only zero denominators are replaced; NaN coming from missing inputs
    # is left as NaN so it is still visible to later missing-value checks.
    return (top / bottom).where(bottom != 0, 0.0)


def dostuff(data):
    """Add the engineered feature columns to ``data`` in place and return it.

    Columns added:

    * ratio features (``headshotrate``, ``killStreakrate``,
      ``killPlace_over_maxPlace``, ``headshotKills_over_kills``,
      ``distance_over_weapons``, ``walkDistance_over_heals``,
      ``walkDistance_over_kills``, ``killsPerWalkDistance``), computed in at
      least float32 and set to 0 where the denominator is 0 instead of
      producing inf/NaN;
    * sum features (``healthitems``, ``totalDistance``, ``kill_skill``),
      computed in at least int32/float32 so inputs that were downcast to
      int8/float16 cannot overflow;
    * ``killsWithoutMoving``: True where ``kills > 0`` and
      ``totalDistance == 0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns kills, headshotKills, killStreaks, heals,
        boosts, rideDistance, walkDistance, swimDistance, killPlace,
        maxPlace, weaponsAcquired and roadKills.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new columns attached.
    """
    # Note: headshotrate is kills per headshot kill, i.e. the inverse of
    # headshotKills_over_kills below.
    data['headshotrate'] = _safe_ratio(data['kills'], data['headshotKills'])
    data['killStreakrate'] = _safe_ratio(data['killStreaks'], data['kills'])
    data['healthitems'] = _widen(data['heals'], np.int32) + _widen(data['boosts'], np.int32)
    data['totalDistance'] = (_widen(data['rideDistance'], np.float32)
                             + _widen(data["walkDistance"], np.float32)
                             + _widen(data["swimDistance"], np.float32))
    data['killPlace_over_maxPlace'] = _safe_ratio(data['killPlace'], data['maxPlace'])
    data['headshotKills_over_kills'] = _safe_ratio(data['headshotKills'], data['kills'])
    data['distance_over_weapons'] = _safe_ratio(data['totalDistance'], data['weaponsAcquired'])
    data['walkDistance_over_heals'] = _safe_ratio(data['walkDistance'], data['heals'])
    data['walkDistance_over_kills'] = _safe_ratio(data['walkDistance'], data['kills'])
    data['killsPerWalkDistance'] = _safe_ratio(data['kills'], data['walkDistance'])
    data["kill_skill"] = _widen(data["headshotKills"], np.int32) + _widen(data["roadKills"], np.int32)
    data['killsWithoutMoving'] = ((data['kills'] > 0) & (data['totalDistance'] == 0))
    return data

# Add the engineered columns to both the training and the test frame.
data= dostuff(data)
test_data = dostuff(test_data)

In [None]:
#test_data['headshotrate'] = test_data['kills']/test_data['headshotKills']
#test_data['killStreakrate'] = test_data['killStreaks']/test_data['kills']
#test_data['healthitems'] = test_data['heals'] + test_data['boosts']
#test_data['totalDistance'] = test_data['rideDistance'] + test_data["walkDistance"] + test_data["swimDistance"]
#test_data['killPlace_over_maxPlace'] = test_data['killPlace'] / test_data['maxPlace']
#test_data['headshotKills_over_kills'] = test_data['headshotKills'] / test_data['kills']
#test_data['distance_over_weapons'] = test_data['totalDistance'] / test_data['weaponsAcquired']
#test_data['walkDistance_over_heals'] = test_data['walkDistance'] / test_data['heals']
#test_data['walkDistance_over_kills'] = test_data['walkDistance'] / test_data['kills']
#test_data['killsPerWalkDistance'] = test_data['kills'] / test_data['walkDistance']
#test_data["kill_skill"] = test_data["headshotKills"] + test_data["roadKills"]

**Killing without moving**
Check whether any players registered kills without moving at all (a total distance of zero).


In [None]:

#test_data['killsWithoutMoving'] = ((test_data['kills'] > 0) & (test_data['totalDistance'] == 0))

**Outlier Detection**


In [None]:
# Remove outliers: a row is dropped when it matches ANY of the rules below.
# The rules are combined into one boolean mask and applied in a single pass,
# instead of five in-place drops on the same frame.
outlier_rows = (
    (data['killsWithoutMoving'] == True)   # kills recorded with zero total distance
    | (data['roadKills'] > 10)             # more than 10 road kills
    | (data['kills'] > 30)                 # more than 30 kills
    | (data['longestKill'] >= 1000)        # longest kill of 1000 or more
    | (data['heals'] >= 40)                # 40 or more heals
)
data = data[~outlier_rows]

In [None]:
# Correlation heatmap over the numeric columns of the training frame.
# (`data1` is never defined in this notebook; the frame is called `data`.)
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(data.select_dtypes(include=[np.number]).corr(), cmap ='RdBu', ax=ax)

In [None]:
# Absolute correlation of every numeric column with the target, strongest first.
# Restricting to numeric columns keeps corr() from failing on the string
# columns that are still present in `data` at this point.
correlations = data.select_dtypes(include=[np.number]).corr().abs()
correlations = correlations["winPlacePerc"].sort_values(ascending=False)
# Entries 1..5 of the ranking; entry 0 is skipped (presumably the target itself).
features = correlations.index[1:6]
correlations.head(10)

In [None]:
# 'winPlacePerc' correlation matrix: heatmap of the k numeric columns with the
# largest absolute correlation to the target.
# (`data1` is never defined in this notebook; the frame is called `data`.)
corrmat = data.select_dtypes(include=[np.number]).corr().abs()
k = 10
cols = corrmat.nlargest(k , 'winPlacePerc')['winPlacePerc'].index
# DataFrame.corr skips missing values pairwise, whereas np.corrcoef returns
# NaN for every column that contains a single NaN.
cm = data[cols].corr().values
fig, ax = plt.subplots(figsize=(10,10))
hm = sns.heatmap(cm ,annot=True, cmap = "RdBu",cbar = True,square = True,
                 yticklabels = cols.values, xticklabels = cols.values, ax = ax)

**POSITIVE CORRELATION:**
If an increase in feature A leads to an increase in feature B, the two are positively correlated. A value of 1 means perfect positive correlation.

**NEGATIVE CORRELATION:**
If an increase in feature A leads to a decrease in feature B, the two are negatively correlated. A value of -1 means perfect negative correlation.

Working with numeric features

In [None]:
# Keep only the numeric columns and show the dtype of each one.
numeric_features = data.select_dtypes(include=[np.number])
numeric_features.dtypes

In [None]:
# Rank the numeric columns by their correlation with the target once,
# then print the five highest and the five lowest entries.
corr = numeric_features.corr()
ranked = corr['winPlacePerc'].sort_values(ascending=False)

print (ranked[:5], '\n')
print (ranked[-5:])

In [None]:
# Missing Data: per-column count and share of null entries, largest first.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending = False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending = False)
missing_data = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
missing_data.head(20)

In [None]:
# Select the non-numeric columns and summarise them.
categoricals = data.select_dtypes(exclude=[np.number])
categoricals.describe()

In [None]:
#data1.drop(2744604, inplace =True)
#data2.drop(2744604, inplace =True)

In [None]:
#data1 = data1.fillna(data1.mean())
# Drop every row that has a missing value in any column.
data= data.dropna()

In [None]:
#data = data.fillna(data.mean())
#data2 = data2.dropna()

In [None]:
#data = pd.concat([data1, data2], ignore_index=True)


In [None]:
# Remove the groupId, matchId and matchType columns from both frames.
unused_columns = ['groupId', 'matchId', 'matchType']
data = data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [None]:
# Target vector, and feature matrix with the target and the 'Id' column removed.
y = data.winPlacePerc
X = data.drop(['winPlacePerc', 'Id'], axis=1)

In [None]:
# Clamp the target into [0, 1]. clip() returns a new Series, so this does not
# write through to `data` and avoids pandas' chained-assignment warning.
y = y.clip(lower=0, upper=1)

In [None]:
# Show the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

def identify_zero_importance_features(X, y, iterations = 2):
    """
    Identify zero importance features in a training dataset based on the
    feature importances from a gradient boosting model.

    Parameters
    --------
    X : dataframe
        Training features

    y : array-like
        Target values for the training data

    iterations : integer, default = 2
        Number of random train/validation splits to fit; the feature
        importances are averaged over these fits

    Returns
    --------
    zero_features : list
        Names of the features whose averaged importance is exactly 0.0

    feature_importances : dataframe
        Columns 'feature' and 'importance', sorted by importance (descending)
    """

    # Initialize an empty array to hold feature importances
    feature_importances = np.zeros(X.shape[1])

    # Create the model. `class_weight` is deliberately not set: it is a
    # classification option and would reweight samples by their target value.
    model = lgb.LGBMRegressor(objective='regression', boosting_type = 'goss',
                               n_estimators =6000)

    # Fit on several different splits so the importances do not depend on one split
    for i in range(iterations):

        # Split into training and validation set
        train_features, valid_features, train_y, valid_y = train_test_split(X, y,
                                                                            test_size = 0.25,
                                                                            random_state = i)

        # Train using early stopping. The callback form is used because the
        # `early_stopping_rounds` fit argument was removed in LightGBM 4.
        model.fit(train_features, train_y,
                  eval_set = [(valid_features, valid_y)],
                  callbacks = [lgb.early_stopping(100)])

        # Record the feature importances
        feature_importances += model.feature_importances_ / iterations

    feature_importances = pd.DataFrame({'feature': list(X.columns),
                            'importance': feature_importances}).sort_values('importance',
                                                                            ascending = False)

    # Find the features with zero importance
    zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
    print('\nThere are %d features with 0.0 importance' % len(zero_features))

    return zero_features, feature_importances

# Average the feature importances over two train/validation splits and print the results.
zero_features, feature_importances = identify_zero_importance_features(X, y, iterations = 2)
print('zero_features:',zero_features)
print('feature_importances : ', feature_importances)

In [None]:
# Summary statistics of the averaged importance values.
feature_importances.describe()

In [None]:
# 25th percentile of the importance values; used below as the cut-off for dropping features.
pp =np.percentile(feature_importances['importance'], 25) 
print(pp)

In [None]:
# Discard every feature whose averaged importance is at or below the cut-off `pp`.
is_low_importance = feature_importances['importance'] <= pp
to_drop = feature_importances.loc[is_low_importance, 'feature']
X = X.drop(columns = to_drop)

In [None]:
# Show the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from lightgbm import LGBMRegressor

# Final LightGBM regressor, evaluated on the held-out split.
gbm = LGBMRegressor(objective='regression',
                              num_leaves=40,
                              learning_rate=0.05,
                              n_estimators=20000,
                              max_bin=55,
                              bagging_fraction=0.7,
                              bagging_freq=9,
                              feature_fraction=0.7,
                              feature_fraction_seed=9,
                              bagging_seed=10,
                              min_data_in_leaf=7,
                              min_sum_hessian_in_leaf=5)
# 'rmsle' is not a LightGBM metric name, so 'rmse' is requested explicitly.
# Early stopping uses the callback form because the `early_stopping_rounds`
# fit argument was removed in LightGBM 4.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100)])
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# r2_score is the coefficient of determination, not an accuracy.
print('The R^2 score of the lgbm Regressor is',r2_score(y_test,y_pred))
# mean_squared_error returns the MSE; take the square root to report RMSE.
print ('RMSE is: \n', np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
# Build the test feature matrix with exactly the training columns, in the
# same order, then predict with the best iteration found during training.
feats = test_data.drop(columns=['Id'])[X_train.columns]
final_preds = gbm.predict(feats, num_iteration=gbm.best_iteration_)

In [None]:
# Assemble the two-column submission frame and write it without the index.
submission = pd.DataFrame({
    'Id': test_data.Id,
    'winPlacePerc': final_preds,
})
submission.to_csv('submission1.csv', index=False)

In [None]:
# Preview the first 20 rows of the submission.
submission.head(20)