# Import

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read Data

In [None]:
# Reduce the usage of memory
# Ref: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    '''
    Downcast the numeric columns of a dataframe to the smallest dtype that
    can hold their observed value range, in order to reduce memory usage.

    Signed integers, unsigned integers and floats are each narrowed within
    their own dtype family. Columns of any other dtype (object/string, bool,
    datetime, pandas extension dtypes, ...) are left untouched, and a column
    is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrowed column dtypes.

    Notes
    -----
    Floats may be narrowed to float16, which only guarantees the value
    *range* fits; precision is reduced accordingly.
    '''
    # dtype kind -> (limits lookup, candidate dtypes ordered narrow -> wide)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, (np.float16, np.float32, np.float64)),
    }

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (categorical, nullable, string) are not numpy
        # dtypes; bool/object/datetime kinds have nothing to downcast.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in candidates_by_kind:
            continue

        limits, candidates = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            info = limits(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if c_min >= info.min and c_max <= info.max:
                if np.dtype(candidate) != col_type:
                    df[col] = df[col].astype(candidate)
                break
        # No candidate fits only for empty / all-NaN / infinite columns
        # (every comparison is False); those keep their current dtype.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
develop_mode = False
# Develop mode caps the training read at 5000 rows; ``nrows=None`` reads the
# whole file. The test set is always read in full.
df_train = reduce_mem_usage(
    pd.read_csv('../input/train_V2.csv', nrows=5000 if develop_mode else None)
)
df_test = reduce_mem_usage(pd.read_csv('../input/test_V2.csv'))

In [None]:
# Report (rows, columns) for both frames.
print('The sizes of the datasets are:')
for label, frame in (('Training Dataset: ', df_train), ('Testing Dataset: ', df_test)):
    print(label, frame.shape)

In [None]:
# Preview the first 10 rows of the training data
df_train.head(10)

# Exploratory Data Analysis

The dataset contains many features, each describing a different type of in-game event for a player.

In [None]:
def visualize(col_name, num_bin=10, df=None):
    '''
    Plot a histogram of one column on a log-scaled count axis and print the
    column's minimum and maximum value.

    Parameters
    ----------
    col_name : str
        Name of the column to plot.
    num_bin : int, default 10
        Number of histogram bins.
    df : pandas.DataFrame, optional
        Frame holding the column. Defaults to the global ``df_train``.
    '''
    data = df_train if df is None else df
    # Capitalise only the first letter so camelCase names stay readable.
    title_name = col_name[:1].upper() + col_name[1:]

    _, ax = plt.subplots()
    ax.set_yscale('log')
    data.hist(column=col_name, ax=ax, bins=num_bin)
    # Label after drawing so the title replaces the one set by ``hist``.
    ax.set_xlabel(title_name)
    ax.set_ylabel('log Count')
    ax.set_title('Histogram of ' + title_name)

    # Series.min/max skip missing values, just as value_counts() drops them,
    # without building and sorting the full table of distinct values.
    values = data[col_name]
    print('Min value of ' + title_name + ' is: ', values.min())
    print('Max value of ' + title_name + ' is: ', values.max())

### Player Group Analysis

Records with the same matchId come from the same match. Within a match, records with the same groupId belong to players on the same team in duo or squad mode.

In [None]:
# Number of records per groupId within one example match, largest group first.
group_tmp = (
    df_train.loc[df_train['matchId'] == 'df014fbee741c6', 'groupId']
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of group sizes for the selected match.
plt.figure()
plt.bar(group_tmp.index, group_tmp.values)
plt.title('Number of Group Members in One Match')
plt.xlabel('GroupId')
plt.ylabel('Count')
plt.xticks(rotation=90)  # group ids are long strings; rotate to keep them legible
plt.show()

# Smallest and largest group in that match.
print('Min number of group members is: ', group_tmp.values.min())
print('Max number of group members is: ', group_tmp.values.max())

### Assists Analysis

An assist is credited when a player deals damage to an enemy who is then killed by another player, i.e. the player contributed to the kill but did not land the final blow. Each such kill increases the player's assist count by one.

In [None]:
# Log-count histogram of the `assists` column (default 10 bins).
visualize('assists')

### RoadKills Analysis

RoadKills is the number of enemies the player killed while in a vehicle.

In [None]:
# Log-count histogram of the `roadKills` column (default 10 bins).
visualize('roadKills')

### KillStreaks Analysis

KillStreaks is the count of killing enemies continuously in a small range of time.

In [None]:
# Log-count histogram of the `killStreaks` column (default 10 bins).
visualize('killStreaks')

### TeamKills Analysis

TeamKills indicates the count of a player killing teammates.

In [None]:
# Log-count histogram of the `teamKills` column (default 10 bins).
visualize('teamKills')

### LongestKill Analysis

LongestKill is the longest distance between a player and an enemy killed by that player.

In [None]:
# Log-count histogram of the `longestKill` column, using 100 bins.
visualize('longestKill',num_bin=100)

### Weapons Acquired Analysis

Many types of weapons exist in this game. Players need to search for them before they can use them in battle. Weapons Acquired indicates the number of weapons each player acquired per match.

In [None]:
# Log-count histogram of the `weaponsAcquired` column, using 30 bins.
visualize('weaponsAcquired',num_bin=30)

### HeadShot Kill Analysis

If a player kills an enemy by shooting them in the head, the kill is counted as a HeadShot Kill. Head shots deal more damage than shots to other parts of the enemy's body.

In [None]:
# Log-count histogram of the `headshotKills` column, using 30 bins.
visualize('headshotKills',num_bin=30)

### DBNO Analysis

DBNO indicates the number of enemies that the player knocks down. Knocking down an enemy means dealing enough damage that their health points drop to zero. After that, the enemy can only crawl and cannot fight unless a teammate revives them. If the player keeps damaging a knocked-down enemy until their health points reach zero again, the enemy is killed and eliminated.

In [None]:
# Log-count histogram of the `DBNOs` column, using 50 bins.
visualize('DBNOs',num_bin=50)

### Boost and Heal Analysis

Boost indicates the number of times of using boosting items such as energy drink, which will help increase the moving speed and recover the health point slowly. Heal indicates using healing items such as medical bandage, which will recover the health point instantly after waiting for casting time for several seconds.

In [None]:
# Log-count histogram of the `boosts` column, using 30 bins.
visualize('boosts',num_bin=30)

In [None]:
# Log-count histogram of the `heals` column, using 80 bins.
visualize('heals',num_bin=80)

### DamageDealt Analysis

DamageDealt indicates the total amount of damage that the player deals. Self-inflicted damage (such as throwing a burning bottle and accidentally getting hurt by it) is not included.

In [None]:
# Log-count histogram of the `damageDealt` column, using 1000 bins.
visualize('damageDealt',num_bin=1000)

### Revives Analysis

Revives is the number of times the player revived a teammate. If a teammate is knocked down but not killed, the player can revive that teammate, and the revive count increases by one.

In [None]:
# Log-count histogram of the `revives` column, using 40 bins.
visualize('revives',num_bin=40)

### Distance Analysis

There are three types of distance value in the given dataset. WalkDistance is the total travelling distance on foot measured in meters. RideDistance is the total travelling distance on vehicles measured in meters. SwimDistance is the total travelling distance by swimming measured in meters.

In [None]:
# Log-count histogram of the `walkDistance` column, using 260 bins.
visualize('walkDistance',num_bin=260)

In [None]:
# Log-count histogram of the `rideDistance` column, using 400 bins.
visualize('rideDistance',num_bin=400)

In [None]:
# Log-count histogram of the `swimDistance` column, using 100 bins.
visualize('swimDistance',num_bin=100)

### VehicleDestroys Analysis

VehicleDestroys is the number of vehicles that the player destroys. In this game, players can destroy vehicles by shooting them or throwing bombs at them.

In [None]:
# Log-count histogram of the `vehicleDestroys` column, using 5 bins.
visualize('vehicleDestroys',num_bin=5)

### Missing Value Analysis

In [None]:
def _plot_missing_share(frame, dataset_label):
    '''Bar-plot the 20 columns of ``frame`` with the highest percentage of missing values.'''
    # Mean of the null mask is the missing fraction; scale it so the plotted
    # values match the "%" in the title and the "Percentage" axis label.
    miss_percent = (frame.isnull().mean() * 100).sort_values(ascending=False)

    miss_percent.head(20).plot(kind="bar")
    plt.xlabel("Columns")
    plt.ylabel("Percentage")
    plt.title("Total Missing Value (%) in " + dataset_label)
    plt.show()


def MissValueAnalysis():
    '''
    Show, for the global ``df_train`` and then ``df_test``, a bar chart of
    the percentage of missing values per column (top 20 columns each).
    '''
    _plot_missing_share(df_train, "Training Data")
    _plot_missing_share(df_test, "Testing Data")
    
# Plot the missing-value share for the training and the testing data.
MissValueAnalysis()

It could be observed that there is missing value in training data and no missing value in testing data.

### Correlation Analysis

In [None]:
def CorrelationAnalysis():
    '''
    Draw an annotated heatmap of the pairwise correlations between the
    numeric columns of the global ``df_train``.
    '''
    # Restrict to numeric columns explicitly: recent pandas versions raise
    # instead of silently dropping non-numeric columns (e.g. the string
    # matchId) inside DataFrame.corr().
    corr = df_train.select_dtypes(include=[np.number]).corr()

    _, ax = plt.subplots(figsize=(15, 15))
    sns.heatmap(corr, cbar=True, annot=True,
                square=True, fmt='.2f',
                cmap='YlGnBu', ax=ax)
    
# Render the correlation heatmap for the training data.
CorrelationAnalysis()

From correlation heatmap, it could be observed that several features have high correlation with the label winPlacePerc. Let's look at their correlation by visualization.

walkDistance vs winPlacePerc

In [None]:
# Scatter plot of winPlacePerc against walkDistance for the training rows.
df_train.plot(x="walkDistance",y="winPlacePerc", kind="scatter", figsize=(8,6), title='walkDistance vs winPlacePerc')

heals vs winPlacePerc

In [None]:
# Scatter plot of winPlacePerc against heals for the training rows.
df_train.plot(x="heals",y="winPlacePerc", kind="scatter", figsize=(8,6), title='heals vs winPlacePerc')

In [None]:
def HealsVSwinPlacePerc():
    '''
    Draw a box plot of winPlacePerc for each distinct ``heals`` value in the
    global ``df_train``, with the y-axis fixed to [0, 1].
    '''
    _, ax = plt.subplots(figsize=(8, 6))
    # Plot `heals` (not `boosts`) so the data matches the function name and title.
    sns.boxplot(x='heals', y="winPlacePerc", data=df_train, ax=ax)
    ax.set_title('heals vs winPlacePerc box plot')
    ax.set_ylim(0, 1)
    
# Render the heals vs winPlacePerc box plot.
HealsVSwinPlacePerc()

longestKill vs winPlacePerc

In [None]:
# Scatter plot of winPlacePerc against longestKill for the training rows.
df_train.plot(x="longestKill",y="winPlacePerc", kind="scatter", figsize = (8,6), title='longestKill vs winPlacePerc')

# Feature Engineering

A player's individual behavior in a match is related to the statistics of their group and of the match as a whole. Therefore, group-level and match-level statistics are computed and added during feature engineering.

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and
# scikit-learn; consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import gc, sys

def BuildFeature(is_train=True, train_path='../input/train_V2.csv',
                 test_path='../input/test_V2.csv'):
    '''
    Build group-level and match-level features from the raw CSV.

    For every numeric feature the following columns are produced:
    ``<f>_mean`` / ``<f>_max`` / ``<f>_min`` (aggregate over the players of a
    (matchId, groupId) group), ``<f>_<stat>_rank`` (percentile rank of that
    aggregate among the groups of the same match), ``<f>_match_mean`` (mean
    over all players of the match), plus ``group_size`` and ``match_size``.

    Parameters
    ----------
    is_train : bool, default True
        Whether the train set (True) or the test set (False) is processed.
    train_path, test_path : str
        Locations of the train / test CSV files.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix. For the train set there is one row per
        (matchId, groupId) group; for the test set one row per input record,
        in file order.
    y : numpy.ndarray or None
        Mean winPlacePerc per group, aligned with the rows of ``X`` (train
        set only, otherwise None).
    feature_names : list of str
        Column names of ``X``.
    test_idx : pandas.Series or None
        The ``Id`` column of the test file (test set only, otherwise None).
    '''
    y = None
    test_idx = None
    group_keys = ['matchId', 'groupId']

    if is_train:
        print("Reading train.csv")
        df = pd.read_csv(train_path)
        # Keep only matches whose maxPlace exceeds 1.
        # NOTE(review): presumably winPlacePerc is not meaningful when a
        # match has a single placement - confirm.
        df = df[df['maxPlace'] > 1]
    else:
        print("Reading test.csv")
        df = pd.read_csv(test_path)
        test_idx = df.Id

    # Reduce the memory usage
    df = reduce_mem_usage(df)

    print("Delete Unuseful Columns")
    target = 'winPlacePerc'
    features = list(df.columns)
    for non_feature in ("Id", "matchId", "groupId", "matchType"):
        features.remove(non_feature)

    grouped = df.groupby(group_keys)

    if is_train:
        print("Read Labels")
        y = np.array(grouped[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    # Train rows are the (sorted) unique groups, which is the same order as
    # the group means in `y`; test rows stay one per input record.
    if is_train:
        df_out = grouped.size().reset_index()[group_keys]
    else:
        df_out = df[group_keys]

    agg_rank = None
    for stat in ('mean', 'max', 'min'):
        print("Read Group " + stat + " features")
        agg = grouped[features].agg(stat)
        # Percentile rank of each group's aggregate within its match.
        agg_rank = agg.groupby('matchId')[features].rank(pct=True)
        # Name the columns explicitly instead of relying on merge suffixes,
        # which only apply when column names happen to collide.
        df_out = df_out.merge(agg.add_suffix('_' + stat).reset_index(),
                              how='left', on=group_keys)
        df_out = df_out.merge(agg_rank.add_suffix('_' + stat + '_rank').reset_index(),
                              how='left', on=group_keys)

    print("Read Group size features")
    agg = grouped.size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    print("Read Match mean features")
    # Explicit suffix: the previous merge suffix never fired (no colliding
    # names), so these columns ended up with the bare feature names.
    agg = df.groupby('matchId')[features].agg('mean').add_suffix('_match_mean').reset_index()
    df_out = df_out.merge(agg, how='left', on='matchId')

    print("Read Match size features")
    agg = df.groupby('matchId').size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on='matchId')

    X = df_out.drop(columns=group_keys)
    feature_names = list(X.columns)
    # Drop the large intermediates before returning to keep peak memory down.
    del df, df_out, agg, agg_rank, grouped
    gc.collect()

    return X, y, feature_names, test_idx

In [None]:
# Build the feature matrices: the training matrix has one row per
# (matchId, groupId) group, the test matrix one row per test record.
X_train, y_train, train_columns, _ = BuildFeature(is_train=True)
X_test, _, _ , test_idx = BuildFeature(is_train=False)

In [None]:
# Downcast the engineered feature columns to smaller dtypes to save memory.
X_train =reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

# Model Building

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# The `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2, so it is no longer passed. Ordinary least squares
# predictions do not depend on feature scaling; add an explicit scaler
# step in front of the model if numerical conditioning becomes an issue.
LR_model = LinearRegression(n_jobs=4)
LR_model.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the data it was trained on (not a held-out estimate).
LR_model.score(X_train,y_train)

In [None]:
# Linear-regression predictions for the training and the test feature matrices.
y_pred_train = LR_model.predict(X_train)
y_pred_test = LR_model.predict(X_test)

Visualize Prediction Result for training set

In [None]:
# Clamp the training predictions to [0, 1] in place.
np.clip(y_pred_train, 0, 1, out=y_pred_train)

# Predicted vs. actual values on a square canvas, both axes limited to [0, 1].
f, ax = plt.subplots(figsize=(10,10))
ax.scatter(y_train, y_pred_train)
ax.set_xlabel("y")
ax.set_ylabel("y_pred_train")
ax.set_xlim([0,1])
ax.set_ylim([0,1])
plt.show()

Submit prediction for testing data

In [None]:
# Clamp the test predictions to [0, 1] in place.
np.clip(y_pred_test, 0, 1, out=y_pred_test)

In [None]:
# Attach the predictions to the test frame (adds or overwrites its
# winPlacePerc column) and write the Id / winPlacePerc pairs to disk.
df_test['winPlacePerc'] = y_pred_test
submission = df_test[['Id', 'winPlacePerc']]
submission.to_csv('submission_lr.csv', index=False)

### Gradient Boost Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# The least-squares loss is the estimator's default. Its old name 'ls' was
# removed in scikit-learn 1.2 (renamed to 'squared_error' in 1.0), so the
# argument is left out to stay valid on both old and new versions.
GBR = GradientBoostingRegressor(learning_rate=0.1,
                                n_estimators=100, max_depth=3)
GBR.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the data it was trained on (not a held-out estimate).
GBR.score(X_train,y_train)

Visualize Prediction Result

In [None]:
# Gradient-boosting predictions for the training and the test feature matrices.
y_pred_train = GBR.predict(X_train)
y_pred_test = GBR.predict(X_test)

In [None]:
# Clamp the training predictions to [0, 1] in place.
np.clip(y_pred_train, 0, 1, out=y_pred_train)

# Predicted vs. actual values on a square canvas, both axes limited to [0, 1].
f, ax = plt.subplots(figsize=(10,10))
ax.scatter(y_train, y_pred_train)
ax.set_xlabel("y")
ax.set_ylabel("y_pred_train")
ax.set_xlim([0,1])
ax.set_ylim([0,1])
plt.show()

Submit prediction for testing data

In [None]:
# Clamp the test predictions to [0, 1] before submitting; the
# gradient-boosting test predictions are not clamped anywhere earlier.
np.clip(y_pred_test, 0, 1, out=y_pred_test)

# Attach the predictions to the test frame (adds or overwrites its
# winPlacePerc column) and write the Id / winPlacePerc pairs to disk.
df_test['winPlacePerc'] = y_pred_test
submission = df_test[['Id', 'winPlacePerc']]
submission.to_csv('submission_gbr.csv', index=False)