In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import gc
# This will prompt for authorization.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def reduce_mem_usage(df):
    # iterate through all the columns of a dataframe and modify the data type
    #   to reduce memory usage.        
    
#     start_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        
        col_type = str(df[col].dtype)
        if col_type == 'object':
            col_type = 'str'
        else:
            col_type = 'np.' + col_type

#     end_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
#     print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [0]:
pubg_data = pd.read_csv('/content/drive/My Drive/DL_final_project/pubg-finish-placement-prediction/train_V2.csv')

In [0]:
pubg_data = reduce_mem_usage(pubg_data)

In [0]:
pubg_data = pubg_data.fillna(0).reset_index(drop = True)

In [6]:
pubg_data_oh = pd.concat([pubg_data,pd.get_dummies(pubg_data['matchType'])],axis = 1)
del pubg_data
gc.collect()

0

In [0]:
#pubg_data_sample = pubg_data_oh.sample(frac = 0.1)

In [0]:
def perf_features(X,train=False):
    X['headshot_rate'] = X['headshotKills'] / (X['kills'] + 0.0001)
    X['kill_streak_rate'] = X['killStreaks'] / (X['kills'] + 0.0001)
    X['kills_assists'] = X['kills'] + X['assists']
    X['heals_boosts'] = X['heals'] + X['boosts']
    X['total_distance'] = X['walkDistance'] + X['rideDistance'] + X['swimDistance']
    X['totalDistance_weaponsAcq_Ratio'] = X['total_distance'] / (X['weaponsAcquired'] + 1)
    X['walkDistance_heals_Ratio'] = X['walkDistance'] / (X['heals'] + 1)
    X['walkDistance_kills_Ratio'] = X['walkDistance'] / (X['kills'] + 0.0001)
    X['kills_walkDistance_Ratio'] = X['kills'] / (X['walkDistance'] + 0.0001)
    X['kills_assists_per_heal_boost'] = X['kills_assists'] / (X['heals_boosts'] + 1)
    X['damageDealt_per_heal_boost'] = X['damageDealt'] / (X['heals_boosts'] + 1)
    X['road_kills_per_rideDistance'] = X['roadKills'] / (X['rideDistance'] + 0.0001)
    X['maxPlace_per_numGroups'] = X['maxPlace'] /( X['numGroups'] + 1 )
    X['assists_per_kill'] = X['assists'] / (X['kills'] + X['assists'] + 0.0001)
    X['killPlace'] = X['killPlace'] - 1
    X['total_Distance_Per_Duration'] =  X["total_distance"]/(X["matchDuration"] + 0.0001)
    X['walk_Distance_Per_Duration'] =  X["walkDistance"]/(X["matchDuration"] + 0.0001)
    X['kills_Per_Duration'] =  X["kills"]/(X["matchDuration"] + 0.0001)
    X[X == np.Inf] = np.NaN
    X[X == np.NINF] = np.NaN
    X.fillna(0, inplace=True)

    data = X
    feature = list(data.columns)
    feature.remove('Id')
    feature.remove('groupId')
    feature.remove('matchId')
    feature.remove('matchType')
    if(train):
      labels = np.array(data.groupby(['matchId','groupId'])['winPlacePerc'].agg('mean'), dtype=np.float64)
      feature.remove('winPlacePerc')
    else: 
      labels = data[['Id']]
    
    print("group_max")
    agg = data.groupby(['matchId','groupId'])[feature].agg('max')
    agg_rank = agg.groupby('matchId')[feature].rank(pct=True).reset_index()
    if train: data_out = agg.reset_index()[['matchId','groupId']]
    else: data_out = data[['matchId','groupId']]
    data_out = data_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId','groupId'])
    data_out = data_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId','groupId'])
    
    print("group_mean")
    agg = data.groupby(['matchId','groupId'])[feature].agg('mean')
    agg_rank = agg.groupby('matchId')[feature].rank(pct=True).reset_index()
    data_out = data_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId','groupId'])
    data_out = data_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId','groupId'])
    
    print("group_min")
    agg = data.groupby(['matchId','groupId'])[feature].agg('min')
    agg_rank = agg.groupby('matchId')[feature].rank(pct=True).reset_index()
    data_out = data_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId','groupId'])
    data_out = data_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId','groupId'])
    
    print("match_mean")
    agg = data.groupby(['matchId'])[feature].agg('mean').reset_index()
    data_out = data_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    print("match_max")
    agg = data.groupby(['matchId'])[feature].agg('max').reset_index()
    data_out = data_out.merge(agg, suffixes=["", "_match_max"], how='left', on=['matchId'])
    
    print("match_size")
    agg = data.groupby(['matchId']).size().reset_index(name='match_size')
    data_out = data_out.merge(agg, how='left', on=['matchId'])
    

    del data,agg,agg_rank
    gc.collect()
    data_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    #data_out = reduce_size(data_out)
    X = data_out
    del data_out, feature
    gc.collect()
    return X,labels


In [9]:
#X = pubg_data_oh.drop(['Id','groupId','matchId','matchType','winPlacePerc'],axis =1)
#features = list(X.columns)
#y = pubg_data_oh['winPlacePerc'].values
X,y = perf_features(pubg_data_oh,True)
del pubg_data_oh
gc.collect()
#X_sample = pubg_data_sample.drop(['Id','groupId','matchId','matchType','winPlacePerc'],axis =1).values
#y_sample = pubg_data_sample['winPlacePerc'].values


group_max
group_mean
group_min
match_mean
match_max
match_size


0

In [0]:

# from sklearn.preprocessing import  MinMaxScaler
# mm= MinMaxScaler()
# X= mm.fit_transform(X)
# y = y

In [11]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
del X,y
gc.collect()

0

In [12]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
del reg
gc.collect()
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_pred,y_test)


0.04154452298007842

In [0]:
import lightgbm as lgb

In [14]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)
# specify your configurations as a dict
params = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 150,
    'learning_rate': 0.05,
    'num_threads':4,
    'min_split_gain':0.0002,
    'bagging_fraction': 0.5,
    "bagging_seed" : 0,
    'min_data_in_leaf':2000, 
    'verbose': 0,
    "colsample_bytree" : 0.5,
    'lamda_l2':8
}
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=[lgb_train,lgb_eval],
                verbose_eval=100,
                early_stopping_rounds=20)

del lgb_train,lgb_eval
gc.collect()

Starting training...
Training until validation scores don't improve for 20 rounds.
[100]	training's l1: 0.0323617	valid_1's l1: 0.0326145
[200]	training's l1: 0.0293886	valid_1's l1: 0.0297861
[300]	training's l1: 0.0281615	valid_1's l1: 0.0286921
[400]	training's l1: 0.0273798	valid_1's l1: 0.0280432
[500]	training's l1: 0.0268098	valid_1's l1: 0.0276064
[600]	training's l1: 0.0263686	valid_1's l1: 0.0272999
[700]	training's l1: 0.0260017	valid_1's l1: 0.0270591
[800]	training's l1: 0.0256917	valid_1's l1: 0.0268759
[900]	training's l1: 0.0254062	valid_1's l1: 0.0267094
[1000]	training's l1: 0.0251453	valid_1's l1: 0.0265654
[1100]	training's l1: 0.0249211	valid_1's l1: 0.026459
[1200]	training's l1: 0.0247006	valid_1's l1: 0.0263498
[1300]	training's l1: 0.0244982	valid_1's l1: 0.026261
[1400]	training's l1: 0.024309	valid_1's l1: 0.0261842
[1500]	training's l1: 0.024118	valid_1's l1: 0.0260983
[1600]	training's l1: 0.0239426	valid_1's l1: 0.0260285
[1700]	training's l1: 0.0237736	va

372

In [15]:
print('Starting predicting...')
# predict

y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
mean_absolute_error(y_pred,y_test)

Starting predicting...


0.025807159395492454

In [16]:
features = X_train.columns
featureImp = list(gbm.feature_importance())
featureImp, features = zip(*sorted(zip(featureImp, features)))
feature_imp_dict = {}
for i in range(len(featureImp)):
    feature_imp_dict[features[i]] =  featureImp[i]
del gbm
gc.collect()

0

In [0]:
import collections

sorted_dict = collections.OrderedDict(feature_imp_dict)

In [18]:
count_feature_0 = 0
#print(sorted_dict)
for items in list(sorted_dict.items()):
  #print(items)
  if(items[1] == 0):
    count_feature_0 = count_feature_0 + 1

print(count_feature_0)

99


In [0]:
top_half = list(sorted_dict.items())[::-1][0:int(1*len(sorted_dict.items()))]
top_half_feat = []

for feat in top_half:
  top_half_feat.append(feat[0])

#top_half_feat

In [0]:
# X_train_feat = X_train[top_half_feat]
# X_test_feat = X_test[top_half_feat]

# from sklearn.linear_model import LinearRegression
# reg = LinearRegression()
# reg.fit(X_train_feat,y_train)
# y_pred = reg.predict(X_test_feat)
# del reg
# gc.collect()
# from sklearn.metrics import mean_absolute_error
# mean_absolute_error(y_pred,y_test)

In [21]:
from keras import Sequential
from keras.layers import Dense,BatchNormalization,Activation,Dropout, LeakyReLU
def build_regressor():
    regressor = Sequential()
    
    regressor.add(Dense(units=256, input_dim=X_train.shape[1]))
    regressor.add(BatchNormalization())
    regressor.add(LeakyReLU(alpha = 0.1))
    regressor.add(Dropout(0.5))
    regressor.add(Dense(units=128))
    regressor.add(BatchNormalization())
    regressor.add(LeakyReLU(alpha = 0.1))
    regressor.add(Dropout(0.4))
    regressor.add(Dense(units=64))
    regressor.add(BatchNormalization())
    regressor.add(LeakyReLU(alpha = 0.1))
    regressor.add(Dropout(0.3))
    regressor.add(Dense(units=1))
    regressor.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae'])
    return regressor

Using TensorFlow backend.


In [0]:
from keras.wrappers.scikit_learn import KerasRegressor
regressor = KerasRegressor(build_fn=build_regressor, batch_size=64,epochs=50)

In [23]:
results=regressor.fit(X_train.values,y_train)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




Epoch 1/50





Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50

In [0]:
y_pred= regressor.predict(X_test)


In [25]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_pred,y_test)

0.07442004981755747

In [26]:
del regressor
gc.collect()

51

In [46]:
from sklearn.tree import DecisionTreeRegressor
regr = DecisionTreeRegressor(max_depth=100, random_state=0, max_features = 50, max_leaf_nodes=10)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_pred,y_test)

0.08342486858348738