<a href="https://colab.research.google.com/github/sarmientoj24/EE298/blob/master/PUBG_FINAL-FeatureEnggv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import random
random.seed(42)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
!chmod 600 '/content/drive/My Drive/pubg/train/train_V2.csv'
INPUT_DIR = '/content/drive/My Drive/pubg'

# Specific imports
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Helper Functions
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns (signed or unsigned) are cast to the narrowest of
    int8 / int16 / int32 / int64 whose range contains the column's min and
    max; the bounds are inclusive, so a column peaking at exactly 127 still
    fits int8.  Float columns are cast to float16, float32 or float64 by the
    same range test -- note that the narrower float types keep the value
    range but carry less precision.  Columns of any other dtype (object,
    bool, datetime, pandas extension dtypes) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow call chaining.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, string, ...) are not
        # numpy dtypes and cannot be range-tested below; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits (e.g. huge uint64 values): keep the dtype.
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # float64 is the fallback; it is also what an all-NaN or
            # infinite column ends up with because every comparison fails.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    return df

def reload():
  """Read the training CSV and return it with unusable matches removed.

  The file ``INPUT_DIR + '/train/train_V2.csv'`` is read and passed through
  ``reduce_mem_usage``.  Rows are then filtered in two steps:

  1. keep only rows whose ``maxPlace`` is greater than 1;
  2. drop every row belonging to a match that contains at least one row
     with a missing ``winPlacePerc``.

  Returns
  -------
  pandas.DataFrame
      The filtered training frame.
  """
  print("Building dataframe...")
  gc.collect()
  df = reduce_mem_usage(pd.read_csv(INPUT_DIR + '/train/train_V2.csv'))

  # Only take the samples from matches with more than one placement slot.
  df = df[df['maxPlace'] > 1]

  # A missing target poisons the whole match, so remove the match entirely.
  invalid_match_ids = df.loc[df['winPlacePerc'].isna(), 'matchId'].unique()
  # `~` is the boolean-mask negation operator; unary minus only works on
  # boolean Series through a pandas special case.
  df = df[~df['matchId'].isin(invalid_match_ids)]
  print("Done loading train to dataframe...")
  return df

def train_test_split(df, test_size=0.1, random_state=None):
  """Split ``df`` into train/test parts without splitting any match.

  Whole matches (unique ``matchId`` values) are sampled, so all rows of a
  match land on the same side of the split.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a ``matchId`` column.
  test_size : float, default 0.1
      Fraction of matches held out; the train side receives
      ``int(n_matches * (1 - test_size))`` matches.
  random_state : int or None, default None
      When ``None`` the module-level ``random`` generator is used (the
      original behavior).  When given, a private ``random.Random`` seeded
      with this value is used so the split is reproducible.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      ``(train, test)`` row subsets of ``df``.
  """
  match_ids = df['matchId'].unique().tolist()
  train_size = int(len(match_ids) * (1 - test_size))

  rng = random if random_state is None else random.Random(random_state)
  train_match_ids = rng.sample(match_ids, train_size)

  # Build the membership mask once and negate it with `~` for the test side.
  in_train = df['matchId'].isin(train_match_ids)
  return df[in_train], df[~in_train]
  
# Split train to train and eval set
def generate_train_test_set(df, split):
  """Split ``df`` by match and return the model columns of each part.

  The identifier columns ``Id``, ``groupId``, ``matchId`` and the
  ``matchType`` column are excluded from both returned frames.  The input
  frame itself is not modified, and a frame that has already lost
  ``matchType`` is accepted.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing a ``matchId`` column.
  split : float
      Hold-out fraction forwarded to ``train_test_split``.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      ``(train, val)`` restricted to the remaining columns.
  """
  print("Generating train and test set...")

  # matchType is filtered out here by name instead of being dropped in place,
  # so the caller's frame stays intact and a missing column is not an error.
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  train, val = train_test_split(df, split)

  return train[cols_to_fit], val[cols_to_fit]

def generate_train_set(df):
  """Return ``df`` without the identifier and ``matchType`` columns.

  ``Id``, ``groupId``, ``matchId`` and ``matchType`` are excluded; every
  other column is kept in its original order.  The input frame is not
  modified, and a frame without a ``matchType`` column is accepted.

  Parameters
  ----------
  df : pandas.DataFrame

  Returns
  -------
  pandas.DataFrame
      Column subset of ``df``.
  """
  print("Generating train set...")

  # Select by name rather than dropping in place: no side effect on the
  # caller's frame and no KeyError when matchType is already gone.
  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]

  return df[cols_to_fit]

def load_test():
  """Read the test CSV and return only its model columns.

  The file ``INPUT_DIR + '/test/test_V2.csv'`` is read and passed through
  ``reduce_mem_usage``; the columns ``Id``, ``groupId``, ``matchId`` and
  ``matchType`` are then excluded from the returned frame.

  Returns
  -------
  pandas.DataFrame
      Test rows without the identifier / match-type columns.
  """
  print("Building dataframe...")
  df = reduce_mem_usage(pd.read_csv(INPUT_DIR + '/test/test_V2.csv'))

  cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
  cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
  # The message previously said "train" although this loads the test file.
  print("Done loading test to dataframe...")
  return df[cols_to_fit]

def transform_preds(df_test, pred):
  """Snap raw predictions onto the placement grid implied by ``maxPlace``.

  For each row, with ``maxPlace`` truncated to an integer:

  * ``maxPlace == 0``  -> 0.0
  * ``maxPlace == 1``  -> 1.0
  * otherwise          -> the prediction rounded to the nearest multiple of
    ``1 / (maxPlace - 1)``

  and the result is finally clipped to ``[0, 1]``.  The computation is done
  on whole arrays instead of row by row.

  Parameters
  ----------
  df_test : pandas.DataFrame
      Frame with a ``maxPlace`` column; it receives a new
      ``winPlacePerc_mod`` column (assigned positionally).
  pred : array-like of float
      One prediction per row of ``df_test``.  As before, a numpy array or
      list passed here is overwritten with the adjusted values.

  Returns
  -------
  pandas.DataFrame
      ``df_test`` with the ``winPlacePerc_mod`` column added.
  """
  values = np.asarray(pred, dtype=np.float64)
  max_place = df_test['maxPlace'].to_numpy().astype(np.int64)

  is_zero = max_place == 0
  is_one = max_place == 1

  # Use a dummy step count of 1 for the two special cases so the division
  # below never hits zero; their results are overwritten afterwards.
  steps = np.where(is_zero | is_one, 1, max_place - 1)
  gap = 1.0 / steps

  adjusted = np.round(values / gap) * gap
  adjusted = np.where(is_zero, 0.0, adjusted)
  adjusted = np.where(is_one, 1.0, adjusted)
  adjusted = np.clip(adjusted, 0.0, 1.0)

  # Keep the historical side effect of updating the caller's container.
  if isinstance(pred, np.ndarray):
      pred[...] = adjusted
  elif isinstance(pred, list):
      pred[:] = adjusted.tolist()

  df_test['winPlacePerc_mod'] = adjusted
  return df_test


def run_experiment(preprocess):
    """Load the training data, apply ``preprocess`` and score it with LightGBM.

    Note: this name is defined again further down in the notebook; when both
    cells are run, the later definition is the one in effect.

    Parameters
    ----------
    preprocess : callable
        Takes the loaded frame (without ``matchType``) and returns the frame
        to train on.

    Returns
    -------
    The value returned by ``run_lgb2`` for the preprocessed frame.
    """
    print("Start running experiment for {}".format(preprocess.__name__))

    # Non in-place drop: the frame returned by reload() is a filtered subset,
    # and mutating such a subset in place triggers pandas copy warnings.
    df = reload().drop(columns=['matchType'])

    df = preprocess(df)

    score = run_lgb2(df)

    # Release the large frame before the next experiment is loaded.
    del df
    gc.collect()
    return score

def run_experiments(preprocesses):
    """Score every preprocess function and tabulate the outcomes.

    Each callable in ``preprocesses`` is passed to ``run_experiment`` and
    timed.  The result is a DataFrame with the columns ``name``, ``score``
    and ``execution time`` (seconds, rounded to two decimals, with an ``s``
    suffix), sorted by ascending ``score``.
    """
    columns = ['name', 'score', 'execution time']
    rows = []
    for candidate in preprocesses:
        started_at = time.time()
        score = run_experiment(candidate)
        elapsed = time.time() - started_at
        row = (candidate.__name__, score, f'{round(elapsed, 2)}s')
        rows.append(dict(zip(columns, row)))
        gc.collect()

    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values(by='score')


def save_for_submission(df, path):
  """Write the ``Id`` and ``winPlacePerc`` columns of ``df`` as a submission CSV.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding at least the ``Id`` and ``winPlacePerc`` columns.
  path : str
      Prefix the file name ``submission.csv`` is appended to verbatim, so a
      directory must be given with its trailing separator.
  """
  # Use the frame that was passed in; the previous body read a global
  # ``df_test`` and ignored its own argument.
  submission = df[['Id', 'winPlacePerc']]
  submission.to_csv(path + 'submission.csv', index=False)

In [0]:
# Feature Selectors
import lightgbm as lgb

# Feature Selectors
def run_lightgbmreg(df):
  """Fit a small LightGBM regressor and report its hold-out MAE.

  ``df`` is split by match (10% held out) with ``train_test_split``; the
  model is trained on every column except the identifiers, ``matchType`` and
  the ``winPlacePerc`` target.  The feature-importance table is printed and
  the mean absolute error on the held-out part is returned.
  """
  print("LightGBM: Start Light Gradient Boosted Regression...")

  target = 'winPlacePerc'
  excluded = ['Id', 'groupId', 'matchId', 'matchType', target]
  feature_cols = []
  for name in df.columns:
    if name not in excluded:
      feature_cols.append(name)

  train_part, val_part = train_test_split(df, 0.1)

  model = LGBMRegressor(
      n_estimators=100,
      learning_rate=0.3,
      num_leaves=20,
      objective='regression_l2',
      metric='mae',
      verbose=-1,
  )
  model.fit(
      train_part[feature_cols],
      train_part[target],
      eval_metric='mae',
      verbose=20,
  )

  predictions = model.predict(val_part[feature_cols])

  print("LightGBM: Selecting features")
  # Highest importance first; ties fall back to the feature name.
  ranked = sorted(zip(model.feature_importances_, feature_cols), reverse=True)
  print(pd.DataFrame(ranked, columns=['Value','Feature']))

  return mean_absolute_error(val_part[target], predictions)

def run_lgb2(df):
  """Fit the larger LightGBM configuration and report its hold-out MAE.

  Same procedure as ``run_lightgbmreg`` with a different parameter set:
  the frame is split by match (10% held out), the model is trained on all
  columns except the identifiers, ``matchType`` and ``winPlacePerc``, the
  feature-importance table is printed and the hold-out mean absolute error
  is returned.
  """
  print("LightGBM: Start Light Gradient Boosted Regression...")

  target = 'winPlacePerc'
  excluded = ['Id', 'groupId', 'matchId', 'matchType', target]
  feature_cols = []
  for name in df.columns:
    if name not in excluded:
      feature_cols.append(name)

  train_part, val_part = train_test_split(df, 0.1)

  # NOTE(review): LightGBM documents bagging_fraction as taking effect only
  # together with a non-zero bagging_freq, which is not set here -- confirm
  # whether row subsampling was intended.
  model = LGBMRegressor(
      objective="regression",
      metric="mae",
      n_estimators=200,
      num_leaves=30,
      learning_rate=0.3,
      bagging_fraction=0.7,
      bagging_seed=0,
      num_threads=4,
      colsample_bytree=0.7,
  )
  model.fit(
      train_part[feature_cols],
      train_part[target],
      eval_metric='mae',
      verbose=20,
  )

  predictions = model.predict(val_part[feature_cols])

  print("LightGBM: Selecting features")
  # Highest importance first; ties fall back to the feature name.
  ranked = sorted(zip(model.feature_importances_, feature_cols), reverse=True)
  print(pd.DataFrame(ranked, columns=['Value','Feature']))

  return mean_absolute_error(val_part[target], predictions)

from xgboost import XGBRegressor
def run_xgboost(df):
  """Fit an XGBoost regressor on a match-wise split and return the model.

  ``df`` is split by match (10% held out); the held-out part is passed as
  the evaluation set.  Training uses every column except the identifiers,
  ``matchType`` and the ``winPlacePerc`` target.  The feature-importance
  table is printed and the fitted model (not a score) is returned.
  """
  print("XGBoost: Start XGBoost Regression...")

  target = 'winPlacePerc'
  excluded = ['Id', 'groupId', 'matchId', 'matchType', target]
  feature_cols = []
  for name in df.columns:
    if name not in excluded:
      feature_cols.append(name)

  train_part, val_part = train_test_split(df, 0.1)

  # NOTE(review): 'num_leaves', 'metric' and 'verbose' look like LightGBM
  # parameter names -- confirm that XGBoost honors them.
  model = XGBRegressor(
      n_estimators=40,
      learning_rate=0.1,
      num_leaves=20,
      objective='binary:logistic',
      metric='mae',
      verbose=20,
      seed=42,
  )
  holdout = (val_part[feature_cols], val_part[target])
  model.fit(
      train_part[feature_cols],
      train_part[target],
      eval_set=[holdout],
      eval_metric='mae',
      verbose=20,
  )

  print("XGBoost: Selecting features")
  # Highest importance first; ties fall back to the feature name.
  ranked = sorted(zip(model.feature_importances_, feature_cols), reverse=True)
  print(pd.DataFrame(ranked, columns=['Value','Feature']))
  return model

In [0]:
# Feature Engineering
# Helper Functions

def get_playersJoined(df):
  """Add ``playersJoined``: how many rows share each row's ``matchId``.

  The column is written onto ``df`` itself, which is also returned.
  """
  per_match = df.groupby('matchId')['matchId']
  df['playersJoined'] = per_match.transform('count')
  return df

def get_killsNorm(df):
  """Add ``killsNorm``: kills scaled by how far the match is from 100 players.

  ``killsNorm = kills * ((100 - playersJoined) / 100 + 1)``.  When the
  ``playersJoined`` column is missing it is first added with
  ``get_playersJoined``.  The column is written onto the frame, which is
  returned.
  """
  if 'playersJoined' not in df.columns:
    # The previous body called add_players_join, a name that is not defined
    # in this notebook; get_playersJoined is the helper that adds the column.
    df = get_playersJoined(df)
  df['killsNorm'] = df['kills']*((100-df['playersJoined'])/100 + 1)
  return df

def get_damageDealtNorm(df):
  """Add ``damageDealtNorm``: damage scaled by how far the match is from 100 players.

  ``damageDealtNorm = damageDealt * ((100 - playersJoined) / 100 + 1)``.
  When the ``playersJoined`` column is missing it is first added with
  ``get_playersJoined``.  The column is written onto the frame, which is
  returned.
  """
  if 'playersJoined' not in df.columns:
    # The previous body called add_players_join, a name that is not defined
    # in this notebook; get_playersJoined is the helper that adds the column.
    df = get_playersJoined(df)
  df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
  return df

def get_healsAndBoosts(df):
  """Add ``healsAndBoosts`` as the row-wise sum of ``heals`` and ``boosts``.

  The column is written onto ``df`` itself, which is also returned.
  """
  combined = df['heals'].add(df['boosts'])
  df['healsAndBoosts'] = combined
  return df

def get_totalDistance(df):
  """Add ``totalDistance`` = ``walkDistance`` + ``rideDistance`` + ``swimDistance``.

  The column is written onto ``df`` itself, which is also returned.
  """
  on_foot_and_vehicle = df['walkDistance'].add(df['rideDistance'])
  df['totalDistance'] = on_foot_and_vehicle.add(df['swimDistance'])
  return df

def get_team(df):
  """Add ``team``: a team-size category derived from ``numGroups``.

  * ``numGroups > 50``        -> 1
  * ``25 < numGroups <= 50``  -> 2
  * otherwise                 -> 4

  The previous expression ``i>25 & i<=50`` was parsed as
  ``i > (25 & i) <= 50`` because ``&`` binds tighter than comparisons, so
  many values of 25 or fewer groups were labelled 2 instead of 4.

  The column is written onto ``df`` itself, which is also returned.
  """
  num_groups = df['numGroups'].to_numpy()
  df['team'] = np.where(num_groups > 50, 1, np.where(num_groups > 25, 2, 4))
  return df

def get_players_in_team(df):
    """Attach ``players_in_team``: the number of rows sharing each ``groupId``.

    Returns a new frame produced by a left merge of ``df`` with the
    per-group row counts.
    """
    team_sizes = df.groupby(['groupId']).size()
    agg = team_sizes.to_frame('players_in_team')
    return pd.merge(df, agg, how='left', on=['groupId'])

def get_headshotKills_over_kills(df):
  """Add ``headshotKills_over_kills`` = ``headshotKills / kills``.

  Undefined ratios (NaN, e.g. 0 / 0) are stored as 0.  The column is
  written onto ``df`` itself, which is also returned.
  """
  ratio = df['headshotKills'] / df['kills']
  # Assign the cleaned Series explicitly: calling fillna(inplace=True) on
  # df[col] works on a temporary under pandas Copy-on-Write and would leave
  # the frame unchanged.
  df['headshotKills_over_kills'] = ratio.fillna(0)
  return df

def get_killPlace_over_maxPlace(df):
    """Add ``killPlace_over_maxPlace`` = ``killPlace / maxPlace``.

    Undefined ratios (NaN) and positive infinities from a zero denominator
    are stored as 0.  The column is written onto ``df`` itself, which is
    also returned.
    """
    ratio = df['killPlace'] / df['maxPlace']
    # Assign the cleaned Series explicitly: in-place fillna/replace on
    # df[col] works on a temporary under pandas Copy-on-Write and would
    # leave the frame unchanged.
    df['killPlace_over_maxPlace'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def get_walkDistance_over_heals(df):
    """Add ``walkDistance_over_heals`` = ``walkDistance / heals``.

    Undefined ratios (NaN) and positive infinities from a zero denominator
    are stored as 0.  The column is written onto ``df`` itself, which is
    also returned.
    """
    ratio = df['walkDistance'] / df['heals']
    # Assign the cleaned Series explicitly: in-place fillna/replace on
    # df[col] works on a temporary under pandas Copy-on-Write and would
    # leave the frame unchanged.
    df['walkDistance_over_heals'] = ratio.fillna(0).replace(np.inf, 0)
    return df
  
def get_walkDistance_over_boosts(df):
    """Add ``walkDistance_over_boosts`` = ``walkDistance / boosts``.

    Undefined ratios (NaN) and positive infinities from a zero denominator
    are stored as 0.  The column is written onto ``df`` itself, which is
    also returned.
    """
    ratio = df['walkDistance'] / df['boosts']
    # Assign the cleaned Series explicitly: in-place fillna/replace on
    # df[col] works on a temporary under pandas Copy-on-Write and would
    # leave the frame unchanged.
    df['walkDistance_over_boosts'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def get_walkDistance_over_kills(df):
    """Add ``walkDistance_over_kills`` = ``walkDistance / kills``.

    Undefined ratios (NaN) and positive infinities from a zero denominator
    are stored as 0.  The column is written onto ``df`` itself, which is
    also returned.
    """
    ratio = df['walkDistance'] / df['kills']
    # Assign the cleaned Series explicitly: in-place fillna/replace on
    # df[col] works on a temporary under pandas Copy-on-Write and would
    # leave the frame unchanged.
    df['walkDistance_over_kills'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def get_teamwork(df):
    """Add ``teamwork`` as the row-wise sum of ``assists`` and ``revives``.

    The column is written onto ``df`` itself, which is also returned.
    """
    support_actions = df['assists'].add(df['revives'])
    df['teamwork'] = support_actions
    return df

# Team-level aggregates: compute the min/max/sum/median/mean (or the in-match rank of the mean) of every feature per team, then merge the result back onto each player row as new, suffixed columns.
def _team_feature_columns(df):
    """List the columns of ``df`` that are aggregated per team."""
    cols_to_drop = ['Id', 'groupId', 'matchId', 'winPlacePerc']
    return [col for col in df.columns if col not in cols_to_drop]


def _merge_team_aggregate(df, agg, suffix):
    """Left-merge a per-(matchId, groupId) table onto ``df``, suffixing its columns."""
    return df.merge(agg, suffixes=('', suffix), how='left', on=['matchId', 'groupId'])


def _add_team_aggregate(df, how, suffix):
    """Aggregate every feature per team with groupby method ``how`` and merge it back."""
    features = _team_feature_columns(df)
    grouped = df.groupby(['matchId', 'groupId'])[features]
    agg = getattr(grouped, how)()
    return _merge_team_aggregate(df, agg, suffix)


def add_min_by_team(df):
    """Return ``df`` with a ``<feature>_min`` column holding each team's minimum.

    Teams are the ``(matchId, groupId)`` groups; features are all columns
    except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc``.
    """
    return _add_team_aggregate(df, 'min', '_min')


def add_max_by_team(df):
    """Return ``df`` with a ``<feature>_max`` column holding each team's maximum.

    Teams are the ``(matchId, groupId)`` groups; features are all columns
    except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc``.
    """
    return _add_team_aggregate(df, 'max', '_max')


def add_sum_by_team(df):
    """Return ``df`` with a ``<feature>_sum`` column holding each team's total.

    Teams are the ``(matchId, groupId)`` groups; features are all columns
    except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc``.
    """
    return _add_team_aggregate(df, 'sum', '_sum')


def add_median_by_team(df):
    """Return ``df`` with a ``<feature>_median`` column holding each team's median.

    Teams are the ``(matchId, groupId)`` groups; features are all columns
    except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc``.
    """
    return _add_team_aggregate(df, 'median', '_median')


def add_mean_by_team(df):
    """Return ``df`` with a ``<feature>_mean`` column holding each team's mean.

    Teams are the ``(matchId, groupId)`` groups; features are all columns
    except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc``.
    """
    return _add_team_aggregate(df, 'mean', '_mean')


def add_rank_by_team(df):
    """Return ``df`` with ``<feature>_mean_rank`` columns.

    Each feature is averaged per ``(matchId, groupId)`` team; the team means
    are then converted to percentile ranks among the teams of the same match
    and merged back onto every row.
    """
    features = _team_feature_columns(df)
    agg = df.groupby(['matchId', 'groupId'])[features].mean()
    # Rank the team means against the other teams of the same match.
    agg = agg.groupby('matchId')[features].rank(pct=True)
    return _merge_team_aggregate(df, agg, '_mean_rank')

In [0]:
# Feature combos
# Here, you could mix-in different features by just creating your own function
# Your goal is to create a perfect mix of features here by
# 1. Create own feature column (above)
# 2. Test collection of features on a standard feature selection by run_experiments
# 3. Investigate which feature groups have high results on #2
# 4. Check among #3 which are great contributor features and select them
# 5. Run #2 again using features obtained from #4
# 6. Do until you get better results. 
def original(df):
  """Baseline preprocess step: hand the frame back untouched."""
  unchanged = df
  return unchanged

def get_these_cols(df, remain):
  """Return the sub-frame of ``df`` made of the columns listed in ``remain``."""
  selected = df[remain]
  return selected

def remove_these_cols(df, remove_cols):
  """Return ``df`` without the columns named in ``remove_cols``.

  Names in ``remove_cols`` that are not columns of ``df`` are ignored.
  """
  kept = []
  for name in df.columns:
    if name in remove_cols:
      continue
    kept.append(name)
  return df[kept]
  
def put_everything_and_median(df):
  """Apply every engineered feature, then add the per-team medians.

  Runs the same steps as ``put_everything`` followed by
  ``add_median_by_team``.
  """
  df = put_everything(df)
  df = add_median_by_team(df)
  return df

def just_median(df):
  """Add only the per-team median columns (``add_median_by_team``)."""
  df = add_median_by_team(df)
  return df

def put_everything(df):
  """Apply every single-row / count feature helper, in a fixed order.

  The helpers used are the ones defined in the feature-engineering cell:
  ``get_playersJoined`` and ``get_team`` replace the previously referenced
  ``add_players_join`` and ``get_team_category``, which are not defined in
  this notebook.
  """
  df = get_playersJoined(df)
  df = get_killsNorm(df)
  df = get_damageDealtNorm(df)
  df = get_healsAndBoosts(df)
  df = get_totalDistance(df)
  df = get_team(df)
  df = get_players_in_team(df)
  df = get_headshotKills_over_kills(df)
  df = get_killPlace_over_maxPlace(df)
  df = get_walkDistance_over_heals(df)
  df = get_walkDistance_over_boosts(df)
  df = get_walkDistance_over_kills(df)
  df = get_teamwork(df)
  return df

def put_everything_and_rank(df):
  """Apply every engineered feature, then add the in-match team-mean ranks.

  Runs the same steps as ``put_everything`` followed by
  ``add_rank_by_team``.
  """
  df = put_everything(df)
  df = add_rank_by_team(df)
  return df


# Finding good combinations below
# Highest ones from put_everything
def new_combination_1(df):
  """Build a hand-picked feature set and keep only the listed columns.

  Nine feature helpers are applied in sequence, after which the frame is
  reduced to the identifier columns, the target and the selected raw and
  engineered features.
  """
  steps = (
      get_killPlace_over_maxPlace,
      get_totalDistance,
      get_playersJoined,
      get_players_in_team,
      get_walkDistance_over_kills,
      get_killsNorm,
      get_walkDistance_over_boosts,
      get_healsAndBoosts,
      get_damageDealtNorm,
  )
  for step in steps:
    df = step(df)

  remaining_cols = [
      'Id', 'groupId', 'matchId', 'winPlacePerc',
      'killPlace', 'killPlace_over_maxPlace', 'walkDistance', 'totalDistance',
      'playersJoined', 'players_in_team', 'numGroups', 'maxPlace',
      'walkDistance_over_kills', 'matchDuration', 'killsNorm',
      'walkDistance_over_boosts', 'weaponsAcquired', 'kills', 'boosts',
      'DBNOs', 'longestKill', 'damageDealt', 'rideDistance', 'healsAndBoosts',
      'killPoints', 'winPoints', 'rankPoints', 'killStreaks', 'damageDealtNorm',
  ]
  return get_these_cols(df, remaining_cols)

def new_combination_2(df):
  '''Same feature helpers as combination 1, plus in-match team-mean ranks.

  After the rank columns are added, a fixed list of columns is removed
  (names that are not present are ignored).

  Feature importances recorded for this combination:

        Value                             Feature
  0     506                           killPlace
  1     421             totalDistance_mean_rank
  2     378                 killPlace_mean_rank
  3     377              walkDistance_mean_rank
  4     293   killPlace_over_maxPlace_mean_rank
  5     257                     kills_mean_rank
  6     254             killPlace_over_maxPlace
  7     224                        walkDistance
  8     197               killStreaks_mean_rank
  9     188   walkDistance_over_kills_mean_rank
  10    161                    boosts_mean_rank
  11    146                 killsNorm_mean_rank
  12    142           players_in_team_mean_rank
  13    136                       playersJoined
  14    135                           killsNorm
  15    132                     players_in_team
  16    114                       totalDistance
  17    108           weaponsAcquired_mean_rank
  18    107                       matchDuration
  19     90                            maxPlace
  20     84                           numGroups
  '''
  steps = (
      get_killPlace_over_maxPlace,
      get_totalDistance,
      get_playersJoined,
      get_players_in_team,
      get_walkDistance_over_kills,
      get_killsNorm,
      get_walkDistance_over_boosts,
      get_healsAndBoosts,
      get_damageDealtNorm,
      add_rank_by_team,
  )
  for step in steps:
    df = step(df)

  to_remove = [
      'assists', 'heals', 'walkDistance_over_heals', 'swimDistance',
      'teamwork', 'teamKills', 'revives', 'headshotKills_over_kills',
      'headshotKills', 'roadKills', 'team', 'vehicleDestroys',
  ]
  return remove_these_cols(df, to_remove)
  
  
  
  

In [6]:
def run_experiment(preprocess):
    """Load the training data, apply ``preprocess`` and score it with LightGBM.

    Parameters
    ----------
    preprocess : callable
        Takes the loaded frame (without ``matchType``) and returns the frame
        to train on.

    Returns
    -------
    The value returned by ``run_lgb2`` for the preprocessed frame.
    """
    print("Start running experiment for {}".format(preprocess.__name__))

    # Non in-place drop: the frame returned by reload() is a filtered subset,
    # and mutating such a subset in place triggers pandas copy warnings.
    df = reload().drop(columns=['matchType'])

    df = preprocess(df)

    score = run_lgb2(df)

    # Release the large frame before the next experiment is loaded.
    del df
    gc.collect()
    return score

def run_experiments(preprocesses):
    """Score every preprocess function and tabulate the outcomes.

    Each callable in ``preprocesses`` is passed to ``run_experiment`` and
    timed.  The result is a DataFrame with the columns ``name``, ``score``
    and ``execution time`` (seconds, rounded to two decimals, with an ``s``
    suffix), sorted by ascending ``score``.
    """
    columns = ['name', 'score', 'execution time']
    rows = []
    for candidate in preprocesses:
        started_at = time.time()
        score = run_experiment(candidate)
        elapsed = time.time() - started_at
        row = (candidate.__name__, score, f'{round(elapsed, 2)}s')
        rows.append(dict(zip(columns, row)))
        gc.collect()

    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values(by='score')

# Feature-selection experiment: list the preprocess pipelines to score here.
experiment_scores = run_experiments([put_everything_and_rank])

# Show the resulting score table.
print(experiment_scores)


Start running experiment for new_combination_2
Building dataframe...
Done loading train to dataframe...
LightGBM: Start Light Gradient Boosted Regression...
LightGBM: Selecting features
    Value                             Feature
0     506                           killPlace
1     421             totalDistance_mean_rank
2     378                 killPlace_mean_rank
3     377              walkDistance_mean_rank
4     293   killPlace_over_maxPlace_mean_rank
5     257                     kills_mean_rank
6     254             killPlace_over_maxPlace
7     224                        walkDistance
8     197               killStreaks_mean_rank
9     188   walkDistance_over_kills_mean_rank
10    161                    boosts_mean_rank
11    146                 killsNorm_mean_rank
12    142           players_in_team_mean_rank
13    136                       playersJoined
14    135                           killsNorm
15    132                     players_in_team
16    114                       