# PUBG Prediction
2019.01.04 <br>
Damien Park


## 1. 라이브러리 및 데이터 불러오기(Importing library and Loading Data)

In [None]:
import numpy as np
import pandas as pd
import tqdm

import matplotlib.pyplot as plt

import keras
from keras.layers.core import Dense
from keras.layers.normalization import BatchNormalization

from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler

In [None]:
train = pd.read_csv("../input/train_V2.csv")

---

## 2. 데이터 미리보기(Exploratory Data Analysis)

In [None]:
train.head()

### 2-1. 데이터 요약(Describe)

In [None]:
train.describe().T

### 2-2. 칼럼의 이름 및 타입(Column's name and Type)

In [None]:
pd.DataFrame(train.dtypes, columns=["Type"])

### 2-3. 시각화, 히스토그램(Visualization, Histogram)

In [None]:
# Histogram of every non-object column, one panel each on a 5x5 grid
# (the grid has room for at most 25 columns).
numeric_columns = train.columns[train.dtypes != "O"]

plt.figure(figsize=(25, 25))
for slot, column in enumerate(numeric_columns, start=1):
    plt.subplot(5, 5, slot)
    # Missing values are dropped before binning.
    plt.hist(train[column].dropna(), bins=50)
    plt.title(column)

plt.show()

### 2-4. 시각화, 산점도(Visualization, Scatter plot)

In [None]:
# plt.figure(figsize=(25, 25))
# for idx, v in enumerate(train.columns[train.dtypes != "O"]):
#     plt.subplot(5, 5, idx+1)
#     plt.scatter(train["winPlacePerc"], train[v], alpha=0.5)
#     plt.title(v)

# plt.show()

In [None]:
# plt.figure(figsize=(25, 25))
# for idx, v in enumerate(train.columns[train.dtypes == "float64"][:5]):
#     plt.subplot(2, 3, idx+1)
#     for i in pd.unique(train.matchType):
#         plt.scatter(train.loc[train.matchType == i, "winPlacePerc"], train.loc[train.matchType == i, v], alpha = 0.5, label = i)
#     plt.legend()
#     plt.title(v)

# plt.show()

In [None]:
# plt.figure(figsize=(25, 25))
# for idx, v in enumerate(train.columns[train.dtypes == "int"]):
#     plt.subplot(5, 4, idx+1)
#     for i in pd.unique(train.matchType):
#         plt.scatter(train.loc[train.matchType == i, "winPlacePerc"], train.loc[train.matchType == i, v], alpha = 0.5, label = i)
#     plt.legend()
#     plt.title(v)

# plt.show()

---

## 3. 데이터 전처리(Data Preprocessing)

### 3-1. 이산형 데이터(Discrete variable) 

In [None]:
print(list(train.columns[train.dtypes == "O"]))

Id <br>
게임플레이어에 할당되는 고유 값

In [None]:
print("Number of record:", len(train), "\nNumber of Unique Id:", len(pd.unique(train.Id)))

groupId <br>
duo, squad의 경우 할당되는 그룹 고유 값, solo의 경우 groupId가 각자 할당된다. <br>
같은 groupId에서는 같은 winPlacePerc가 할당된다.

matchId <br>
각 게임에 할당되는 고유 값

In [None]:
# Report the number of distinct matches and how many of them have fewer than 9 rows.
n_small_matches = (train.groupby("matchId").size() < 9).sum()
print("Number of match: ", len(pd.unique(train.matchId)), "\nNumber of match(<9): ", n_small_matches)

보통 한 게임당 100명에서 80명 정도 참가하지만 몇몇 게임에서는 그렇지 않음

In [None]:
# Keep only the rows of matches with fewer than 9 participants, then look at
# one such match in detail.
match_sizes = train.groupby("matchId").size()
small_match_ids = match_sizes[match_sizes < 9].index
temp = train.loc[train.matchId.isin(small_match_ids), :]
temp.loc[temp.matchId == "e263f4a227313a"]

게임참가인원 정보를 새로 생성한다. 

In [None]:
# Two-column frame (matchId, player) holding the number of rows per match.
temp = train.groupby("matchId").size().reset_index(name="player")

In [None]:
# Attach the per-match player count to every row of that match.
train = train.merge(temp, on="matchId")

matchType<br>
matchType에는 총 16가지의 타입이 있다. 크게 solo, duo, squad와 같은 그룹인원에 따른 구분과 fpp, tpp와 같은 플레이 시점에 따른 구분이 있다. <br>
그룹인원에 따라서 matchType_1이라는 새로운 칼럼을 생성하고  matchType_2에는 플레이 시점에 따라서 나눈정보를 새로 생성한다. 

In [None]:
print("Type: ", pd.unique(train.matchType), "\nCount: ", len(pd.unique(train.matchType)))

In [None]:
# Classification by group size (division by number of players in a group).
# Each matchType listed under a key is mapped to that key; any matchType that
# is not listed keeps the "-" placeholder.
group_size_types = {
    "solo": ["solo-fpp", "solo", "normal-solo-fpp", "normal-solo"],
    "duo": ["duo-fpp", "duo", "normal-duo-fpp", "normal-duo"],
    "squad": ["squad-fpp", "squad", "normal-squad-fpp", "normal-squad"],
    "etc": ["flarefpp", "flaretpp", "crashfpp", "crashtpp"],
}

train["matchType_1"] = "-"
for group_label, match_types in group_size_types.items():
    train.loc[train.matchType.isin(match_types), "matchType_1"] = group_label

In [None]:
# Classification by viewpoint (first-person "fpp" vs third-person "tpp").
# Any matchType that is not listed keeps the "-" placeholder.
viewpoint_types = {
    "fpp": ["solo-fpp", "duo-fpp", "squad-fpp",
            "normal-solo-fpp", "normal-duo-fpp", "normal-squad-fpp",
            "crashfpp", "flarefpp"],
    "tpp": ["solo", "duo", "squad",
            "normal-solo", "normal-duo", "normal-squad",
            "crashtpp", "flaretpp"],
}

train["matchType_2"] = "-"
for view_label, match_types in viewpoint_types.items():
    train.loc[train.matchType.isin(match_types), "matchType_2"] = view_label

one-hot-encoding

In [None]:
# One indicator column per group-size class: 1 where matchType_1 equals the
# column's name, 0 everywhere else.
for group_label in ("solo", "duo", "squad", "etc"):
    train[group_label] = (train.matchType_1 == group_label).astype("int64")

In [None]:
# One indicator column per viewpoint: 1 where matchType_2 equals the column's
# name, 0 everywhere else.
for view_label in ("fpp", "tpp"):
    train[view_label] = (train.matchType_2 == view_label).astype("int64")

### 3-2. 연속형 데이터(Continuous variable)

In [None]:
print(list(train.columns[train.dtypes != "O"]))

In [None]:
# Numeric input columns, including the derived per-match "player" count.
# In this notebook the list is only printed in the modelling section and
# referenced by the commented-out per-match-type scaler experiments.
feature = ["assists", "boosts", "damageDealt", "DBNOs", "headshotKills", "heals", 
           "killPlace", "killPoints", "kills", "killStreaks", "longestKill", 
           "matchDuration", "maxPlace", "rankPoints", "revives", "rideDistance", 
           "roadKills", "swimDistance", "teamKills", "vehicleDestroys", "walkDistance", "weaponsAcquired", "winPoints", "player"]

In [None]:
# Per-player statistics that are min-max scaled within each match; "matchId"
# comes first because it is the grouping key for that scaling.
feature_1 = ["matchId", "assists", "boosts", "damageDealt", "DBNOs", "headshotKills", "heals", 
             "killPlace", "killPoints", "kills", "killStreaks", "longestKill", 
             "revives", "rideDistance", "roadKills", "swimDistance", "teamKills", 
             "vehicleDestroys", "walkDistance", "weaponsAcquired", "winPoints"]

In [None]:
# Match-level columns. The first four are min-max scaled over the whole frame;
# the trailing "fpp"/"tpp" indicators are 0/1 and are left as they are.
feature_2 = ["matchDuration", "maxPlace", "rankPoints", "player", "fpp", "tpp"]

Check for NA values

In [None]:
# Print the number of missing values in every non-object column.
# Series.isna().sum() does the count in vectorised code; the builtin sum()
# would walk each multi-million-row column element by element in Python.
for i in train.columns[train.dtypes != "O"]:
    print(i, ":", train[i].isna().sum())

#### killPoints, winPoints

In [None]:
# for i in pd.unique(train.matchId):
#     train.loc[(train.matchId == i) & (train.killPoints == 0), "killPoints"] = np.mean(train.loc[train.matchId == i, "killPoints"])

In [None]:
# for i in pd.unique(train.matchId):
#     train.loc[(train.matchId == i) & (train.winPoints == 0), "winPoints"] = np.mean(train.loc[train.matchId == i, "winPoints"])

#### winPlacePerc

In [None]:
np.sum(train.winPlacePerc.isna())

winPlacePerc has only one missing value, <br>
so we simply drop that row.

In [None]:
# Keep only the rows whose target value is present.
has_target = train.winPlacePerc.notna()
train = train[has_target]

---

### 3-2-2. 정규화(Normalization)

#### 방법1-1 (게임타입별로 정규화, 그룹인원기준)
Normalization by matchType, grouped by the number of players per group (solo / duo / squad / etc)

In [None]:
# solo_minmax = MinMaxScaler()
# duo_minmax = MinMaxScaler()
# squad_minmax = MinMaxScaler()
# etc_minmax = MinMaxScaler()

In [None]:
# solo_minmax.fit(train.loc[train.matchType_1 == "solo", feature])
# duo_minmax.fit(train.loc[train.matchType_1 == "duo", feature])
# squad_minmax.fit(train.loc[train.matchType_1 == "squad", feature])
# etc_minmax.fit(train.loc[train.matchType_1 == "etc", feature])

In [None]:
# solo_scale = solo_minmax.transform(train.loc[train.matchType_1 == "solo", feature])
# duo_scale = duo_minmax.transform(train.loc[train.matchType_1 == "duo", feature])
# squad_scale = squad_minmax.transform(train.loc[train.matchType_1 == "squad", feature])
# etc_scale = etc_minmax.transform(train.loc[train.matchType_1 == "etc", feature])

In [None]:
# solo_scale = pd.DataFrame(solo_scale, columns=feature)
# duo_scale = pd.DataFrame(duo_scale, columns=feature)
# squad_scale = pd.DataFrame(squad_scale, columns=feature)
# etc_scale = pd.DataFrame(etc_scale, columns=feature)

In [None]:
# _ = train.loc[train.matchType_1 == "solo", ["matchId", "matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc"]]
# _ = _.reset_index()
# solo_scale = pd.concat([solo_scale, _], axis=1)

In [None]:
# _ = train.loc[train.matchType_1 == "duo", ["matchId", "matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc"]]
# _ = _.reset_index()
# duo_scale = pd.concat([duo_scale, _], axis=1)

In [None]:
# _ = train.loc[train.matchType_1 == "squad", ["matchId", "matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc"]]
# _ = _.reset_index()
# squad_scale = pd.concat([squad_scale, _], axis=1)

In [None]:
# _ = train.loc[train.matchType_1 == "etc", ["matchId", "matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc"]]
# _ = _.reset_index()
# etc_scale = pd.concat([etc_scale, _], axis=1)

In [None]:
# X = pd.concat([solo_scale, duo_scale, squad_scale, etc_scale])

#### 방법1-2 (게임타입별로 정규화, 게임시점기준)

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for t in ["tpp", "fpp"]:
#         train.loc[train.matchType_2 == t, i] = (train.loc[train.matchType_2 == t, i] - np.min(train.loc[train.matchType_2 == t, i])) / (np.max(train.loc[train.matchType_2 == t, i]) - np.min(train.loc[train.matchType_2 == t, i]))

#### 방법3-1 (게임 별로 정규화)
Normalization by matchId

In [None]:
# Use the player Id as the row index; set_index already names the index "Id".
train = train.set_index("Id")

In [None]:
# Split the columns by how they are normalised next:
#   temp_1 - matchId plus the per-player statistics (scaled within each match)
#   temp_2 - match-level columns and the fpp/tpp indicators
temp_1 = train.loc[:, feature_1]
temp_2 = train.loc[:, feature_2]

In [None]:
def minmax(attr):
    """Rescale a pandas Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    attr : pandas.Series
        Numeric values to rescale (e.g. one column of one match when used
        with ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        ``(attr - min) / (max - min)`` on the same index. When every value is
        identical the range is zero, so a Series of ``0.0`` is returned
        instead of dividing by zero.
    """
    # Series.min/max run in vectorised code and skip NaN; the builtin
    # min()/max() iterate element by element and give order-dependent results
    # as soon as a NaN is present.
    lo = attr.min()
    hi = attr.max()
    span = hi - lo
    if span == 0:
        # Return an index-aligned Series rather than a bare scalar so the
        # result always has the same shape as the input.
        return pd.Series(0.0, index=attr.index, name=attr.name)
    return (attr - lo) / span

In [None]:
# Scale every per-player statistic to [0, 1] within its own match.
# transform() returns a new frame and leaves temp_1 untouched, so the result
# must be kept. matchId is the grouping key (not part of the transform output)
# and is re-attached unchanged so temp_1 keeps the column layout of feature_1.
scaled_stats = temp_1.groupby("matchId").transform(minmax)
temp_1 = pd.concat([temp_1[["matchId"]], scaled_stats], axis=1)

# Work on an explicit copy so the column assignments below never target a
# slice of the original frame.
temp_2 = temp_2.copy()
# The first four columns of temp_2 are scaled over the whole frame; the
# remaining fpp/tpp indicator columns are left as they are.
for i in temp_2.columns[:4]:
    lo, hi = temp_2[i].min(), temp_2[i].max()
    temp_2[i] = (temp_2[i] - lo) / (hi - lo)

In [None]:
# Re-assemble the model frame: both feature blocks plus the group-size label
# and the target, all matched on the "Id" index level.
labels_and_target = train[["matchType_1", "winPlacePerc"]]
X = temp_1.merge(temp_2, on="Id").merge(labels_and_target, on="Id")

In [None]:
X.reset_index()

#### 방법3-2 (게임 별로 정규화, robust)

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for idx, g in enumerate(pd.unique(train.matchId)):
#         train.loc[train.matchId == g, i] = robust_scale(train.loc[train.matchId == g, i])

#### 방법3-3 (게임 별로 정규화, minmax)

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for idx, g in enumerate(pd.unique(train.matchId)):
#         train.loc[train.matchId == g, i] = minmax_scale(train.loc[train.matchId == g, i])

---

## 5. 가설

### 게임 타입별로 데이터 분포가 다를까? 

In [None]:
# plt.figure(figsize=(20, 20))
# plt.suptitle("Assists distribution by matchType", fontsize = 20)

# for idx, v in enumerate(pd.unique(train.matchType)):
#     plt.subplot(4, 4, idx+1)
#     plt.hist(train[train.matchType == v]["assists"], density=True)
#     plt.title(v)
    
# plt.show()

In [None]:
# plt.figure(figsize=(20, 20))
# plt.suptitle("DamageDealt distribution by matchType", fontsize = 20)

# for idx, v in enumerate(pd.unique(train.matchType)):
#     plt.subplot(4, 4, idx+1)
#     plt.hist(train[train.matchType == v]["damageDealt"], density=True)
#     plt.title(v)
    
# plt.show()

In [None]:
# plt.figure(figsize=(25, 25))
# plt.suptitle("Continuous variables distribution by matchType(fpp-tpp)", fontsize = 20)

# for idx, v in enumerate(train.columns[train.dtypes != "O"]):
#     plt.subplot(6, 5, idx+1)
#     plt.hist(train[train.matchType_2 == "fpp"][v].dropna(), color = "red", alpha = 0.5, label = "fpp", density = True, cumulative = True)
#     plt.hist(train[train.matchType_2 == "tpp"][v].dropna(), color = "grey", alpha = 0.8, label = "tpp", density = True, cumulative = True)
#     plt.legend()
#     plt.title(v)
    
# plt.show()

In [None]:
# plt.figure(figsize=(25, 25))
# plt.suptitle("Continuous variables distribution by matchType(solo-duo-squad)", fontsize = 20)

# for idx, v in enumerate(train.columns[train.dtypes != "O"]):
#     plt.subplot(6, 5, idx+1)
#     plt.hist(train[train.matchType_1 == "solo"][v].dropna(), color = "red", alpha = 0.5, label = "solo", density = True)
#     plt.hist(train[train.matchType_1 == "duo"][v].dropna(), color = "grey", alpha = 0.8, label = "duo", density = True)
#     plt.hist(train[train.matchType_1 == "squad"][v].dropna(), color = "yellow", alpha = 0.2, label = "squad", density = True)
#     plt.legend()
#     plt.title(v)
    
# plt.show()

게임 타입별로 데이터 분포가 다르지 않다.

### 같은 그룹은 같은 winPlacePerc를 부여받나?

### winPlacePerc와 가장 큰 선형 관계가 있는건 무엇일까? 

In [None]:
#train.loc[train.killPoints == 0, "killPoints"] = np.mean(train.loc[train.killPoints != 0, "killPoints"])
#train.loc[train.winPoints == 0, "winPoints"] = np.mean(train.loc[train.winPoints != 0, "winPoints"])

In [None]:
#train = train[train.killPoints != 0]
#train = train[train.winPoints != 0]

In [None]:
# Pairwise correlation of the numeric columns. The object columns (ids and
# match-type labels) are excluded explicitly: recent pandas versions raise on
# non-numeric columns instead of silently dropping them.
train.select_dtypes(include=[np.number]).corr()

In [None]:
# Correlation matrix of the numeric columns only (object columns would make
# DataFrame.corr raise on recent pandas versions).
corre = train.select_dtypes(include=[np.number]).corr()
# Show the correlation with winPlacePerc, keeping a value only where its
# magnitude exceeds 0.35; weaker correlations are displayed as NaN.
target_corr = corre["winPlacePerc"]
pd.DataFrame(data = target_corr.where(target_corr.abs() > 0.35).rename("Correlation"))

boosts, damageDealt, heals, killPlace, kills, killStreaks, longestKill, walkDistance, weaponsAcquired 이 winPlacePerc과 선형관계가 있음을 알 수 있다.

---

## 6. 예측(Prediction)

### 6-1. 모델생성(Model Generation)

게임 인원별로 다른 모델을 만들어보자 <br>
Build a separate model for each group size (solo / duo / squad / etc)

In [None]:
print("Name: ", feature, "\nCount: ", len(feature))

In [None]:
# Input columns fed to every model below; its length sets input_dim of the
# first Dense layer and it selects the columns of x_train and of the test rows.
list_feat = ["assists", "boosts", "damageDealt", "DBNOs", "headshotKills", "heals", 
             "killPlace", "killPoints", "kills", "killStreaks", "longestKill", 
             "matchDuration", "maxPlace", "rankPoints", "revives", "rideDistance", 
             "roadKills", "swimDistance", "teamKills", "vehicleDestroys", "walkDistance", 
             "weaponsAcquired", "winPoints", "player", "fpp", "tpp"]

In [None]:
# Same columns as list_feat plus "matchId". No active code in this notebook
# reads this list; the only reference to matchId during training is in the
# commented-out per-match training loops.
list_feat_1 = ["assists", "boosts", "damageDealt", "DBNOs", "headshotKills", "heals", 
               "killPlace", "killPoints", "kills", "killStreaks", "longestKill", 
               "matchDuration", "maxPlace", "rankPoints", "revives", "rideDistance", 
               "roadKills", "swimDistance", "teamKills", "vehicleDestroys", "walkDistance", 
               "weaponsAcquired", "winPoints", "player", "fpp", "tpp", "matchId"]

In [None]:
# From here on `train` refers to the frame X assembled in the normalisation
# step (the same object, not a copy); the raw training frame is no longer bound
# to this name.
train = X

In [None]:
# Model 1 (solo)
# Fully connected network: the hidden layers widen from 32 to 256 units and
# narrow back to 32, with a Dropout(0.25) after each stage; a single sigmoid
# unit produces the output. Trained with RMSprop on mean absolute error.
hidden = dict(activation="elu", kernel_initializer="he_normal")

model_1 = keras.models.Sequential([
    Dense(32, input_dim=len(list_feat), **hidden),
    Dense(64, **hidden),
    Dense(128, **hidden),
    keras.layers.Dropout(0.25),

    Dense(256, **hidden),
    Dense(256, **hidden),
    keras.layers.Dropout(0.25),

    Dense(128, **hidden),
    Dense(64, **hidden),
    Dense(32, **hidden),
    keras.layers.Dropout(0.25),

    Dense(1, activation="sigmoid"),
])

model_1.compile(optimizer="RMSprop", loss='MAE', metrics=["MAE"])

In [None]:
# Training inputs and target restricted to the solo rows.
is_solo = train.matchType_1 == "solo"
x_train = train.loc[is_solo, list_feat]
y_train = train.loc[is_solo, ["winPlacePerc"]]

In [None]:
# Train the same model object in two passes: 50 epochs with batches of 10000
# rows, then 30 more epochs with batches of 2000. Each call holds out 20% of
# the rows for validation (validation_split=0.2).
# NOTE(review): the evaluation plots read model_1.history; presumably that only
# reflects the second fit call - confirm against the Keras version in use.
model_1.fit(x=x_train, y=y_train, epochs=50, batch_size=10000, validation_split=0.2, shuffle=True)
model_1.fit(x=x_train, y=y_train, epochs=30, batch_size=2000, validation_split=0.2, shuffle=True)

In [None]:
# for epoch in tqdm.tqdm(range(1, 2)):
#     for i in pd.unique(x_train.matchId):
#         model_1.fit(x=x_train.loc[x_train.matchId == i, list_feat], y=y_train.loc[y_train.matchId == i, "winPlacePerc"], batch_size=len(y_train.loc[y_train.matchId == i, "winPlacePerc"]), epochs=1, verbose=0)

In [None]:
model_1.save("model_1_solo.h5")
# model_1 = keras.models.load_model("../input/model-pubg/model_1_fpp.h5")

In [None]:
# model_1.fit(x=x_train, y=y_train, epochs=50, batch_size=100, validation_split=0.2, shuffle=True)

In [None]:
# keras.models.save_model(model_1, "model_1_fpp.h5")

In [None]:
# Model 2 (duo)
# Same layout as the solo model: hidden layers widen from 32 to 256 units and
# narrow back to 32, with a Dropout(0.25) after each stage and a single
# sigmoid output unit. Trained with RMSprop on mean absolute error.
hidden = dict(activation="elu", kernel_initializer="he_normal")

model_2 = keras.models.Sequential([
    Dense(32, input_dim=len(list_feat), **hidden),
    Dense(64, **hidden),
    Dense(128, **hidden),
    keras.layers.Dropout(0.25),

    Dense(256, **hidden),
    Dense(256, **hidden),
    keras.layers.Dropout(0.25),

    Dense(128, **hidden),
    Dense(64, **hidden),
    Dense(32, **hidden),
    keras.layers.Dropout(0.25),

    Dense(1, activation="sigmoid"),
])

model_2.compile(optimizer="RMSprop", loss='MAE', metrics=["MAE"])

In [None]:
# Training inputs and target restricted to the duo rows.
is_duo = train.matchType_1 == "duo"
x_train = train.loc[is_duo, list_feat]
y_train = train.loc[is_duo, ["winPlacePerc"]]

In [None]:
# Train the same model object in two passes: 50 epochs with batches of 10000
# rows, then 40 more epochs with batches of 2000. Each call holds out 20% of
# the rows for validation (validation_split=0.2).
# NOTE(review): the evaluation plots read model_2.history; presumably that only
# reflects the second fit call - confirm against the Keras version in use.
model_2.fit(x=x_train, y=y_train, epochs=50, batch_size=10000, validation_split=0.2, shuffle=True)
model_2.fit(x=x_train, y=y_train, epochs=40, batch_size=2000, validation_split=0.2, shuffle=True)

In [None]:
# for epoch in tqdm.tqdm(range(1, 2)):
#     for i in pd.unique(x_train.matchId):
#         model_2.fit(x=x_train.loc[x_train.matchId == i, list_feat], y=y_train.loc[y_train.matchId == i, "winPlacePerc"], batch_size=len(y_train.loc[y_train.matchId == i, "winPlacePerc"]), epochs=1, verbose=0)

In [None]:
model_2.save("model_2_duo.h5")
# model_1 = keras.models.load_model("../input/model-pubg/model_1_fpp.h5")

In [None]:
# Model 3 (squad)
# Same layer widths as the solo and duo models, except that the dropout after
# the two 256-unit layers uses a rate of 0.35 instead of 0.25.
hidden = dict(activation="elu", kernel_initializer="he_normal")

model_3 = keras.models.Sequential([
    Dense(32, input_dim=len(list_feat), **hidden),
    Dense(64, **hidden),
    Dense(128, **hidden),
    keras.layers.Dropout(0.25),

    Dense(256, **hidden),
    Dense(256, **hidden),
    keras.layers.Dropout(0.35),

    Dense(128, **hidden),
    Dense(64, **hidden),
    Dense(32, **hidden),
    keras.layers.Dropout(0.25),

    Dense(1, activation="sigmoid"),
])

model_3.compile(optimizer="RMSprop", loss='MAE', metrics=["MAE"])

In [None]:
# Training inputs and target restricted to the squad rows.
is_squad = train.matchType_1 == "squad"
x_train = train.loc[is_squad, list_feat]
y_train = train.loc[is_squad, ["winPlacePerc"]]

In [None]:
# Train the same model object in two passes: 60 epochs with batches of 10000
# rows, then 50 more epochs with batches of 3000. Each call holds out 20% of
# the rows for validation (validation_split=0.2).
# NOTE(review): the evaluation plots read model_3.history; presumably that only
# reflects the second fit call - confirm against the Keras version in use.
model_3.fit(x=x_train, y=y_train, epochs=60, batch_size=10000, validation_split=0.2, shuffle=True)
model_3.fit(x=x_train, y=y_train, epochs=50, batch_size=3000, validation_split=0.2, shuffle=True)

In [None]:
# for epoch in tqdm.tqdm(range(1, 2)):
#     for i in pd.unique(x_train.matchId):
#         model_3.fit(x=x_train.loc[x_train.matchId == i, list_feat], y=y_train.loc[y_train.matchId == i, "winPlacePerc"], batch_size=len(y_train.loc[y_train.matchId == i, "winPlacePerc"]), epochs=1, verbose=0)

In [None]:
model_3.save("model_3_squad.h5")
# model_1 = keras.models.load_model("../input/model-pubg/model_1_fpp.h5")

In [None]:
# Model 4 (etc)
# Smaller network than the other three: it widens from 32 to 128 units and
# narrows straight back to 32 (no 256-unit stage), with a Dropout(0.25) after
# each half and a single sigmoid output unit.
hidden = dict(activation="elu", kernel_initializer="he_normal")

model_4 = keras.models.Sequential([
    Dense(32, input_dim=len(list_feat), **hidden),
    Dense(64, **hidden),
    Dense(128, **hidden),
    keras.layers.Dropout(0.25),

    Dense(128, **hidden),
    Dense(64, **hidden),
    Dense(32, **hidden),
    keras.layers.Dropout(0.25),

    Dense(1, activation="sigmoid"),
])

model_4.compile(optimizer="RMSprop", loss='MAE', metrics=["MAE"])

In [None]:
# Training inputs and target restricted to the "etc" rows.
is_etc = train.matchType_1 == "etc"
x_train = train.loc[is_etc, list_feat]
y_train = train.loc[is_etc, ["winPlacePerc"]]

In [None]:
# Train the same model object in two passes: 70 epochs with batches of 10000
# rows, then 150 more epochs with batches of 1000. Each call holds out 20% of
# the rows for validation (validation_split=0.2).
# NOTE(review): the evaluation plots read model_4.history; presumably that only
# reflects the second fit call - confirm against the Keras version in use.
model_4.fit(x=x_train, y=y_train, epochs=70, batch_size=10000, validation_split=0.2, shuffle=True)
model_4.fit(x=x_train, y=y_train, epochs=150, batch_size=1000, validation_split=0.2, shuffle=True)

In [None]:
# for epoch in tqdm.tqdm(range(1, 5)):
#     for i in pd.unique(x_train.matchId):
#         model_4.fit(x=x_train.loc[x_train.matchId == i, list_feat], y=y_train.loc[y_train.matchId == i, "winPlacePerc"], batch_size=len(y_train.loc[y_train.matchId == i, "winPlacePerc"]), epochs=1, verbose=0)

In [None]:
model_4.save("model_4_etc.h5")
# model_1 = keras.models.load_model("../input/model-pubg/model_1_fpp.h5")

In [None]:
# Drop the names bound to the training data before the test data is loaded
# (train and X are two names for the same frame).
del train, x_train, y_train, X

### 6-2. 생성모델 평가

In [None]:
plt.figure(figsize=(30, 30))
plt.suptitle("model History", fontsize = 20)

# One panel per trained model on a 2x2 grid, in model order.
trained_models = [
    ("model_1", model_1),
    ("model_2", model_2),
    ("model_3", model_3),
    ("model_4", model_4),
]
for panel, (panel_title, fitted) in enumerate(trained_models, start=1):
    run = fitted.history
    plt.subplot(2, 2, panel)
    plt.title(panel_title)
    plt.plot(run.history["mean_absolute_error"], label="training")
    plt.plot(run.history["val_mean_absolute_error"], label="validation")
    # Dashed horizontal reference lines at fixed MAE levels.
    for level, colour in ((0.3, "red"), (0.2, "yellow"), (0.15, "green")):
        plt.axhline(level, c=colour, linestyle="--")
    plt.xticks(run.epoch)
    plt.xlabel("Epoch")
    plt.ylabel("MAE")
    plt.legend()

plt.show()

### 6-3. 예측 데이터 준비

In [None]:
test = pd.read_csv("../input/test_V2.csv")

In [None]:
print("Check The NA value in test data")
# Series.isna().sum() counts in vectorised code; the builtin sum() would walk
# every element of each column in Python.
for i in test.columns[test.dtypes != "O"]:
    print(i, ":", test[i].isna().sum())

Game Player

In [None]:
len(pd.unique(test.matchId)), sum(test.groupby("matchId").size() < 9)

In [None]:
# Count the rows per match and attach that count to every row as "player".
temp = test.groupby("matchId").size().reset_index(name="player")
test = test.merge(temp, on="matchId")

Split matchType by group size and by viewpoint

In [None]:
# Classification by group size. Each matchType listed under a key is mapped to
# that key; any matchType that is not listed keeps the "-" placeholder.
group_size_types = {
    "solo": ["solo-fpp", "solo", "normal-solo-fpp", "normal-solo"],
    "duo": ["duo-fpp", "duo", "normal-duo-fpp", "normal-duo"],
    "squad": ["squad-fpp", "squad", "normal-squad-fpp", "normal-squad"],
    "etc": ["flarefpp", "flaretpp", "crashfpp", "crashtpp"],
}

test["matchType_1"] = "-"
for group_label, match_types in group_size_types.items():
    test.loc[test.matchType.isin(match_types), "matchType_1"] = group_label

In [None]:
# Classification by viewpoint (first-person "fpp" vs third-person "tpp").
# Any matchType that is not listed keeps the "-" placeholder.
viewpoint_types = {
    "fpp": ["solo-fpp", "duo-fpp", "squad-fpp",
            "normal-solo-fpp", "normal-duo-fpp", "normal-squad-fpp",
            "crashfpp", "flarefpp"],
    "tpp": ["solo", "duo", "squad",
            "normal-solo", "normal-duo", "normal-squad",
            "crashtpp", "flaretpp"],
}

test["matchType_2"] = "-"
for view_label, match_types in viewpoint_types.items():
    test.loc[test.matchType.isin(match_types), "matchType_2"] = view_label

one-hot-encoding

In [None]:
# One indicator column per group-size class: 1 where matchType_1 equals the
# column's name, 0 everywhere else.
for group_label in ("solo", "duo", "squad", "etc"):
    test[group_label] = (test.matchType_1 == group_label).astype("int64")

In [None]:
# One indicator column per viewpoint: 1 where matchType_2 equals the column's
# name, 0 everywhere else.
for view_label in ("fpp", "tpp"):
    test[view_label] = (test.matchType_2 == view_label).astype("int64")

killPoints, winPoints

In [None]:
# for i in pd.unique(train.matchId):
#     train.loc[(train.matchId == i) & (train.killPoints == 0), "killPoints"] = np.mean(train.loc[train.matchId == i, "killPoints"])

In [None]:
# for i in pd.unique(train.matchId):
#     train.loc[(train.matchId == i) & (train.winPoints == 0), "winPoints"] = np.mean(train.loc[train.matchId == i, "winPoints"])

정규화

In [None]:
# solo_scale = solo_minmax.transform(test.loc[test.matchType_1 == "solo", feature])
# duo_scale = duo_minmax.transform(test.loc[test.matchType_1 == "duo", feature])
# squad_scale = squad_minmax.transform(test.loc[test.matchType_1 == "squad", feature])
# etc_scale = etc_minmax.transform(test.loc[test.matchType_1 == "etc", feature])

In [None]:
# solo_scale = pd.DataFrame(solo_scale, columns=feature)
# duo_scale = pd.DataFrame(duo_scale, columns=feature)
# squad_scale = pd.DataFrame(squad_scale, columns=feature)
# etc_scale = pd.DataFrame(etc_scale, columns=feature)

In [None]:
# _ = test.loc[test.matchType_1 == "solo", ["matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc", "Id"]]
# _ = _.reset_index()
# solo_scale = pd.concat([solo_scale, _], axis=1)

In [None]:
# _ = test.loc[test.matchType_1 == "duo", ["matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc", "Id"]]
# _ = _.reset_index()
# duo_scale = pd.concat([duo_scale, _], axis=1)

In [None]:
# _ = test.loc[test.matchType_1 == "squad", ["matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc", "Id"]]
# _ = _.reset_index()
# squad_scale = pd.concat([squad_scale, _], axis=1)

In [None]:
# _ = test.loc[test.matchType_1 == "etc", ["matchType_1", "matchType_2", "solo", "duo", "squad", "etc", "fpp", "tpp", "winPlacePerc", "Id"]]
# _ = _.reset_index()
# etc_scale = pd.concat([etc_scale, _], axis=1)

In [None]:
# X = pd.concat([solo_scale, duo_scale, squad_scale, etc_scale])

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for t in ["tpp", "fpp"]:
#         test.loc[test.matchType_2 == t, i] = (test.loc[test.matchType_2 == t, i] - np.min(test.loc[test.matchType_2 == t, i])) / (np.max(test.loc[test.matchType_2 == t, i]) - np.min(test.loc[test.matchType_2 == t, i]))

In [None]:
# Use the player Id as the row index; set_index already names the index "Id".
test = test.set_index("Id")

In [None]:
# Split the test columns the same way as the training columns:
#   temp_1 - matchId plus the per-player statistics (scaled within each match)
#   temp_2 - match-level columns and the fpp/tpp indicators
temp_1 = test.loc[:, feature_1]
temp_2 = test.loc[:, feature_2]

In [None]:
def minmax(attr):
    """Rescale a pandas Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    attr : pandas.Series
        Numeric values to rescale (e.g. one column of one match when used
        with ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        ``(attr - min) / (max - min)`` on the same index. When every value is
        identical the range is zero, so a Series of ``0.0`` is returned
        instead of dividing by zero.
    """
    # Series.min/max run in vectorised code and skip NaN; the builtin
    # min()/max() iterate element by element and give order-dependent results
    # as soon as a NaN is present.
    lo = attr.min()
    hi = attr.max()
    span = hi - lo
    if span == 0:
        # Return an index-aligned Series rather than a bare scalar so the
        # result always has the same shape as the input.
        return pd.Series(0.0, index=attr.index, name=attr.name)
    return (attr - lo) / span

In [None]:
# Scale every per-player statistic to [0, 1] within its own match.
# transform() returns a new frame and leaves temp_1 untouched, so the result
# must be kept. matchId is the grouping key (not part of the transform output)
# and is re-attached unchanged so temp_1 keeps the column layout of feature_1.
scaled_stats = temp_1.groupby("matchId").transform(minmax)
temp_1 = pd.concat([temp_1[["matchId"]], scaled_stats], axis=1)

# Work on an explicit copy so the column assignments below never target a
# slice of the original frame.
temp_2 = temp_2.copy()
# The first four columns of temp_2 are scaled over the whole frame; the
# remaining fpp/tpp indicator columns are left as they are.
# NOTE(review): the minimum and maximum used here come from the test frame
# itself, not from the training frame the models were fitted on - confirm this
# is intended.
for i in temp_2.columns[:4]:
    lo, hi = temp_2[i].min(), temp_2[i].max()
    temp_2[i] = (temp_2[i] - lo) / (hi - lo)

In [None]:
# Re-assemble the prediction frame: both feature blocks plus the group-size
# label, matched on the "Id" index level.
X = pd.merge(temp_1, temp_2, on="Id")
# Only matchType_1 is taken from `test`: winPlacePerc is the value being
# predicted for these rows, so the test frame is not expected to carry that
# column and selecting it would raise a KeyError.
X = pd.merge(X, test.loc[:, ["matchType_1"]], on="Id")

In [None]:
X.reset_index()

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for idx, g in enumerate(pd.unique(test.matchId)):
#         test.loc[test.matchId == g, i] = robust_scale(test.loc[test.matchId == g, i])

In [None]:
# for i in ["boosts", "damageDealt", "heals", "killPlace", "kills", "killStreaks", "longestKill", "walkDistance", "weaponsAcquired"]:
#     for idx, g in enumerate(pd.unique(test.matchId)):
#         test.loc[test.matchId == g, i] = minmax_scale(test.loc[test.matchId == g, i])

### 6-4. 예측실행

In [None]:
# reset_index() returns a new frame rather than modifying X, so keep the
# result: this turns the "Id" index into a regular column, which the
# submission step selects by name.
test = X.reset_index()
test

In [None]:
# Predict each group-size subset with the model trained on that subset
# (solo -> model_1, duo -> model_2, squad -> model_3, etc -> model_4).
# Rows whose matchType_1 is none of these four labels get no prediction.
result_1 = model_1.predict(test.loc[test.matchType_1 == "solo", list_feat])
result_2 = model_2.predict(test.loc[test.matchType_1 == "duo", list_feat])
result_3 = model_3.predict(test.loc[test.matchType_1 == "squad", list_feat])
result_4 = model_4.predict(test.loc[test.matchType_1 == "etc", list_feat])

In [None]:
# "Id" may be a regular column or still be the index (it is moved to the index
# before normalisation), so expose it as a column either way.
id_frame = test if "Id" in test.columns else test.reset_index()

# Stack the Ids group by group in the same order as the prediction arrays are
# stacked below, so that row k of `temp` belongs to row k of `_`.
# pd.concat replaces the chained DataFrame.append calls, which no longer exist
# in pandas 2.0 and later.
temp = pd.concat([id_frame.loc[id_frame.matchType_1 == group_label, ["Id"]]
                  for group_label in ("solo", "duo", "squad", "etc")])
_ = pd.concat([pd.DataFrame(predicted, columns = ["winPlacePerc"])
               for predicted in (result_1, result_2, result_3, result_4)])

In [None]:
# Put Ids and predictions side by side. Both frames get a fresh 0..n-1 index
# first, so rows are paired by position rather than by their old index labels.
result = pd.concat([temp.reset_index(drop=True), _.reset_index(drop=True)], axis=1)

In [None]:
np.sum(result.winPlacePerc.isna())

In [None]:
np.sum(result.winPlacePerc < 0)

In [None]:
np.sum(result.winPlacePerc > 1)

In [None]:
# Replace missing predictions with 0, force every value into [0, 1], and write
# the submission file.
filled = result["winPlacePerc"].fillna(0)
result["winPlacePerc"] = filled.clip(lower=0, upper=1)
result.to_csv('submission.csv', index=False)

---

The end of notebook

winner winner chicken dinner