In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression as lr
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.ensemble import AdaBoostClassifier as ab
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as knn

In [None]:
## Read data
# Load the aggregated table and keep only rows whose survival time is non-zero;
# the per-time speed features computed further down divide by this column.
agg = pd.read_csv("agg.csv").loc[lambda frame: frame["player_survive_time"] != 0]

In [None]:
## Draw a random sample of matches
# An earlier run of len(agg.match_id.drop_duplicates()) reported 729969 distinct matches.
unique_matches = agg["match_id"].unique()
# replace=False: np.random.choice samples WITH replacement by default, so the same match
# could be drawn twice and the sample would silently contain fewer than 730 matches.
# A seeded Generator makes the sample identical on every Restart & Run All.
# min(...) keeps the draw valid if the table holds fewer than 730 matches.
sample_match = np.random.default_rng(42).choice(
    unique_matches, size=min(730, len(unique_matches)), replace=False
)

In [None]:
## fetch samples
# Keep only the rows of the sampled matches and drop the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv -- verify).
# DataFrame.drop without inplace returns a new frame, so no slice of `agg` is mutated in
# place (which can raise SettingWithCopyWarning) and re-running the cell is idempotent.
agg_sample = agg[agg["match_id"].isin(sample_match)].drop(columns="Unnamed: 0")

In [None]:
## Compute average ride and walk speed
# Distance travelled divided by survival time, computed column-wise.
# Columns are addressed by name rather than by hard-coded position, so the result does not
# depend on the CSV's column order, and .assign() returns a new frame instead of writing
# floats into int-initialised columns of a slice. The new columns are appended at the end
# in this order (avg_ride_speed, then avg_walk_speed).
agg_sample = agg_sample.assign(
    avg_ride_speed=agg_sample["player_dist_ride"] / agg_sample["player_survive_time"],
    avg_walk_speed=agg_sample["player_dist_walk"] / agg_sample["player_survive_time"],
)

In [None]:
## Split the sample into three categories
# One frame per party size (1, 2 and 4), unpacked in that order.
agg_sample_p1, agg_sample_p2, agg_sample_p4 = (
    agg_sample.loc[agg_sample["party_size"].eq(size)] for size in (1, 2, 4)
)

In [None]:
## Change Y into 3 classes
def triple(x):
    """Map a placement to one of three class codes.

    The argument is first converted with int(). The result is 0 when the
    placement equals 1, 1 when it is at most 10 (this includes any value
    below 1), and 2 otherwise.
    """
    placement = int(x)
    if placement == 1:
        return 0
    return 1 if placement <= 10 else 2

In [None]:
X1 = agg_sample_p1[["avg_ride_speed", "avg_walk_speed", "player_dmg", "player_kills"]].values
# Iterate the column as a Series so triple() receives scalars; mapping over a 2-D array
# hands it 1-element arrays, and int() on those is deprecated by NumPy. y1 stays a plain list.
y1 = [triple(placement) for placement in agg_sample_p1["team_placement"]]
## Use SMOTEENN to deal with the class imbalance, then split into training and testing sets
# Named `resampler_1` rather than `sm` so the `statsmodels.api as sm` import is not shadowed.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
# random_state makes the resampled data reproducible.
resampler_1 = SMOTEENN(random_state=42)
X1_r, y1_r = resampler_1.fit_resample(X1, y1)
# NOTE(review): resampling happens before the split, so the test set also contains
# resampled points; the test accuracies below are likely optimistic.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_r, y1_r, test_size=0.3, random_state=42)

In [None]:
# Random forest on the party-size-1 training split.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was removed in
# scikit-learn 1.3 and now raises. random_state makes the forest reproducible.
rf_model_1 = rf(
    n_estimators=100,
    max_depth=100,
    criterion="entropy",
    oob_score=True,
    n_jobs=-1,
    max_features="sqrt",
    random_state=42,
).fit(X1_train, y1_train)
# Out-of-bag score and per-feature importances, then test accuracy in percent.
print(rf_model_1.oob_score_, rf_model_1.feature_importances_)
print(np.mean(rf_model_1.predict(X1_test) == y1_test)*100)

In [None]:
X2 = agg_sample_p2[["avg_ride_speed", "avg_walk_speed", "player_dmg", "player_kills"]].values
# Iterate the column as a Series so triple() receives scalars; mapping over a 2-D array
# hands it 1-element arrays, and int() on those is deprecated by NumPy. y2 stays a plain list.
y2 = [triple(placement) for placement in agg_sample_p2["team_placement"]]
## Use SMOTEENN to deal with the class imbalance, then split into training and testing sets
# Named `resampler_2` rather than `sm` so the `statsmodels.api as sm` import is not shadowed.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
# random_state makes the resampled data reproducible.
resampler_2 = SMOTEENN(random_state=42)
X2_r, y2_r = resampler_2.fit_resample(X2, y2)
# NOTE(review): resampling happens before the split, so the test set also contains
# resampled points; the test accuracies below are likely optimistic.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_r, y2_r, test_size=0.3, random_state=42)

In [None]:
# Random forest on the party-size-2 training split.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was removed in
# scikit-learn 1.3 and now raises. random_state makes the forest reproducible.
rf_model_2 = rf(
    n_estimators=100,
    max_depth=100,
    criterion="entropy",
    oob_score=True,
    n_jobs=-1,
    max_features="sqrt",
    random_state=42,
).fit(X2_train, y2_train)
# Out-of-bag score and per-feature importances, then test accuracy in percent.
print(rf_model_2.oob_score_, rf_model_2.feature_importances_)
print(np.mean(rf_model_2.predict(X2_test) == y2_test)*100)

In [None]:
X4 = agg_sample_p4[["avg_ride_speed", "avg_walk_speed", "player_dmg", "player_kills"]].values
# Iterate the column as a Series so triple() receives scalars; mapping over a 2-D array
# hands it 1-element arrays, and int() on those is deprecated by NumPy. y4 stays a plain list.
y4 = [triple(placement) for placement in agg_sample_p4["team_placement"]]
## Use SMOTEENN to deal with the class imbalance, then split into training and testing sets
# Named `resampler_4` rather than `sm` so the `statsmodels.api as sm` import is not shadowed.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
# random_state makes the resampled data reproducible.
resampler_4 = SMOTEENN(random_state=42)
X4_r, y4_r = resampler_4.fit_resample(X4, y4)
# NOTE(review): resampling happens before the split, so the test set also contains
# resampled points; the test accuracies below are likely optimistic.
X4_train, X4_test, y4_train, y4_test = train_test_split(X4_r, y4_r, test_size=0.3, random_state=42)

In [None]:
# Random forest on the party-size-4 training split.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was removed in
# scikit-learn 1.3 and now raises. random_state makes the forest reproducible.
rf_model_4 = rf(
    n_estimators=100,
    max_depth=100,
    criterion="entropy",
    oob_score=True,
    n_jobs=-1,
    max_features="sqrt",
    random_state=42,
).fit(X4_train, y4_train)
# Out-of-bag score and per-feature importances, then test accuracy in percent.
print(rf_model_4.oob_score_, rf_model_4.feature_importances_)
print(np.mean(rf_model_4.predict(X4_test) == y4_test)*100)

In [None]:
# Base learner passed to the three AdaBoost models below: a decision tree capped at depth 10.
base_model1 = DecisionTreeClassifier(max_depth=10)

In [None]:
# AdaBoost over base_model1 for party-size-1 games.
ab_model1 = ab(base_model1, n_estimators=100, learning_rate=0.5)
ab_model1.fit(X1_train, y1_train)
# First line: training accuracy (%) and feature importances. Second line: test accuracy (%).
print(
    np.mean((ab_model1.predict(X1_train) == y1_train) * 100),
    ab_model1.feature_importances_,
)
print(np.mean((ab_model1.predict(X1_test) == y1_test) * 100))

In [None]:
# AdaBoost over base_model1 for party-size-2 games.
ab_model2 = ab(base_model1, n_estimators=100, learning_rate=0.5)
ab_model2.fit(X2_train, y2_train)
# First line: training accuracy (%) and feature importances. Second line: test accuracy (%).
print(
    np.mean((ab_model2.predict(X2_train) == y2_train) * 100),
    ab_model2.feature_importances_,
)
print(np.mean((ab_model2.predict(X2_test) == y2_test) * 100))

In [None]:
# AdaBoost over base_model1 for party-size-4 games.
ab_model4 = ab(base_model1, n_estimators=100, learning_rate=0.5)
ab_model4.fit(X4_train, y4_train)
# First line: training accuracy (%) and feature importances. Second line: test accuracy (%).
print(
    np.mean((ab_model4.predict(X4_train) == y4_train) * 100),
    ab_model4.feature_importances_,
)
print(np.mean((ab_model4.predict(X4_test) == y4_test) * 100))

In [None]:
# One 5-nearest-neighbour classifier, refit in turn on each party-size training set below;
# after the last fit it only holds the party-size-4 model.
knn_model = knn(n_neighbors=5, algorithm="auto", n_jobs=-1)

In [None]:
# Refit the shared KNN on party-size-1 data; the cell shows (train accuracy %, test accuracy %).
knn_model.fit(X1_train, y1_train)
tuple(
    np.mean(knn_model.predict(features) == labels) * 100
    for features, labels in ((X1_train, y1_train), (X1_test, y1_test))
)

In [None]:
# Refit the shared KNN on party-size-2 data; the cell shows (train accuracy %, test accuracy %).
knn_model.fit(X2_train, y2_train)
tuple(
    np.mean(knn_model.predict(features) == labels) * 100
    for features, labels in ((X2_train, y2_train), (X2_test, y2_test))
)

In [None]:
# Refit the shared KNN on party-size-4 data; the cell shows (train accuracy %, test accuracy %).
knn_model.fit(X4_train, y4_train)
tuple(
    np.mean(knn_model.predict(features) == labels) * 100
    for features, labels in ((X4_train, y4_train), (X4_test, y4_test))
)

In [None]:
from sklearn.linear_model import LogisticRegression as logr

In [None]:
# Default-parameter logistic regression on party-size-1 data;
# the cell shows (train accuracy %, test accuracy %).
logr_model1 = logr()
logr_model1.fit(X1_train, y1_train)
tuple(
    np.mean(logr_model1.predict(features) == labels) * 100
    for features, labels in ((X1_train, y1_train), (X1_test, y1_test))
)

In [None]:
# Default-parameter logistic regression on party-size-2 data;
# the cell shows (train accuracy %, test accuracy %).
logr_model2 = logr()
logr_model2.fit(X2_train, y2_train)
tuple(
    np.mean(logr_model2.predict(features) == labels) * 100
    for features, labels in ((X2_train, y2_train), (X2_test, y2_test))
)

In [None]:
# Default-parameter logistic regression on party-size-4 data;
# the cell shows (train accuracy %, test accuracy %).
logr_model4 = logr()
logr_model4.fit(X4_train, y4_train)
tuple(
    np.mean(logr_model4.predict(features) == labels) * 100
    for features, labels in ((X4_train, y4_train), (X4_test, y4_test))
)

In [None]:
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def _resampled_frame(features, ranks):
    """Build a frame with the rank class first, then the four feature columns of `features`."""
    feature_names = ("avg_ride_speed", "avg_walk_speed", "player_dmg", "player_kills")
    feature_columns = {name: features[:, position] for position, name in enumerate(feature_names)}
    return pd.DataFrame({"rank": ranks, **feature_columns})


# One plotting frame per party size, built from the resampled features and labels.
df_p1 = _resampled_frame(X1_r, y1_r)
df_p2 = _resampled_frame(X2_r, y2_r)
df_p4 = _resampled_frame(X4_r, y4_r)

In [None]:
def plots(df, party_size):
    """Draw a 2x2 grid of boxplots of each feature against rank and save it as a PNG.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns "rank", "player_kills", "player_dmg",
        "avg_walk_speed" and "avg_ride_speed". "rank" is expected to hold the
        class codes 0, 1 and 2.
    party_size
        Interpolated with %s into every panel title and into the output file
        name "../agg_plot/p<party_size>.png".

    Returns
    -------
    None. The figure is left open so the notebook renders it inline.
    """
    from pathlib import Path

    # Pin the category order so the tick labels below always line up with the class
    # codes, even when a filtered frame is missing one of the classes.
    rank_order = [0, 1, 2]
    # (column, wording used in the title, position in the 2x2 grid)
    panels = [
        ("player_kills", "player kills", (0, 0)),
        ("player_dmg", "player damage", (0, 1)),
        ("avg_walk_speed", "player average walk speed", (1, 0)),
        ("avg_ride_speed", "player average ride speed", (1, 1)),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=120, sharex=True)
    for column, wording, position in panels:
        ax = axes[position]
        sns.boxplot(x="rank", y=column, data=df, order=rank_order, ax=ax)
        ax.set_title('Boxplot of rank vs %s (party size %s games)' % (wording, party_size),
                     fontdict={'size': 8})

    # Fix the tick positions before naming them; the x axis is shared between panels.
    axes[1, 1].set_xticks(range(len(rank_order)))
    axes[1, 1].set_xticklabels(["1st", "2nd-10th", "11th+"])

    #fig.subplots_adjust(left=-0.1, bottom=-0.5)
    # Create the output directory if needed so savefig does not fail on a fresh checkout.
    out_path = Path("../agg_plot") / ("p%s.png" % (party_size,))
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(str(out_path))

In [None]:
# Party-size-1 games: draw the 2x2 boxplot grid and save it under ../agg_plot/.
plots(df_p1, 1)

In [None]:
# Party-size-2 games: draw the 2x2 boxplot grid and save it under ../agg_plot/.
plots(df_p2, 2)

In [None]:
# Keep only rows with average walk speed below 10 and average ride speed below 40, then re-plot.
# plots() names its output file after party_size only, so this overwrites the PNG saved
# by the unfiltered party-size-2 call.
df_p2_nc = df_p2.loc[df_p2["avg_walk_speed"].lt(10) & df_p2["avg_ride_speed"].lt(40)]
plots(df_p2_nc, 2)

In [None]:
# Party-size-4 games: draw the 2x2 boxplot grid and save it under ../agg_plot/.
plots(df_p4, 4)

In [None]:
# Keep only rows with average walk speed below 10 and average ride speed below 40, then re-plot.
# plots() names its output file after party_size only, so this overwrites the PNG saved
# by the unfiltered party-size-4 call.
df_p4_nc = df_p4.loc[df_p4["avg_walk_speed"].lt(10) & df_p4["avg_ride_speed"].lt(40)]
plots(df_p4_nc, 4)