In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.optimize import linprog
import cvxpy as cp
import warnings
warnings.filterwarnings("ignore")
# pd.options.display.max_columns = None
# pd.options.display.max_rows = None
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import linkage

In [4]:
# df = pd.read_csv('player_data.csv')

# # 좌표를 분리하는 함수 정의
# def split_coordinates(trajectory):
#     if not trajectory or not isinstance(trajectory, str):  # 유효성 검사
#         return []
#     try:
#         points = trajectory.split(" -> ")
#         coordinates = [tuple(map(float, point.strip("()").split(","))) for point in points]
#         return coordinates
#     except ValueError as e:
#         # print(f"Error processing trajectory: {trajectory}, Error: {e}")
#         return []

# # 좌표를 리스트로 분리
# coordinate_list = df["movement_routes"].apply(split_coordinates)

# # 최대 좌표 개수 계산
# max_points = max(coordinate_list.apply(len))  # 최대 좌표 개수

# # 새로운 컬럼 이름 생성 (x1, y1, z1, x2, y2, z2, ...)
# column_names = []
# for i in tqdm(range(1, max_points + 1)):
#     column_names.extend([f"x{i}", f"y{i}", f"z{i}"])

# # NaN을 방지하며 데이터를 펼침
# expanded_df = pd.DataFrame(
#     coordinate_list.tolist()
# ).apply(
#     lambda row: pd.Series([v for point in row if point for v in point]),
#     axis=1
# )

# # 컬럼 이름 지정
# expanded_df.columns = column_names[:expanded_df.shape[1]]

# # 원본 데이터프레임과 병합
# result = pd.concat([df, expanded_df], axis=1)

# result.to_csv('new_player_data.csv',index=False)

In [5]:
#row data와 다름 (위 주석 코드 실행한 csv)
df = pd.read_csv('new_player_data.csv')
df.shape

(170756, 1128)

### 1. Data Preprocessing

In [6]:
#첫 번째 시작 좌표가 None 값인 행 제거
#탈퇴한 사람들 제거(피처 생성에 필요)
def explorer_data_preprocessing(df, location_col='first_location_x', name_col='player_name'):
    df = df[df[location_col] != 'None']
    df = df[~df[name_col].isnull()]
    return df

In [7]:
df = explorer_data_preprocessing(df)
df.shape

(170530, 1128)

### 2. Feature 생성

In [8]:
# #거리 합산
# df['total_distance'] = df['walk_distance']+df['ride_distance'] + df['swim_distance']
# #분당 이동 거리의 합산
# coordinate_columns = [col for col in df1.columns if col.startswith(('x', 'y', 'z'))]
# coords = df[coordinate_columns].values.reshape(len(df1), -1, 3)
# coords = np.nan_to_num(coords)
# distances = np.sqrt(np.sum(np.diff(coords, axis=1)**2, axis=2))
# df['total_movement_distance'] = distances.sum(axis=1)
# # 플레이어별로 탐험한 맵의 개수 계산
# map_diversity = df.groupby('player_name')['map_name'].nunique().reset_index()
# map_diversity.columns = ['player_name', 'unique_maps']
# df = df.merge(map_diversity[['player_name', 'unique_maps']], on='player_name', how='left')

In [9]:
def feature_generation(df):  
    # 거리 합산
    df['total_distance'] = df['walk_distance'] + df['ride_distance'] + df['swim_distance']
    
    # 분당 이동 거리 합산 계산
    coordinate_columns = [col for col in df.columns if col.startswith(('x', 'y', 'z'))]
    coords = df[coordinate_columns].values.reshape(len(df), -1, 3)
    coords = np.nan_to_num(coords)
    distances = np.sqrt(np.sum(np.diff(coords, axis=1) ** 2, axis=2))
    df['total_movement_distance'] = distances.sum(axis=1)
    
    # 플레이어별 탐험한 맵의 개수 계산
    map_diversity = df.groupby('player_name')['map_name'].nunique().reset_index()
    map_diversity.columns = ['player_name', 'unique_maps']
    df = df.merge(map_diversity[['player_name', 'unique_maps']], on='player_name', how='left')
    
    return df

In [10]:
df = feature_generation(df)

In [11]:
#결측치 확인
df[['total_distance', 'total_movement_distance', 'unique_maps']].isnull().sum()

total_distance             0
total_movement_distance    0
unique_maps                0
dtype: int64

### 3. Clustering(=Labeling)

In [12]:
cluster_col = ['total_distance','total_movement_distance','unique_maps']

In [13]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, random_state=42)

# 클러스터링 수행
df['cluster'] = kmeans.fit_predict(df[cluster_col])

df['cluster'].value_counts()

0    154771
1     15759
Name: cluster, dtype: int64

### 4. Modeling

In [14]:
df1 = df[['total_distance','total_movement_distance','unique_maps','cluster']]

In [15]:
df1.isnull().sum()

total_distance             0
total_movement_distance    0
unique_maps                0
cluster                    0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

X = df1[['total_distance', 'total_movement_distance', 'unique_maps']]
y = df1['cluster']

train_x, test_x, train_y, test_y = train_test_split(X,y,test_size=0.2,random_state=0)

X_train, X_val, y_train, y_val = train_test_split(train_x,train_y,test_size=0.2,random_state=0)

##### 4.1) DecisionTree

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
pred = dt.predict(X_val)
print('DecisionTreeClassifier val acc',accuracy_score(y_val, pred))
print('DecisionTreeClassifier val f1_score',f1_score(y_val, pred))

DecisionTreeClassifier val acc 1.0
DecisionTreeClassifier val f1_score 1.0


In [18]:
test_pred = dt.predict(test_x)
print('DecisionTreeClassifier test acc',accuracy_score(test_y, test_pred))
print('DecisionTreeClassifier test f1_score',f1_score(test_y, test_pred))

DecisionTreeClassifier test acc 1.0
DecisionTreeClassifier test f1_score 1.0


##### 4.2) ExtraTree

In [19]:
et = ExtraTreesClassifier()
et.fit(X_train, y_train)
pred = et.predict(X_val)
print('ExtraTreesClassifier val acc',accuracy_score(y_val, pred))
print('ExtraTreesClassifier val f1_score',f1_score(y_val, pred))

ExtraTreesClassifier val acc 0.9993402968664101
ExtraTreesClassifier val f1_score 0.9964328180737217


In [20]:
test_pred = et.predict(test_x)
print('ExtraTreesClassifier test acc',accuracy_score(test_y, test_pred))
print('ExtraTreesClassifier test f1_score',f1_score(test_y, test_pred))

ExtraTreesClassifier test acc 0.999530874332962
ExtraTreesClassifier test f1_score 0.9974984365228267


#### 4.3) RandomForest

In [21]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
print('RandomForestClassifier val acc',accuracy_score(y_val, pred))
print('RandomForestClassifier val f1_score',f1_score(y_val, pred))

RandomForestClassifier val acc 1.0
RandomForestClassifier val f1_score 1.0


In [22]:
test_pred = rf.predict(test_x)
print('RandomForestClassifier test acc',accuracy_score(test_y, test_pred))
print('RandomForestClassifier test f1_score',f1_score(test_y, test_pred))

RandomForestClassifier test acc 1.0
RandomForestClassifier test f1_score 1.0


#### 4.4) AdaBoost

In [23]:
adb = AdaBoostClassifier()
adb.fit(X_train, y_train)
pred = adb.predict(X_val)
print('AdaBoostClassifier val acc',accuracy_score(y_val, pred))
print('AdaBoostClassifier val f1_score',f1_score(y_val, pred))

AdaBoostClassifier val acc 1.0
AdaBoostClassifier val f1_score 1.0


In [24]:
test_pred = adb.predict(test_x)
print('AdaBoostClassifier test acc',accuracy_score(test_y, test_pred))
print('AdaBoostClassifier test f1_score',f1_score(test_y, test_pred))

AdaBoostClassifier test acc 1.0
AdaBoostClassifier test f1_score 1.0


#### 4.5) GradientBoost

In [25]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
pred = gb.predict(X_val)
print('GradientBoostingClassifier val acc',accuracy_score(y_val, pred))
print('GradientBoostingClassifier val f1_score',f1_score(y_val, pred))

GradientBoostingClassifier val acc 1.0
GradientBoostingClassifier val f1_score 1.0


In [26]:
test_pred = gb.predict(test_x)
print('GradientBoostingClassifier test acc',accuracy_score(test_y, test_pred))
print('GradientBoostingClassifier test f1_score',f1_score(test_y, test_pred))

GradientBoostingClassifier test acc 1.0
GradientBoostingClassifier test f1_score 1.0


- ExtraTree를 제외한 나머지 모델은 1.0  
- 아무거나 쓰면 될 듯

### 5. 최종 모델

- 랜덤포레스트 사용  
- 과적합 방지 -> 튜닝 진행 x, 기본 모델로 선정

In [27]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
print('RandomForestClassifier val acc',accuracy_score(y_val, pred))
print('RandomForestClassifier val f1_score',f1_score(y_val, pred))

RandomForestClassifier val acc 1.0
RandomForestClassifier val f1_score 1.0


In [28]:
test_pred = rf.predict(test_x)
print('RandomForestClassifier test acc',accuracy_score(test_y, test_pred))
print('RandomForestClassifier test f1_score',f1_score(test_y, test_pred))

RandomForestClassifier test acc 1.0
RandomForestClassifier test f1_score 1.0


### 6. Model save to pkl

In [33]:
import joblib
joblib.dump(rf, 'rf_explorer_model.pkl')

['rf_explorer_model.pkl']

In [31]:
model = joblib.load('rf_explorer_model.pkl')
model