In [1]:
import pandas as pd

df = pd.read_csv(r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\output\player_data_event_details_unique.csv')
df = df.drop_duplicates()


In [2]:
df.shape

(150379, 95)

In [10]:
data = df.copy()

#### 변수 선택

In [11]:
selected_columns = ['kills','kill_streaks', 'headshot_kills','damage_dealt', 'time_spent_in_combat_sec']
kill_data = data[selected_columns]

In [12]:
kill_data.describe()

Unnamed: 0,kills,kill_streaks,headshot_kills,damage_dealt,time_spent_in_combat_sec
count,150379.0,150379.0,150379.0,150379.0,150379.0
mean,0.987578,0.575054,0.201105,141.427901,310.569668
std,1.797257,0.79574,0.559689,201.977002,389.835765
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,3.7775
50%,0.0,0.0,0.0,87.58156,146.057
75%,1.0,1.0,0.0,196.16134,473.2895
max,37.0,10.0,13.0,5171.7666,3574.049


#### 스케일링

In [13]:
from sklearn.preprocessing import RobustScaler

# RobustScaler를 사용하여 스케일링
scaler = RobustScaler()
scaled_kill_data = scaler.fit_transform(kill_data)


In [None]:
# StandardScaler는 데이터를 평균 0, 표준편차 1로 변환하여 정규화. 이는 데이터가 정규분포를 따르는 경우에 가장 효과적
# 모든 특성이 동일한 척도로 정규화되어 kill에 대한 민감도 떨어지는 거 같다고 파악
# -> 비정규 분포에 좀더 적합하다고 판단되는 RobustScaler 사용

#### 1. kmeans 적용

In [33]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
kill_data['kmeans_cluster'] = kmeans.fit_predict(scaled_kill_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kill_data['kmeans_cluster'] = kmeans.fit_predict(scaled_kill_data)


In [36]:
cluster_summary = kill_data.groupby('kmeans_cluster')[selected_columns].mean()
cluster_summary

Unnamed: 0_level_0,kills,kill_streaks,headshot_kills,damage_dealt,time_spent_in_combat_sec
kmeans_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.349602,0.295089,0.04006,72.634766,195.519314
1,3.512329,1.682996,0.838432,413.672638,765.874595


#### 2. hdbscan 적용

In [14]:
import hdbscan

# HDBSCAN 클러스터링
hdbscan_cluster = hdbscan.HDBSCAN(
    min_cluster_size=1000,
    min_samples=500,
    cluster_selection_epsilon=0.1,
    cluster_selection_method='eom'
)
labels = hdbscan_cluster.fit_predict(scaled_kill_data)

# 클러스터 수 확인
unique_labels = set(labels)
print(f"Number of clusters: {len(unique_labels) - (1 if -1 in unique_labels else 0)}")




Number of clusters: 8


In [15]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

kill_data['hdbscan_cluster'] = labels

cluster_stats = kill_data.groupby('hdbscan_cluster').agg(
    kills_mean=('kills', 'mean'),
    kills_median=('kills', 'median'),
    kills_std=('kills', 'std'),
    kill_streaks_mean=('kill_streaks', 'mean'),
    kill_streaks_median=('kill_streaks', 'median'),
    kill_streaks_std=('kill_streaks', 'std'),
    headshot_kills_mean=('headshot_kills', 'mean'),
    headshot_kills_median=('headshot_kills', 'median'),
    headshot_kills_std=('headshot_kills', 'std'),
    damage_dealt_mean=('damage_dealt', 'mean'),
    damage_dealt_median=('damage_dealt', 'median'),
    damage_dealt_std=('damage_dealt', 'std'),
    time_spent_in_combat_mean=('time_spent_in_combat_sec', 'mean'),
    time_spent_in_combat_median=('time_spent_in_combat_sec', 'median'),
    time_spent_in_combat_std=('time_spent_in_combat_sec', 'std'),
    count=('hdbscan_cluster', 'count')  # 각 클러스터 데이터 수
).reset_index()

print("[INFO] 클러스터별 변수 통계:")
print(cluster_stats)

fig = go.Figure()

for column in ['kills_mean', 'kill_streaks_mean', 'headshot_kills_mean', 'damage_dealt_mean', 'time_spent_in_combat_mean']:
    fig.add_trace(go.Bar(
        x=cluster_stats['hdbscan_cluster'],
        y=cluster_stats[column],
        name=column.replace('_mean', '').replace('_', ' ').title(),
        text=cluster_stats[column].round(2),
        textposition='auto'
    ))

fig.update_layout(
    title="클러스터별 변수 평균 비교 (HDBSCAN)",
    xaxis_title="클러스터",
    yaxis_title="평균값",
    barmode='group',
    legend_title="변수",
    template='plotly'
)

fig.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kill_data['hdbscan_cluster'] = labels


[INFO] 클러스터별 변수 통계:
   hdbscan_cluster  kills_mean  kills_median  kills_std  kill_streaks_mean  \
0               -1    4.574037           4.0   2.800295           1.955006   
1                0    0.000000           0.0   0.000000           0.000000   
2                1    3.000000           3.0   0.000000           2.000000   
3                2    2.000000           2.0   0.000000           2.000000   
4                3    3.000000           3.0   0.000000           1.000000   
5                4    2.000000           2.0   0.000000           1.000000   
6                5    1.000000           1.0   0.000000           1.000000   
7                6    2.000000           2.0   0.000000           1.000000   
8                7    1.000000           1.0   0.000000           1.000000   

   kill_streaks_median  kill_streaks_std  headshot_kills_mean  \
0                  2.0          0.909068             1.110561   
1                  0.0          0.000000             0.000000   
2   

In [None]:
cluster_stats
# 데이터 비율 너무 해치지 않는 선에서 나눠보기..
# 공격형(Aggressive): 군집 -1, 3, 4
# 비공격형(Non-Aggressive): 군집 5, 7
# 중립형 또는 제외(Neutral/Remove): 군집 1, 2, 6

Unnamed: 0,hdbscan_cluster,kills_mean,kills_median,kills_std,kill_streaks_mean,kill_streaks_median,kill_streaks_std,headshot_kills_mean,headshot_kills_median,headshot_kills_std,damage_dealt_mean,damage_dealt_median,damage_dealt_std,time_spent_in_combat_mean,time_spent_in_combat_median,time_spent_in_combat_std,count
0,-1,4.574037,4.0,2.800295,1.955006,2.0,0.909068,1.110561,1.0,1.067443,511.967187,445.4375,330.28687,776.61399,802.3255,444.976029,18180
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.078086,12.960001,62.651901,167.837965,29.963,278.928597,86018
2,1,3.0,3.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,291.702407,296.72485,80.480699,406.347411,333.229,291.357842,1263
3,2,2.0,2.0,0.0,2.0,2.0,0.0,0.272493,0.0,0.445306,204.769638,200.0,79.582423,273.806713,197.1825,248.652877,3490
4,3,3.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,314.627165,307.79114,87.160458,637.823231,625.15,376.70451,1633
5,4,2.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,236.660878,226.96914,86.744993,542.7109,459.4715,395.100799,3222
6,5,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,143.983808,124.442726,73.768588,375.018355,246.424,368.413621,5879
7,6,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,233.899341,220.676165,96.314128,516.347193,402.4235,406.817401,6670
8,7,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,141.639647,123.14812,82.622605,342.968568,205.4955,365.246286,24024


In [17]:
aggressive_clusters = [-1,3,4]  # 공격형 클러스터
non_aggressive_clusters = [5,7]  # 비공격형 클러스터

# 공격형 / 비공격형 라벨 추가
kill_data['aggression_label'] = kill_data['hdbscan_cluster'].apply(
    lambda x: 'Aggressive' if x in aggressive_clusters else 
              ('Non-Aggressive' if x in non_aggressive_clusters else 'Noise')
)

# 공격형 vs 비공격형 클러스터 통계
agg_stats = kill_data.groupby('aggression_label')[selected_columns].mean()

print("[INFO] 공격형 vs 비공격형 클러스터 평균 통계:")
print(agg_stats)

fig = px.bar(
    agg_stats.reset_index(),
    x='aggression_label',
    y=['kills', 'kill_streaks', 'headshot_kills', 'damage_dealt', 'time_spent_in_combat_sec'],
    barmode='group',
    title="공격형 vs 비공격형 클러스터 평균 비교",
    labels={'value': '평균값', 'variable': '변수'}
)
fig.update_layout(
    xaxis_title="클러스터 유형",
    yaxis_title="평균값",
    legend_title="변수"
)
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[INFO] 공격형 vs 비공격형 클러스터 평균 통계:
                     kills  kill_streaks  headshot_kills  damage_dealt  \
aggression_label                                                         
Aggressive        4.102409      1.753723        1.016366    459.469111   
Noise             0.247422      0.166008        0.009760     66.036716   
Non-Aggressive    1.000000      1.000000        0.196602    142.100515   

                  time_spent_in_combat_sec  
aggression_label                            
Aggressive                      734.057833  
Noise                           198.580927  
Non-Aggressive                  349.269631  


In [18]:
kill_data['aggression_label'].value_counts()

aggression_label
Noise             97441
Non-Aggressive    29903
Aggressive        23035
Name: count, dtype: int64

#### svm 적용

In [None]:
## 일반 decision tree와 같은 지도 학습 사용시 모두 1.0으로 결과가 나와. 파악에 어려움
## svm 기반 사용

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
import joblib

# -------------------------------------------
# 데이터 준비 및 전처리
# -------------------------------------------
selected_columns = ['kills', 'kill_streaks', 'headshot_kills', 'damage_dealt', 'time_spent_in_combat_sec', 'aggression_label']
data = kill_data[selected_columns]

# Aggressive와 Non-Aggressive 라벨만 필터링
data = data[data['aggression_label'].isin(['Aggressive', 'Non-Aggressive'])]

X = data.drop(columns=['aggression_label'])
y = data['aggression_label']
y = y.map({'Aggressive': 1, 'Non-Aggressive': 0})

# 데이터 스케일링
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test 분리 (stratify 적용)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------------------
# SVM 모델 학습 및 성능 비교
# -------------------------------------------
kernels = ['rbf', 'linear', 'poly', 'sigmoid']  
f1_scores = {}  

for kernel in kernels:
    svm_model = SVC(kernel=kernel, probability=True, random_state=42)
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores[kernel] = f1
    
    print(f"SVM with {kernel} kernel - F1 Score: {f1:.4f}")


SVM with rbf kernel - F1 Score: 0.9989
SVM with linear kernel - F1 Score: 0.9977
SVM with poly kernel - F1 Score: 0.9976
SVM with sigmoid kernel - F1 Score: 0.9285


In [20]:
best_kernel = max(f1_scores, key=f1_scores.get)
print(f"\nBest Kernel: {best_kernel} with F1 Score: {f1_scores[best_kernel]:.4f}")

best_model = SVC(kernel=best_kernel, probability=True, random_state=42)
best_model.fit(X_train, y_train)

joblib.dump(best_model, r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\confer\model\killer\svm_model.pkl')
joblib.dump(scaler, r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\confer\model\killer\scaler.pkl')
print(f"Best SVM model with {best_kernel} kernel and scaler saved.")


Best Kernel: rbf with F1 Score: 0.9989
Best SVM model with rbf kernel and scaler saved.


#### 통계적 검정

In [21]:
from scipy.stats import shapiro, levene, ttest_ind, mannwhitneyu

aggressive_data = data[data['aggression_label'] == 'Aggressive']
non_aggressive_data = data[data['aggression_label'] == 'Non-Aggressive']

selected_columns = ['kills', 'kill_streaks', 'headshot_kills', 'damage_dealt', 'time_spent_in_combat_sec']

# -------------------------------------------
# 정규성 검정 (Shapiro-Wilk test)
# -------------------------------------------
print("[INFO] 정규성 검정 결과:")
normality_results = {}
for column in selected_columns:
    stat_agg, p_agg = shapiro(aggressive_data[column])
    stat_nonagg, p_nonagg = shapiro(non_aggressive_data[column])
    normality_results[column] = (p_agg > 0.05, p_nonagg > 0.05)
    print(f"- {column}: Aggressive(p={p_agg:.4f}), Non-Aggressive(p={p_nonagg:.4f})")

# -------------------------------------------
# 등분산성 검정 (Levene’s test)
# -------------------------------------------
print("\n[INFO] 등분산성 검정 결과:")
homogeneity_results = {}
for column in selected_columns:
    stat, p = levene(aggressive_data[column], non_aggressive_data[column])
    homogeneity_results[column] = (p > 0.05)
    print(f"- {column}: Levene-stat={stat:.4f}, p={p:.4f}")

# -------------------------------------------
# 통계적 검정 수행
# -------------------------------------------
print("\n[INFO] 통계적 검정 결과:")
for column in selected_columns:
    is_normal_agg, is_normal_nonagg = normality_results[column]
    is_homogeneous = homogeneity_results[column]

    if is_normal_agg and is_normal_nonagg:  # 정규성 만족
        if is_homogeneous:  # 등분산성 만족
            stat, p = ttest_ind(aggressive_data[column], non_aggressive_data[column], equal_var=True)
            test_name = "Independent T-test"
        else:  # 등분산성 불만족
            stat, p = ttest_ind(aggressive_data[column], non_aggressive_data[column], equal_var=False)
            test_name = "Welch's T-test"
    else:  # 정규성 불만족
        stat, p = mannwhitneyu(aggressive_data[column], non_aggressive_data[column], alternative='two-sided')
        test_name = "Mann-Whitney U Test"

    print(f"- {column}: 검정 방법={test_name}, 검정 통계량={stat:.4f}, p-value={p:.4f}")


[INFO] 정규성 검정 결과:
- kills: Aggressive(p=0.0000), Non-Aggressive(p=1.0000)
- kill_streaks: Aggressive(p=0.0000), Non-Aggressive(p=1.0000)
- headshot_kills: Aggressive(p=0.0000), Non-Aggressive(p=0.0000)
- damage_dealt: Aggressive(p=0.0000), Non-Aggressive(p=0.0000)
- time_spent_in_combat_sec: Aggressive(p=0.0000), Non-Aggressive(p=0.0000)

[INFO] 등분산성 검정 결과:
- kills: Levene-stat=13125.9624, p=0.0000
- kill_streaks: Levene-stat=36677.8344, p=0.0000
- headshot_kills: Levene-stat=6936.3495, p=0.0000
- damage_dealt: Levene-stat=8081.9900, p=0.0000
- time_spent_in_combat_sec: Levene-stat=2566.1126, p=0.0000

[INFO] 통계적 검정 결과:
- kills: 검정 방법=Mann-Whitney U Test, 검정 통계량=683717143.5000, p-value=0.0000
- kill_streaks: 검정 방법=Mann-Whitney U Test, 검정 통계량=527563677.5000, p-value=0.0000
- headshot_kills: 검정 방법=Mann-Whitney U Test, 검정 통계량=531201573.5000, p-value=0.0000
- damage_dealt: 검정 방법=Mann-Whitney U Test, 검정 통계량=636943766.5000, p-value=0.0000
- time_spent_in_combat_sec: 검정 방법=Mann-Whitney U Test


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 23035.


scipy.stats.shapiro: Input data has range zero. The results may not be accurate.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 29903.



In [22]:
import plotly.express as px
import pandas as pd

# Aggressive 그룹 데이터
aggressive_plot_data = data[data['aggression_label'] == 'Aggressive'][selected_columns]
aggressive_plot_data['Group'] = 'Aggressive'

# Non-Aggressive 그룹 데이터
non_aggressive_plot_data = data[data['aggression_label'] == 'Non-Aggressive'][selected_columns]
non_aggressive_plot_data['Group'] = 'Non-Aggressive'

plot_data = pd.concat([aggressive_plot_data, non_aggressive_plot_data])

for feature in selected_columns[:-1]:  # 'aggression_label' 제외
    fig = px.box(
        plot_data, 
        x='Group', 
        y=feature, 
        color='Group', 
        title=f"{feature} - Aggressive vs Non-Aggressive",
        labels={'Group': 'Group', feature: feature},
        width=800, 
        height=500
    )
    fig.update_traces(boxmean=True)  
    fig.show()


#### inference 부분

In [23]:
import pandas as pd
import joblib

model_path = r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\confer\model\killer\svm_model.pkl'
scaler_path = r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\confer\model\killer\scaler.pkl'

svm_model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

df = pd.read_csv(r'C:\Users\inho0\OneDrive\문서\GitHub\Tobigs_PUBG\output\player_data_event_details_unique.csv')
df.drop_duplicates(inplace=True)

# 예측에 사용할 컬럼 설정 
selected_columns = ['kills', 'kill_streaks', 'headshot_kills', 'damage_dealt', 'time_spent_in_combat_sec']

In [44]:
new_data_row = df.iloc[7][selected_columns]
new_data_scaled = scaler.transform([new_data_row])
print(f"New Data (Scaled): {new_data_scaled}")

predicted_label = svm_model.predict(new_data_scaled)[0]

# 예측 확률
predicted_probabilities = svm_model.predict_proba(new_data_scaled)[0]

print(f"Predicted Label: {predicted_label}")
print(f"Prediction Probabilities: {predicted_probabilities}")

New Data (Scaled): [[ 0.5         0.          2.         -0.07719079  0.50589058]]
Predicted Label: 1
Prediction Probabilities: [3.0000009e-14 1.0000000e+00]



X does not have valid feature names, but RobustScaler was fitted with feature names



In [45]:
new_data_row 

kills                              2
kill_streaks                       1
headshot_kills                     2
damage_dealt                179.7499
time_spent_in_combat_sec      757.94
Name: 7, dtype: object