In [None]:
import datetime
import re
from datetime import date, timedelta

# Excel export engine used by pandas
import openpyxl
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery import job


# GCP project passed to the BigQuery client below.
PROJECT = "ballosodeuk"
# BigQuery client for that project; no explicit credentials are passed here.
bq = bigquery.Client(project=PROJECT)


# Rows of `dm.agg_user_category_rank` with a non-null power_1_2 (alias a),
# left-joined on user_id to:
#   b - `dm.agg_user_churn_daily` rows where cur_date is yesterday
#   c - `dm.agg_user_rfm_daily` rows where register_dt is yesterday
# Because both joins are LEFT joins, users without a matching b/c row keep
# their `a` columns and get NULL for the joined columns.
# NOTE(review): current_shoji, frequency, recency, monetary and grade are
# selected unqualified and are not in b's projection, so they presumably come
# from c (`select *`) - confirm against the table schemas.
query = """
select a.*, survival_prob, is_churn, current_shoji, frequency, recency, monetary, grade, loyalty_segment, days_since_last_activity
from
  (
    select * from `dm.agg_user_category_rank`
    where power_1_2 is not null
  ) a
left join
  (
    select user_id, survival_prob, is_churn, loyalty_segment, days_since_last_activity
    from `dm.agg_user_churn_daily`
    where cur_date = current_date() - 1
  ) b on a.user_id = b.user_id
left join 
  (
    select *
    from `dm.agg_user_rfm_daily`
    where register_dt = current_date() -1
  ) c on a.user_id = c.user_id
"""

# Run the query and load the whole result set into a pandas DataFrame.
df = bq.query(query).to_dataframe()

In [94]:
# df.to_excel('seg_power.xlsx', engine='openpyxl', encoding='utf-8-sig', index=False)

In [None]:
# Distribution of the raw churn flag as loaded (before any null filling).
df['is_churn'].value_counts()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Preprocessing and null handling.
#
# The load query LEFT-joins the churn and RFM snapshots, so users without a
# snapshot row arrive with NULLs in those columns. This cell replaces the
# NULLs with explicit sentinel values and fixes the dtypes used downstream.
today = date.today()

# Turn the last-order date into "days since last order". The column is
# overwritten in place, so the dtype guard keeps this cell idempotent: once
# the column holds day counts, a re-run must not re-parse those numbers as
# timestamps.
if not pd.api.types.is_numeric_dtype(df['latest_order_dt']):
    last_order_date = pd.to_datetime(df['latest_order_dt']).dt.date
    df['latest_order_dt'] = (today - last_order_date).dt.days
df.info()


power_cols = [col for col in df.columns if col.startswith('power')]
ranking_cols = [col for col in df.columns if col.startswith('ranking')]

# Every power_* column (power_1_2 included): missing score -> 0.0.
df[power_cols] = df[power_cols].fillna(0).astype(float)

# Numeric features mapped to the value that stands in for "no data".
# days_since_last_activity uses a large sentinel instead of 0 so that a
# missing snapshot does not read as "active today".
numeric_fill = {
    'current_shoji': 0,
    'frequency': 0,
    'recency': 0,
    'monetary': 0,
    'survival_prob': 0,
    'days_since_last_activity': 9999,
}
for col, fill_value in numeric_fill.items():
    df[col] = df[col].fillna(fill_value).astype(float)

# Label columns: users without a grade / loyalty segment are tagged "DEAD".
for col in ('grade', 'loyalty_segment'):
    df[col] = df[col].fillna("DEAD").astype(str)

# Users without a churn snapshot are treated as churned.
df['is_churn'] = df['is_churn'].fillna(True)



In [101]:
# Sparse encoding.
#
# Each (ranking_<level>_<slot>, power_<level>_<slot>) column pair is spread
# into one column per distinct ranking value: a row carries its power score
# in the column of the value it holds in that slot and 0 in the others.


def _spread_power_by_ranking(frame, level, slot):
    """Pivot one ranking/power column pair into per-category power columns.

    Args:
        frame: DataFrame containing ``ranking_{level}_{slot}`` and
            ``power_{level}_{slot}``.
        level: Level digit used in the column names (1 or 2 in this notebook).
        slot: Slot number used in the column names.

    Returns:
        DataFrame with one column per distinct non-null ranking value, named
        ``{value}_{level}_{slot}``, holding the row's power score (0 where the
        row has a different value). Rows whose ranking is null contribute to
        no column. The result is indexed by row position (0..n-1), because
        ``melt`` discards the input index.
    """
    rank_col = f'ranking_{level}_{slot}'
    power_col = f'power_{level}_{slot}'

    pair = frame[[rank_col, power_col]].copy()
    # Null rankings get a placeholder so they survive the pivot; the
    # placeholder column is dropped again below.
    pair[rank_col] = pair[rank_col].fillna('unknown')

    wide = (
        pair.melt(id_vars=[rank_col], value_vars=[power_col])
        .reset_index()
        .pivot(index='index', columns=rank_col, values='value')
        .fillna(0)
    )

    if 'unknown' in wide.columns:
        wide = wide.drop('unknown', axis=1)

    wide.columns = [f'{col}_{level}_{slot}' for col in wide.columns]
    return wide


power_cols = [col for col in df.columns.tolist() if col.startswith('power')]
ranking_cols = [col for col in df.columns.tolist() if col.startswith('ranking')]
tst = df[ranking_cols+power_cols].copy()

# Number of ranking/power slots per level: ranking_1_1..5 and ranking_2_1..10.
slots_per_level = {1: 5, 2: 10}

pivot_dfs = []
for level, n_slots in slots_per_level.items():
    for i in range(1, n_slots + 1):
        pivot_df = _spread_power_by_ranking(tst, level, i)
        pivot_dfs.append(pivot_df)

# Combine all pivot tables side by side on their shared positional index.
# NOTE: this lines up with `df` only while `df` keeps its default 0..n-1 index.
final_df = pd.concat(pivot_dfs, axis=1)

In [None]:
# Display the combined sparse category-power matrix built in the previous cell.
final_df

In [69]:
# Split the loaded frame into the list-valued columns and the per-user
# scalar features that feed the model table.
list_feature_cols = ['depth1_list', 'depth1plus_list', 'depth3_list']
scalar_feature_cols = [
    'user_id',
    'latest_order_dt',
    'total_purchase_count',
    'survival_prob',
    'is_churn',
    'current_shoji',
    'frequency',
    'recency',
    'monetary',
    'grade',
    'loyalty_segment',
    'days_since_last_activity',
]

df_list = df[list_feature_cols]
df_num = df[scalar_feature_cols]



In [None]:
# Attach the sparse category-power columns to the scalar features by row
# index (inner join, the pd.merge default), then show the grade distribution.
df_num_sp = pd.merge(df_num, final_df, left_index=True, right_index=True)
df_num_sp['grade'].value_counts()

In [None]:
# Drop rows whose grade is the 'DEAD' placeholder, then preview the result.
has_real_grade = df_num_sp['grade'] != 'DEAD'
df_num_sp = df_num_sp[has_real_grade]
df_num_sp.head()

## 모델

In [88]:
# Modelling imports, currently disabled:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import LabelEncoder

# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)





In [None]:
# Encoder / scaler setup, currently disabled:
# le = LabelEncoder()
# sc = StandardScaler()

# grade_order = {
#     'IRON': 0,    # lowest tier
#     'SILVER': 1,
#     'GOLD': 2,
#     'VIP': 3,     # highest tier
# }

# # Map each grade to its ordinal position
# df_num_sp['grade_encoded'] = df_num_sp['grade'].map(grade_order)

# Encode the churn flag as 0/1.
# - `assign` builds a new frame instead of writing a column into the filtered
#   `df_num_sp`, which avoids pandas' SettingWithCopyWarning.
# - `astype('int64')` is safe to re-run: True/False -> 1/0 and 1/0 -> 1/0,
#   whereas mapping through {True: 1, False: 0} a second time can turn the
#   already-encoded values into NaN.
# Nulls in is_churn are filled in the preprocessing cell, so the cast does
# not meet missing values.
df_num_sp = df_num_sp.assign(is_churn=df_num_sp['is_churn'].astype('int64'))

In [None]:

# Display the model table after churn encoding.
df_num_sp

In [None]:
# Value distribution of the `nan_1_4` column.
# NOTE(review): the name follows the sparse-encoding pattern
# `<ranking value>_1_4`, so ranking_1_4 presumably contains a literal 'nan'
# value that fillna('unknown') did not catch - confirm against the source data.
df_num_sp['nan_1_4'].value_counts()