In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import polars as pl
import gc

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# Pandas display options
pd.set_option('display.max_columns', None)  # print every column
pd.set_option('display.width', None)  # lift the output width limit

# NOTE(review): the intent here is "show everything", but passing None may instead
# restore polars' default limits — confirm against the polars Config documentation.
pl.Config.set_tbl_cols(None)  # intended: print all columns
pl.Config.set_tbl_rows(None)  # intended: print all rows

polars.config.Config

# Functions

In [3]:
# Build the data-reading function
def read_data(num_f, num_s = 0, data_dir = 'datasets/train.parquet'):
    """Read a contiguous range of parquet partitions and stack them into one frame.

    Args:
        num_f: Exclusive upper bound of the partition ids to read.
        num_s: Inclusive lower bound of the partition ids to read (default 0).
        data_dir: Directory containing the ``partition_id=<i>`` entries.

    Returns:
        The result of ``pl.concat`` over the frames read for partitions
        ``num_s`` .. ``num_f - 1``, in that order.

    Raises:
        ValueError: If the requested range is empty (``num_f <= num_s``).
    """
    partition_ids = range(num_s, num_f)

    # Fail early with a clear message instead of handing an empty list to pl.concat.
    if len(partition_ids) == 0:
        raise ValueError(
            f"empty partition range: num_s={num_s!r}, num_f={num_f!r} "
            "(num_f must be greater than num_s)"
        )

    dfs = [pl.read_parquet(f'{data_dir}/partition_id={i}') for i in partition_ids]
    return pl.concat(dfs)

In [4]:
# other useful functions

def date_and_resp(df_tmp_total):
    """Return columns 83..91 of the frame with the ``date_id`` column attached.

    Args:
        df_tmp_total: Frame that supports ``[:, a:b]`` column slicing and has a
            ``date_id`` column.
            # NOTE(review): positions 83..91 are presumably the responder columns — confirm against the dataset schema.

    Returns:
        The sliced frame with the full ``date_id`` column (every row, not just
        the first one) added under the name ``date_id``.
    """
    # Wrap the whole date_id column as a literal expression named "date_id".
    date_column = pl.lit(df_tmp_total["date_id"]).alias("date_id")

    # Select by position, then attach the date column in one step.
    return df_tmp_total[:, 83:92].with_columns(date_column)

def feature_and_more(df_tmp_total, res = 'y'):
    """Return columns 4..82 of the frame, optionally with ``responder_6`` attached.

    Args:
        df_tmp_total: Frame that supports ``[:, a:b]`` column slicing; it must
            have a ``responder_6`` column when ``res == 'y'``.
            # NOTE(review): positions 4..82 are presumably the feature columns — confirm against the dataset schema.
        res: Pass ``'y'`` (default) to append the ``responder_6`` column; any
            other value returns the sliced columns only.

    Returns:
        The sliced frame, with ``responder_6`` added when requested.
    """
    features = df_tmp_total[:, 4:83]

    # Caller asked for the features alone.
    if res != 'y':
        return features

    # Attach the target column (responder_6) next to the features.
    target = pl.lit(df_tmp_total["responder_6"]).alias("responder_6")
    return features.with_columns(target)

def sample_df_small(df, n = 2000000, seed = None):
    """Draw a random subset of rows without replacement.

    Args:
        df: Frame exposing ``height`` and ``sample(n=..., with_replacement=..., seed=...)``.
        n: Maximum number of rows to draw (default 2,000,000).
        seed: Optional seed forwarded to ``df.sample`` so the draw can be
            reproduced; ``None`` keeps the draw unseeded.

    Returns:
        The sampled frame. When ``df`` has fewer than ``n`` rows, the request is
        capped at ``df.height`` rather than failing, because sampling without
        replacement cannot return more rows than exist.
    """
    rows_to_draw = min(n, df.height)
    df_sampled = df.sample(n=rows_to_draw, with_replacement=False, seed=seed)
    return df_sampled


# Dimension reduction functions

In [5]:
def reduce_PCA(data, n_components = None, explained_variance = 0.95):
    """Impute, min-max scale, and project ``data`` onto principal components.

    Args:
        data: polars DataFrame of numeric columns.
        n_components: Value handed to ``PCA(n_components=...)``. When ``None``,
            ``explained_variance`` is used instead.
        explained_variance: Fallback for ``n_components`` (default 0.95), i.e.
            the share of variance the kept components should explain.

    Returns:
        ``(reduced_data, pca_model)``: a polars DataFrame with columns
        ``PC1..PCk`` and the fitted ``PCA`` estimator. The fitted
        imputer/scaler pipeline is not returned.
    """
    # Missing values are filled with 0 (author's note: a column that is
    # entirely NaN would otherwise be dropped), then each column is min-max scaled.
    preprocessing = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MinMaxScaler()),
    ])
    scaled = pl.DataFrame(
        preprocessing.fit_transform(data.select(pl.col("*")).to_numpy())
    )

    # An explicit n_components wins; otherwise fall back to the variance target.
    target = explained_variance if n_components is None else n_components
    pca_model = PCA(n_components=target)
    projected = pca_model.fit_transform(scaled)

    n_reduced = projected.shape[1]
    print(f"원래 feature 수: {data.shape[1]}")
    print(f"줄어든 feature 수: {n_reduced}")
    print(f"설명된 분산 비율: {np.sum(pca_model.explained_variance_ratio_):.4f}")

    component_names = [f"PC{idx + 1}" for idx in range(n_reduced)]
    reduced_data = pl.DataFrame(projected, schema=component_names)

    return reduced_data, pca_model

In [6]:
# Using TSNE

from sklearn.manifold import TSNE

def reduce_T_sne(data, n_components = None, explained_variance = 0.95):
    """Impute, min-max scale, and embed ``data`` with t-SNE.

    NOTE: a later cell defines ``reduce_T_sne`` again; on a top-to-bottom run
    that later definition replaces this one.

    Args:
        data: polars DataFrame of numeric columns.
        n_components: Dimension of the embedding; must be a positive integer.
            ``None`` falls back to 2.
        explained_variance: Unused. Kept only so existing calls keep working:
            t-SNE has no explained-variance notion, and the estimator requires
            an integer ``n_components``, so a fraction such as 0.95 cannot be
            forwarded to it.

    Returns:
        ``(reduced_data, TSNE_model)``: a polars DataFrame with columns
        ``PC1..PCk`` and the fitted ``TSNE`` estimator.

    Raises:
        ValueError: If the embedding dimension is not a positive integer.
    """
    # Validate before any expensive work; t-SNE only accepts integer dimensions.
    target_dims = 2 if n_components is None else n_components
    try:
        is_valid = int(target_dims) == target_dims and target_dims >= 1
    except (TypeError, ValueError):
        is_valid = False
    if not is_valid:
        raise ValueError(
            f"n_components must be a positive integer, got {n_components!r}"
        )
    target_dims = int(target_dims)

    # Missing values are filled with 0 (author's note: a column that is
    # entirely NaN would otherwise be dropped), then each column is min-max scaled.
    transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MinMaxScaler())
    ])
    # The NumPy result is passed to the estimator directly; wrapping it in a
    # polars DataFrame first would only add another full copy of the data.
    data_preprocessed = transformer.fit_transform(data.select(pl.col("*")).to_numpy())

    TSNE_model = TSNE(n_components = target_dims)
    reduced_data = TSNE_model.fit_transform(data_preprocessed)

    print(f"원래 feature 수: {data.shape[1]}")
    print(f"줄어든 feature 수: {reduced_data.shape[1]}")
    # TSNE has no explained_variance_ratio_; report its KL divergence instead.
    print(f"KL divergence: {TSNE_model.kl_divergence_:.4f}")

    new_columns = [f"PC{i+1}" for i in range(reduced_data.shape[1])]
    reduced_data = pl.DataFrame(reduced_data, schema=new_columns)

    return reduced_data, TSNE_model

In [7]:
# Using TSNE는 알고리즘상으로 계산 비중이 상당히 크다. 따라서 3백만개의 데이터가 있는 현재 데이터에는 적합하지 않을 가능성이 매우 높다.
# 따라서 해당 코드는 사용하지 않기로 한다.

def reduce_T_sne(data, n_components = None, explained_variance = 2):
    """Impute, min-max scale, and embed ``data`` with t-SNE.

    t-SNE is expensive on large inputs; the notebook's own note says it is not
    used on the full (roughly three-million-row) sample.

    Args:
        data: polars DataFrame of numeric columns.
        n_components: Dimension of the embedding; must be a positive integer.
            When ``None``, ``explained_variance`` is used as the dimension.
        explained_variance: Despite its name, this is the fallback embedding
            dimension (default 2). The name is kept for backward compatibility.

    Returns:
        ``(reduced_data, TSNE_model)``: a polars DataFrame with columns
        ``PC1..PCk`` and the fitted ``TSNE`` estimator.

    Raises:
        ValueError: If the embedding dimension is not a positive integer.
    """
    # Validate before any expensive work; t-SNE only accepts integer dimensions.
    target_dims = explained_variance if n_components is None else n_components
    try:
        is_valid = int(target_dims) == target_dims and target_dims >= 1
    except (TypeError, ValueError):
        is_valid = False
    if not is_valid:
        raise ValueError(
            f"embedding dimension must be a positive integer, got {target_dims!r}"
        )
    target_dims = int(target_dims)

    # Missing values are filled with 0 (author's note: a column that is
    # entirely NaN would otherwise be dropped), then each column is min-max scaled.
    transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MinMaxScaler())
    ])
    # The NumPy result is passed to the estimator directly; wrapping it in a
    # polars DataFrame first would only add another full copy of the data.
    data_preprocessed = transformer.fit_transform(data.select(pl.col("*")).to_numpy())

    TSNE_model = TSNE(n_components = target_dims)
    reduced_data = TSNE_model.fit_transform(data_preprocessed)

    print(f"원래 feature 수: {data.shape[1]}")
    print(f"줄어든 feature 수: {reduced_data.shape[1]}")
    # TSNE has no explained_variance_ratio_; report its KL divergence instead.
    print(f"KL divergence: {TSNE_model.kl_divergence_:.4f}")

    new_columns = [f"PC{i+1}" for i in range(reduced_data.shape[1])]
    reduced_data = pl.DataFrame(reduced_data, schema=new_columns)

    return reduced_data, TSNE_model


# Disabled alternative t-SNE helper, kept as an inert string literal: nothing
# inside the triple quotes is executed. The example call at the end refers to a
# `data` variable that the string itself does not define.
'''
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import polars as pl
import numpy as np

def reduce_t_sne(data, n_components=2, perplexity=30, n_iter=1000):
    """
    t-SNE로 차원 축소를 수행하는 함수
    Args:
        data: Polars 데이터프레임
        n_components: 축소할 차원 수 (기본값: 2)
        perplexity: t-SNE Perplexity (기본값: 30)
        n_iter: t-SNE 반복 수 (기본값: 1000)
    Returns:
        reduced_data: 차원 축소된 데이터 (Polars 데이터프레임)
    """
    # Preprocessing
    transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', MinMaxScaler())
    ])
    
    data_preprocessed = transformer.fit_transform(data.select(pl.col("*")).to_numpy())

    # t-SNE 모델 생성 및 학습
    tsne_model = TSNE(n_components=n_components, perplexity=perplexity, n_iter=n_iter, random_state=42)
    reduced_data = tsne_model.fit_transform(data_preprocessed)

    # 결과 반환
    new_columns = [f"Dim{i+1}" for i in range(n_components)]
    reduced_data = pl.DataFrame(reduced_data, schema=new_columns)
    return reduced_data, tsne_model

# 예시 실행
# data: feature가 많은 Polars 데이터프레임
reduced_data, tsne_model = reduce_t_sne(data, n_components=2, perplexity=40, n_iter=500)
print(reduced_data)

'''

In [None]:
# using UMAP



# Data_Sampling
- let's try with random sampling
- 반복문을 통해 한 번에 한 partition을 불러온 다음, 3만 개의 rows를 랜덤 샘플 방식으로 추출해 오는 방식을 취해 보자.
- 단, 이렇게 할 경우 시계열성이 무너져서 시계열 접근방식이나 LSTM은 실행할 수 없다.

In [8]:
def sample_parquet(num_sample = 30000, num_partitions = 10, seed = None, data_dir = 'datasets/train.parquet'):
    """Randomly sample rows from each parquet partition and stack the samples.

    Partitions are read one at a time, so only one full partition is loaded
    per iteration.

    Args:
        num_sample: Rows to draw (without replacement) from every partition.
        num_partitions: How many partitions to read, starting at id 0
            (default 10).
        seed: Optional base seed for reproducible sampling. Partition ``i`` is
            sampled with ``seed + i`` so partitions do not reuse one seed.
            ``None`` keeps the draws unseeded.
        data_dir: Directory containing the ``partition_id=<i>`` entries.

    Returns:
        The result of ``pl.concat`` over the per-partition samples, in
        partition order.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError(f"num_partitions must be at least 1, got {num_partitions!r}")

    df_all_sampled = []
    for i in range(num_partitions):
        df = pl.read_parquet(f'{data_dir}/partition_id={i}')
        partition_seed = None if seed is None else seed + i
        df_sampled = df.sample(n=num_sample, with_replacement=False, seed=partition_seed)
        df_all_sampled.append(df_sampled)
        print(f"partition{i} sampled!")

    df_all = pl.concat(df_all_sampled)
    return df_all

In [9]:
# Draw 300,000 rows without replacement from each partition and stack them.
df_sampled = sample_parquet(num_sample=300000)

partition0 sampled!
partition1 sampled!
partition2 sampled!
partition3 sampled!
partition4 sampled!
partition5 sampled!
partition6 sampled!
partition7 sampled!
partition8 sampled!
partition9 sampled!


In [10]:
# Split the sampled data into the responder_6 target and the remaining columns.
# NOTE(review): "olny" in df_sampled_feat_olny is a typo for "only"; the name is
# kept as is because a later cell refers to it.

df_sampled_feat = feature_and_more(df_sampled)
resp_6 = df_sampled_feat.select('responder_6')
df_sampled_feat_olny = df_sampled_feat.drop('responder_6')

# Data Sampled By Time Series

In [11]:
def sampled_by_TS(moment1, moment2):
    """Placeholder for time-series-based sampling; not implemented yet (returns None).

    NOTE(review): moment1/moment2 presumably bound the time window to sample —
    confirm once this is implemented.
    """
    pass

# PCA_data

## PCA

In [12]:
# Reduce the feature columns with PCA using the function defaults
# (explained_variance=0.95, so the component count is chosen by variance share).
df_pca, pca_model = reduce_PCA(df_sampled_feat_olny)

원래 feature 수: 79
줄어든 feature 수: 28
설명된 분산 비율: 0.9542


In [13]:
# Attach the responder_6 column to the principal components and display the result.
df_pca_resp_6 = df_pca.with_columns(resp_6)
df_pca_resp_6

PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,responder_6
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32
-0.356643,-0.067905,0.209008,0.103273,0.108563,0.147282,-0.281857,0.322359,-0.115775,0.022237,0.040494,0.022748,0.153309,0.109429,-0.110651,-0.083268,0.100306,-0.029265,0.039157,0.056828,-0.155762,0.019608,0.063388,-0.011141,-0.052983,0.067132,0.014979,0.034129,0.119941
-0.352777,-0.175736,0.199725,-0.160107,0.320231,0.02052,-0.195037,-0.03186,0.018312,0.079096,-0.04025,0.349851,0.025613,0.022328,-0.019107,-0.035606,0.081854,-0.108289,-0.096115,-0.088518,-0.081819,0.031042,0.094518,0.042818,0.134221,0.050864,-0.040126,0.030615,-0.291114
-0.376572,-0.275761,0.074977,-0.070353,0.312885,-0.005162,0.178556,0.083469,0.044895,0.053178,-0.062003,0.105017,-0.056184,-0.002577,0.139597,-0.16174,0.06158,-0.038428,-0.132241,-0.0197,-0.02542,0.201979,0.085105,0.02468,0.162006,0.041641,-0.047091,0.000265,0.754146
-0.389938,-0.142194,-0.142429,0.207952,0.404282,-0.147955,-0.22875,0.158666,0.008201,-0.159175,-0.065407,0.242136,-0.07733,0.030689,0.054386,-0.044805,-0.031252,-0.053644,0.002184,-0.098723,0.152904,0.06854,0.133857,-0.029617,0.073803,0.070891,-0.038996,0.008772,-0.014246
-0.386075,-0.137231,-0.086615,0.329983,0.502147,-0.000287,-0.342556,-0.278396,0.149734,0.149537,0.202795,-0.193702,-0.014322,-0.137535,0.130038,-0.119676,-0.183665,-0.114191,0.0962,0.042946,0.092525,0.122085,0.126193,-0.056175,0.079443,0.10576,-0.033365,0.055994,-0.712278
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
-0.40783,-0.038691,-0.319135,0.503025,0.331767,0.341429,0.122802,-0.039321,0.011612,-0.1035,-0.024301,0.015979,-0.015552,-0.011253,0.01321,0.106313,-0.122896,-0.021341,0.046979,0.125806,0.066172,-0.095336,-0.154027,-0.037182,0.01663,0.064422,0.017412,0.023812,0.024239
-0.360587,0.687089,-0.442246,0.26192,-0.222201,0.071184,0.037852,0.175925,-0.081094,0.030677,-0.039238,-0.201755,0.026403,0.029595,0.022514,-0.171818,0.014152,0.10307,-0.039188,0.168388,0.090365,-0.058637,-0.043008,0.074713,0.004879,0.028209,0.037013,0.126674,-0.222975
-0.181074,0.373918,-0.366184,0.311654,-0.156321,0.049019,-0.297717,-0.052828,-0.077204,-0.043217,-0.028979,0.048437,0.036715,0.16044,0.202134,-0.063142,-0.172036,0.042926,0.062896,-0.074404,0.045129,0.03166,-0.013172,0.021488,0.036819,-0.011214,-0.024619,0.019688,0.021371
-0.327236,0.12627,0.229138,-0.39666,-0.000297,-0.299997,0.359684,-0.051906,0.089231,-0.136582,0.11652,-0.042038,0.088749,-0.021824,0.158958,0.09318,0.076699,-0.188767,0.021977,0.098728,-0.043276,-0.033572,0.099314,0.018685,0.056962,0.057293,-0.084113,-0.010024,-0.429312


## t-SNE

In [14]:
# df_tsne, tsne_model = reduce_T_sne(df_sampled_feat_olny)

In [None]:
# df_tsne_resp_6 = df_tsne.with_columns(resp_6)
# df_tsne_resp_6