In [2]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
def seed_everything(seed):
    """Seed the random number sources this notebook uses.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this likely affects child processes rather than the
    # running kernel -- confirm if hash ordering matters here.
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


seed_everything(42)  # fix the seed

In [4]:
# csv to parquet: stores the data in memory-efficient types -> smaller files, faster work
def csv_to_parquet(csv_path, save_name):
    """Read a CSV file and write it back out as ``./<save_name>.parquet``.

    Args:
        csv_path: Location of the CSV file to read.
        save_name: Base name (without extension) of the parquet file created
            in the current working directory; also printed when finished.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)
    # Drop the only reference to the frame and run a garbage-collection pass.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [7]:
# Convert the train and test CSVs to ./train.parquet and ./test.parquet.
for split_name in ('train', 'test'):
    csv_to_parquet(f'C:/Users/juyeon/machine_learning/Dacon/{split_name}.csv', split_name)

train Done.
test Done.


In [9]:
# Data Load
# Read the parquet copies written by csv_to_parquet; the sample submission is
# read from CSV with its first column used as the index.
train = pd.read_parquet(path='./train.parquet')
test = pd.read_parquet(path='./test.parquet')
sample_submission = pd.read_csv(
    'C:/Users/juyeon/machine_learning/Dacon/sample_submission.csv',
    index_col=0,
)

In [10]:
# Data Pre-Processing
# For the columns below (the label, Delay, is not among them), replace missing
# values with the most frequent value found in the training data.
NaN_col = [
    'Origin_State',
    'Destination_State',
    'Airline',
    'Estimated_Departure_Time',
    'Estimated_Arrival_Time',
    'Carrier_Code(IATA)',
    'Carrier_ID(DOT)',
]

for column_name in NaN_col:
    # mode() returns a Series of the most frequent value(s); take the first.
    most_frequent = train[column_name].mode()[0]
    train[column_name] = train[column_name].fillna(value=most_frequent)

    # The test frame is filled with the value computed from train.
    if column_name not in test.columns:
        continue
    test[column_name] = test[column_name].fillna(value=most_frequent)
print('Done.')

Done.


In [11]:
# Convert the qualitative (categorical) variables to integer codes
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    # Fit the encoder on the training column only, then encode that column.
    le = LabelEncoder().fit(train[i])
    train[i] = le.transform(train[i])

    # Labels present in test but not seen during fit are appended to the
    # encoder's classes so transform() has a code for them. setdiff1d returns
    # them sorted and de-duplicated, i.e. in the order a loop over
    # np.unique(test[i]) would visit them, but with a single pass and a single
    # array copy instead of one membership scan + one copy per test label.
    unseen_labels = np.setdiff1d(test[i].to_numpy(), le.classes_)
    if unseen_labels.size:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [12]:
# Remove the rows that have no label.
# NOTE: with these (default) settings dropna removes every row that has a
# missing value in ANY column, not only in the label column.
train = train.dropna(axis=0, how='any')

In [13]:
# Map every sample_submission column name to its positional index.
column_number = {column: i for i, column in enumerate(sample_submission.columns)}
    
def to_number(x, dic):
    """Look up ``x`` in ``dic`` and return the stored value.

    Args:
        x: Key to look up.
        dic: Mapping (or other subscriptable) indexed with ``x``.

    Returns:
        ``dic[x]``; a failed lookup propagates whatever ``dic`` raises.
    """
    number = dic[x]
    return number

# Store, for every row, the number that column_number assigns to its Delay
# value; to_number receives column_number as its second positional argument.
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))
print('Done.')

Done.


In [14]:
# Target: the numeric label. Features: everything except the ID and the two
# label columns (train) / the ID (test).
train_y = train['Delay_num']
train_x = train.drop(['ID', 'Delay', 'Delay_num'], axis=1)
test_x = test.drop(['ID'], axis=1)

In [15]:
# Classification Model Fit
# The forest is built with all-default hyper-parameters (no explicit
# random_state is passed).
clf = RandomForestClassifier()
clf.fit(X=train_x, y=train_y)

In [16]:
# Inference
# Per-class probabilities for each test row, from the fitted classifier.
y_pred = clf.predict_proba(X=test_x)

In [17]:
# Submit
# Wrap the predictions in a frame that reuses sample_submission's index and
# column labels, then write it to CSV with the index included.
# NOTE(review): assumes the column order of y_pred matches
# sample_submission.columns -- confirm against clf.classes_.
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('baseline_submission.csv', index=True)