In [63]:
import pandas as pd
import numpy as np
import os
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier

# Seed 고정 함수
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# CSV를 Parquet으로 변환하여 메모리 효율성 증대
# def csv_to_parquet(csv_path, save_name):
#     df = pd.read_csv(csv_path)
#     df.to_parquet(f'./{save_name}.parquet')
#     del df
#     gc.collect()
#     print(save_name, 'Done.')

# 데이터 변환
# csv_to_parquet('./train.csv', 'train')
# csv_to_parquet('./test.csv', 'test')

# 데이터 로드
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

In [64]:
# 데이터 전처리
# Carrier_Code(IATA) → Airline 복구
airline_mapping = train[['Carrier_Code(IATA)', 'Airline']].dropna().drop_duplicates().set_index('Carrier_Code(IATA)')['Airline'].to_dict()
train['Airline'] = train['Carrier_Code(IATA)'].map(airline_mapping).fillna(train['Airline'])
test['Airline'] = test['Carrier_Code(IATA)'].map(airline_mapping).fillna(test['Airline'])

# 변환 결과 확인
print("Airline 복구 후 결측치 수:")
print("Train Airline 결측치:", train['Airline'].isna().sum())
print("Test Airline 결측치:", test['Airline'].isna().sum())

print("Airline Mapping Dictionary:")
print(pd.Series(airline_mapping))


Airline 복구 후 결측치 수:
Train Airline 결측치: 11864
Test Airline 결측치: 26893
Airline Mapping Dictionary:
WN      Southwest Airlines Co.
UA                    Cape Air
AA       Trans States Airlines
DL    ExpressJet Airlines Inc.
AS      Peninsula Airways Inc.
B6             JetBlue Airways
NK            Spirit Air Lines
F9      Frontier Airlines Inc.
HA        Empire Airlines Inc.
G4               Allegiant Air
VX              Virgin America
dtype: object


In [65]:
# Airline → Carrier_ID(DOT) 복구
carrier_mapping = train[['Airline', 'Carrier_ID(DOT)']].dropna().drop_duplicates()
carrier_mapping = carrier_mapping.groupby('Airline').first()  # Airline 기준으로 첫 번째 Carrier_ID(DOT)를 선택

# Carrier_ID(DOT) 복구
train['Carrier_ID(DOT)'] = train['Airline'].map(carrier_mapping['Carrier_ID(DOT)']).fillna(train['Carrier_ID(DOT)'])
test['Carrier_ID(DOT)'] = test['Airline'].map(carrier_mapping['Carrier_ID(DOT)']).fillna(test['Carrier_ID(DOT)'])

# 변환 결과 확인
print("Carrier_ID(DOT) 복구 후 결측치 수:")
print("Train Carrier_ID(DOT) 결측치:", train['Carrier_ID(DOT)'].isna().sum())
print("Test Carrier_ID(DOT) 결측치:", test['Carrier_ID(DOT)'].isna().sum())

print("Carrier Mapping Dictionary:")
print(carrier_mapping)

Carrier_ID(DOT) 복구 후 결측치 수:
Train Carrier_ID(DOT) 결측치: 1273
Test Carrier_ID(DOT) 결측치: 2933
Carrier Mapping Dictionary:
                                           Carrier_ID(DOT)
Airline                                                   
Air Wisconsin Airlines Corp                        20046.0
Alaska Airlines Inc.                               19930.0
Allegiant Air                                      20368.0
American Airlines Inc.                             19805.0
Cape Air                                           20304.0
Capital Cargo International                        20427.0
Comair Inc.                                        20397.0
Commutair Aka Champlain Enterprises, Inc.          20445.0
Compass Airlines                                   21167.0
Delta Air Lines Inc.                               19790.0
Empire Airlines Inc.                               19690.0
Endeavor Air Inc.                                  20363.0
Envoy Air                                          2039

In [66]:
# K-Means를 활용한 EDT/EAT 복구
features_for_kmeans = ['Origin_Airport_ID', 'Destination_Airport_ID', 'Month', 'Day_of_Month', 'Distance']

kmeans_data = train.dropna(subset=['Estimated_Departure_Time', 'Estimated_Arrival_Time'])[features_for_kmeans].copy()
kmeans_target = train.dropna(subset=['Estimated_Departure_Time', 'Estimated_Arrival_Time']).copy()

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(kmeans_data)

# 클러스터와 평균 비행 시간 계산
kmeans_target['Cluster'] = kmeans.predict(kmeans_data)
kmeans_target['Flight_Time'] = (
    kmeans_target['Estimated_Arrival_Time'] - kmeans_target['Estimated_Departure_Time']
) % 1440
cluster_time_mapping = kmeans_target.groupby('Cluster')['Flight_Time'].mean().to_dict()

# 복구 함수
def recover_time(row, col):
    if pd.isna(row['Estimated_Departure_Time']) or pd.isna(row['Estimated_Arrival_Time']):
        cluster = kmeans.predict(pd.DataFrame([row[features_for_kmeans].values], columns=features_for_kmeans))[0]
        avg_time = cluster_time_mapping.get(cluster, np.nan)
        if col == 'Estimated_Departure_Time' and not pd.isna(row['Estimated_Arrival_Time']):
            return (row['Estimated_Arrival_Time'] - avg_time) % 1440
        elif col == 'Estimated_Arrival_Time' and not pd.isna(row['Estimated_Departure_Time']):
            return (row['Estimated_Departure_Time'] + avg_time) % 1440
    return row[col]

# EDT와 EAT 복구
for col in ['Estimated_Departure_Time', 'Estimated_Arrival_Time']:
    train[col] = train.apply(lambda row: recover_time(row, col), axis=1)
    test[col] = test.apply(lambda row: recover_time(row, col), axis=1)


In [67]:
# 불필요한 특성 제거
columns_to_drop = ['Cancelled', 'Diverted', 'Origin_Airport', 'Destination_Airport', 'Carrier_Code(IATA)', 'Airline', 'Origin_State', 'Destination_State']
train = train.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

In [68]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  988312 non-null   float64
 4   Estimated_Arrival_Time    988312 non-null   float64
 5   Origin_Airport_ID         1000000 non-null  int64  
 6   Destination_Airport_ID    1000000 non-null  int64  
 7   Distance                  1000000 non-null  float64
 8   Carrier_ID(DOT)           998727 non-null   float64
 9   Tail_Number               1000000 non-null  object 
 10  Delay                     255001 non-null   object 
dtypes: float64(4), int64(4), object(3)
memory usage: 83.9+ MB


In [69]:
# 남은 결측치 처리
numeric_cols = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
train[numeric_cols] = train[numeric_cols].fillna(train[numeric_cols].mean())
test[numeric_cols] = test[numeric_cols].fillna(test[numeric_cols].mean())

train['Carrier_ID(DOT)'] = train['Carrier_ID(DOT)'].fillna(train['Carrier_ID(DOT)'].mode()[0])
test['Carrier_ID(DOT)'] = test['Carrier_ID(DOT)'].fillna(train['Carrier_ID(DOT)'].mode()[0])

In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  1000000 non-null  float64
 4   Estimated_Arrival_Time    1000000 non-null  float64
 5   Origin_Airport_ID         1000000 non-null  int64  
 6   Destination_Airport_ID    1000000 non-null  int64  
 7   Distance                  1000000 non-null  float64
 8   Carrier_ID(DOT)           1000000 non-null  float64
 9   Tail_Number               1000000 non-null  object 
 10  Delay                     255001 non-null   object 
dtypes: float64(4), int64(4), object(3)
memory usage: 83.9+ MB


In [71]:
labeled_data = train[~train['Delay'].isnull()]
unlabeled_data = train[train['Delay'].isnull()]

X_labeled = labeled_data.drop(columns=['Delay'])
y_labeled = labeled_data['Delay']
X_unlabeled = unlabeled_data.drop(columns=['Delay'])

# 범주형 데이터 타입 변환 (cat_features는 정수형 및 문자열 허용)
for col in ['ID', 'Tail_Number', 'Carrier_ID(DOT)']:
    X_labeled[col] = X_labeled[col].astype(str)
    X_unlabeled[col] = X_unlabeled[col].astype(str)
    test[col] = test[col].astype(str)

le_delay = LabelEncoder()
y_labeled = le_delay.fit_transform(y_labeled)

# 클래스 간 불균형 문제 해결을 위해 가중치 부여
class_counts = pd.Series(y_labeled).value_counts()
balanced_weight = [1.0, 1.0]  # 균등 가중치

cat_features = ['ID', 'Tail_Number', 'Carrier_ID(DOT)', 'Origin_Airport_ID', 'Destination_Airport_ID']
teacher_model = CatBoostClassifier(
    iterations=500, learning_rate=0.05, depth=6, cat_features=cat_features,
    loss_function='MultiClass', class_weights=balanced_weight,
    random_seed=42, verbose=100
)
student_model = CatBoostClassifier(
    iterations=500, learning_rate=0.05, depth=6, cat_features=cat_features,
    loss_function='MultiClass', class_weights=balanced_weight,
    random_seed=42, verbose=100
)

# Teacher 모델 초기 학습
teacher_model.fit(X_labeled, y_labeled)

# Pseudo-labeling 및 Meta Pseudo Labeling 과정
max_iter = 5
confidence_threshold = 0.9  # 임계값 높임
for iteration in range(max_iter):
    print(f"\nIteration {iteration + 1}")
    
    # Teacher가 pseudo-label 생성
    pseudo_probs = teacher_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(pseudo_probs, axis=1)
    
    # Confidence Threshold 적용
    high_confidence_indices = np.max(pseudo_probs, axis=1) > confidence_threshold
    print(f"High confidence pseudo-labels: {high_confidence_indices.sum()}")
    if high_confidence_indices.sum() == 0:
        print("No high-confidence samples in this iteration.")
        break
    
    # 레이블 있는 데이터 + high-confidence pseudo label 데이터를 결합하여 Student 학습
    X_combined = pd.concat([X_labeled, X_unlabeled.iloc[high_confidence_indices]], axis=0)
    y_combined = np.concatenate([y_labeled, pseudo_labels[high_confidence_indices]])
    student_model.fit(X_combined, y_combined)
    
    # Teacher 업데이트
    teacher_model = student_model

0:	learn: 0.6721048	total: 221ms	remaining: 1m 50s
100:	learn: 0.4448594	total: 18.3s	remaining: 1m 12s
200:	learn: 0.4414332	total: 37.6s	remaining: 55.9s
300:	learn: 0.4393413	total: 55.8s	remaining: 36.9s
400:	learn: 0.4376048	total: 1m 15s	remaining: 18.6s
499:	learn: 0.4362768	total: 1m 34s	remaining: 0us

Iteration 1
High confidence pseudo-labels: 110049
0:	learn: 0.6641374	total: 319ms	remaining: 2m 39s
100:	learn: 0.3277006	total: 27.5s	remaining: 1m 48s
200:	learn: 0.3232787	total: 54.4s	remaining: 1m 20s
300:	learn: 0.3211132	total: 1m 21s	remaining: 53.8s
400:	learn: 0.3197303	total: 1m 49s	remaining: 26.9s
499:	learn: 0.3187380	total: 2m 16s	remaining: 0us

Iteration 2
High confidence pseudo-labels: 218205
0:	learn: 0.6599741	total: 356ms	remaining: 2m 57s
100:	learn: 0.2677816	total: 36.2s	remaining: 2m 22s
200:	learn: 0.2638941	total: 1m 11s	remaining: 1m 46s
300:	learn: 0.2620727	total: 1m 47s	remaining: 1m 10s
400:	learn: 0.2609973	total: 2m 22s	remaining: 35.3s
499:	le

In [72]:
# 테스트 데이터 예측
test_probs = student_model.predict_proba(test)
test_probs = test_probs / test_probs.sum(axis=1, keepdims=True)

# 제출 파일 생성
submission = pd.DataFrame({
    'ID': test['ID'],
    'Not_Delayed': test_probs[:, 0],
    'Delayed': test_probs[:, 1]
})
submission.to_csv('submission.csv', index=False)