# Import

In [None]:
pip install autogluon

In [None]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import log_loss, accuracy_score, f1_score
from datetime import datetime

In [None]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators; its string form is
            written to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [None]:
# Prefix prepended to every data file name read below.
# Google Drive location (kept for reference):
# path = '/content/drive/MyDrive/DACON-Flight-Delay/'
# Empty prefix: file names are used as-is, i.e. relative to the working directory.
path = ''

# csv to parquet
Parquet 형식은 메모리 효율적인 데이터 유형을 사용하므로 파일 용량을 줄이고 더 빠르게 작업할 수 있습니다.

In [None]:
# def csv_to_parquet(csv_path, save_name):
#     df = pd.read_csv(csv_path)
#     df.to_parquet(f'{save_name}.parquet')
#     del df
#     gc.collect()
#     print(save_name, 'Done.')

In [None]:
# csv_to_parquet(path+'train.csv', './train')
# csv_to_parquet(path+'test.csv', './test')

# Data Load

In [None]:
from autogluon.tabular import TabularPredictor, TabularDataset

In [None]:
# Load the train/test tables from parquet and the sample submission from CSV
# (its first column becomes the index). The CSV-to-parquet conversion cells
# above are commented out, so the parquet files must already exist under `path`.
train, test = (pd.read_parquet(f'{path}{split}.parquet') for split in ('train', 'test'))
sample_submission = pd.read_csv(f'{path}sample_submission.csv', index_col=0)

# Data Pre-Processing

In [None]:
# Impute missing values in every listed feature (the Delay label is excluded)
# with the most frequent value observed in the training data.
NaN_col = [
    'Origin_State',
    'Destination_State',
    'Airline',
    'Estimated_Departure_Time',
    'Estimated_Arrival_Time',
    'Carrier_Code(IATA)',
    'Carrier_ID(DOT)',
]

for col in NaN_col:
    # The fill value always comes from train, so test is imputed with the
    # same statistic and never with its own distribution.
    train_mode = train[col].mode()[0]
    for frame in (train, test):
        if col in frame.columns:
            frame[col] = frame[col].fillna(train_mode)
print('Done.')

Done.


In [None]:
# Convert the qualitative (categorical) columns to integer codes.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for col in qual_col:
    le = LabelEncoder()
    # Fit on the training values only, then encode the training column.
    train[col] = le.fit_transform(train[col])

    # Labels that occur only in the test set get new codes appended after the
    # training classes, in sorted order. A single set difference replaces the
    # original per-label membership scan + np.append, which was quadratic in
    # the number of distinct labels; the resulting codes are the same.
    # NOTE(review): after appending, classes_ is no longer globally sorted
    # (same as before this change) — confirm transform tolerates that for the
    # dtypes of these columns.
    unseen = np.setdiff1d(np.unique(test[col]), le.classes_)
    if unseen.size:
        le.classes_ = np.concatenate([le.classes_, unseen])
    test[col] = le.transform(test[col])
print('Done.')

Done.


In [None]:
# Set the unlabeled rows aside, then remove them from the training data.
nolabel = train[train['Delay'].isnull()]
# dropna() removes rows with a missing value in ANY column, not only Delay.
# .copy() makes `train` an independent frame, so adding columns to it later
# no longer triggers pandas' SettingWithCopyWarning.
train = train.dropna().copy()
nolabel

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,


In [None]:
# Map each class name (a sample_submission column) to its column position.
column_number = {column: position for position, column in enumerate(sample_submission.columns)}


def to_number(x, dic):
    """Return the value stored under key `x` in `dic` (KeyError if the key is absent)."""
    return dic[x]


# Numeric label: position of each row's Delay class among the submission columns.
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))
print('Done.')

Done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))


In [None]:
def time_data_preprocess(x):
    """Parse an HHMM clock value (e.g. 5, 130.0, 2359) into a datetime.

    The value is zero-padded to four digits before parsing. Without padding,
    strptime reads "130" as hour 13, minute 0 instead of 01:30, so every value
    in 100..235 was mis-parsed.

    Args:
        x: Number whose integer part encodes HHMM, with hour 0-23 and minute 0-59.

    Returns:
        datetime at the parsed time of day (strptime's default date).

    Raises:
        ValueError: if the value is not a valid HHMM time (e.g. 2400) or is NaN.
    """
    return datetime.strptime(f"{int(x):04d}", "%H%M")


def preprocessing(df):
    """Build the model feature frame from a raw train/test frame.

    Drops identifier and unused columns, derives time features from the
    scheduled departure/arrival times, and casts Month / Day_of_Month to str.
    The input frame is not modified.

    Args:
        df: Frame containing at least ID, Cancelled, Diverted, Origin_Airport_ID,
            Destination_Airport_ID, Carrier_Code(IATA), Estimated_Departure_Time,
            Estimated_Arrival_Time, Distance, Month and Day_of_Month. Delay and
            Delay_num are dropped when present.

    Returns:
        New DataFrame with the added columns Departure_Time, Arrival_Time,
        Flight_Time (seconds) and Time_Per_Distance.
    """
    dropped = ['ID', 'Cancelled', 'Diverted', 'Origin_Airport_ID',
               'Destination_Airport_ID', 'Carrier_Code(IATA)']
    if 'Delay' in df.columns:
        dropped.append('Delay')
        # As before, Delay_num is only dropped when Delay is present as well.
        if 'Delay_num' in df.columns:
            dropped.append('Delay_num')
    df_x = df.drop(columns=dropped)

    time_cols = ['Estimated_Arrival_Time', 'Estimated_Departure_Time']
    # 2400 is not parseable as %H%M; treat it as 0000 while deriving features.
    for col in time_cols:
        df_x.loc[df_x[col] == 2400, col] = 0

    df_x['Departure_Time'] = df_x['Estimated_Departure_Time'].apply(time_data_preprocess)
    df_x['Arrival_Time'] = df_x['Estimated_Arrival_Time'].apply(time_data_preprocess)
    # `.seconds` of a negative difference wraps around 24h, so an arrival that
    # is earlier on the clock than the departure counts as next-day arrival.
    df_x['Flight_Time'] = (df_x['Arrival_Time'] - df_x['Departure_Time']).apply(lambda delta: int(delta.seconds))
    # A Distance of 0 yields inf/NaN here.
    df_x['Time_Per_Distance'] = df_x['Flight_Time'] / df_x['Distance']

    # Restore the 2400 encoding; values that were originally 0 also become 2400.
    for col in time_cols:
        df_x.loc[df_x[col] == 0, col] = 2400

    df_x['Month'] = df_x['Month'].astype(str)
    df_x['Day_of_Month'] = df_x['Day_of_Month'].astype(str)
    return df_x

In [None]:
# The label comes straight from the training frame; the feature frames are
# produced by preprocessing(), which drops the label columns.
train_y = train['Delay_num']
train_x, test_x = preprocessing(train), preprocessing(test)
# nolabel = preprocessing(nolabel)

In [None]:
# Down-sample label 0 so that it makes up `x` percent of the training set.
x = 50  # target share of label 0 in percent (must be below 100)
train_x['Delay_num'] = train_y
label0 = train_x[train_x['Delay_num'] == 0]
label1 = train_x[train_x['Delay_num'] == 1]
# Solve zero / (zero + len(label1)) == x / 100 for the number of label-0 rows.
# The label-1 count is taken from the data instead of the former hard-coded
# 45000 (identical on the current data, which has 45000 label-1 rows).
zero = int((len(label1) * x) / (100 - x))
label0 = label0[:zero]  # keeps the first `zero` rows in their existing order
# Fixed random_state: re-running this cell reproduces the same shuffle
# regardless of how much global RNG state has been consumed before.
new = pd.concat([label0, label1]).sample(frac=1, random_state=42).reset_index(drop=True)
a = np.count_nonzero(new['Delay_num'] == 0)
print(a, len(new)-a)
print(f"label 0: {a/len(new):.6f}, label1: {(len(new)-a)/len(new):.6f}")
new_x = new.drop(['Delay_num'], axis=1)
new_y = new['Delay_num']

45000 45000
label 0: 0.500000, label1: 0.500000


# Classification Model Fit

In [None]:
# Train on the balanced frame `new` (features plus the Delay_num label) as a
# binary classifier scored by log loss, using the 'best_quality' preset.
predictor = TabularPredictor(
    label='Delay_num',
    eval_metric='log_loss',
    problem_type='binary',
).fit(train_data=new, presets=['best_quality'])

No path specified. Models will be saved in: "AutogluonModels/ag-20230510_010335/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230510_010335/"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sat Dec 10 16:00:40 UTC 2022
Train Data Rows:    90000
Train Data Columns: 16
Label Column: Delay_num
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11129.2 MB
	Train Data (Original)  Memory Usage: 20.61 MB (0.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generat

In [None]:
# Show the leaderboard of the fitted models (run trained with the 'best_quality' preset).
predictor.leaderboard(silent=True) # best quality

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.647039,27.678768,1829.62219,0.007227,6.859321,2,True,14
1,CatBoost_BAG_L1,-0.648148,0.483356,1047.874198,0.483356,1047.874198,1,True,7
2,LightGBMXT_BAG_L1,-0.651846,13.979406,126.956305,13.979406,126.956305,1,True,3
3,LightGBMLarge_BAG_L1,-0.652101,4.656159,90.884311,4.656159,90.884311,1,True,13
4,LightGBM_BAG_L1,-0.652918,3.927783,71.589757,3.927783,71.589757,1,True,4
5,XGBoost_BAG_L1,-0.65313,2.069248,448.757021,2.069248,448.757021,1,True,11
6,ExtraTreesEntr_BAG_L1,-0.663437,5.086632,36.483775,5.086632,36.483775,1,True,9
7,NeuralNetFastAI_BAG_L1,-0.664286,5.808836,1033.224294,5.808836,1033.224294,1,True,10
8,ExtraTreesGini_BAG_L1,-0.664441,4.60025,33.374221,4.60025,33.374221,1,True,8
9,NeuralNetTorch_BAG_L1,-0.666237,1.078776,700.446002,1.078776,700.446002,1,True,12


# Inference

In [None]:
# Class probabilities for the test features; the submission step below reads
# the columns labelled 0 and 1 from this result.
pred = predictor.predict_proba(test_x)

# Submit

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('mode.chained_assignment', None)  # default='warn'
# Work on a copy so the sample_submission template stays untouched.
submission = sample_submission.copy()

In [None]:
# Rename the probability columns: class 0 -> 'a', class 1 -> 'b'.
Pred = pred.rename({0:'a', 1:'b'}, axis=1)
# Assign whole columns by position. `submission` is indexed by ID while `Pred`
# keeps the test frame's index, so the values go in as plain arrays to avoid
# index alignment. Direct column assignment replaces the chained
# `submission[col][:] = ...`, which writes through a temporary Series and has
# no effect on `submission` under pandas copy-on-write.
submission['Not_Delayed'] = Pred['a'].to_numpy()
submission['Delayed'] = Pred['b'].to_numpy()

In [None]:
# Sanity check before saving: every submitted probability must equal the
# model's prediction at the same row position. The comparison is vectorized
# and sized from the data, replacing a Python loop over a hard-coded
# 1,000,000 rows that used integer lookups on an ID-indexed Series.
if len(submission) != len(Pred):
    raise ValueError(
        f"submission has {len(submission)} rows but Pred has {len(Pred)}"
    )
not_delayed_differs = submission['Not_Delayed'].to_numpy() != Pred['a'].to_numpy()
delayed_differs = submission['Delayed'].to_numpy() != Pred['b'].to_numpy()
# c = number of rows where either probability differs (one count per row).
c = int(np.count_nonzero(not_delayed_differs | delayed_differs))
for _ in range(c):
    print("다름")
# Only write the file when the submission matches the predictions exactly.
if c==0:
    submission.to_csv('preprocessing_submission.csv', index=True)
print(c)

0


In [None]:
# Display the final submission table (ID index, Not_Delayed / Delayed probabilities).
submission

Unnamed: 0_level_0,Not_Delayed,Delayed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
TEST_000000,0.527546,0.472454
TEST_000001,0.400429,0.599571
TEST_000002,0.426276,0.573724
TEST_000003,0.265933,0.734067
TEST_000004,0.185770,0.814230
...,...,...
TEST_999995,0.306009,0.693991
TEST_999996,0.633605,0.366395
TEST_999997,0.356001,0.643999
TEST_999998,0.302450,0.697550
