<a href="https://colab.research.google.com/github/ssyeon2/Flight-Delay-Prediction/blob/main/Flight_Delay_predict(logistic_regression).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 기본 설정

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed once for the whole notebook

In [3]:
# Silence all Python warnings for the rest of the notebook.
import warnings
warnings.simplefilter('ignore')

### data 불러오기

In [7]:
def csv_to_parquet(csv_path, save_name):
    """Read the CSV at `csv_path` and write it to `./<save_name>.parquet`.

    The intermediate DataFrame is deleted and a garbage collection is forced
    before a "<save_name> Done." line is printed.
    """
    parquet_path = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(parquet_path)
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [10]:
# Convert the raw train/test CSVs on Drive into parquet files in the working directory.
csv_to_parquet('/content/drive/MyDrive/항공기/open/train.csv', 'train')
csv_to_parquet('/content/drive/MyDrive/항공기/open/test.csv', 'test')

train Done.
test Done.


In [12]:
# Load the parquet copies written by csv_to_parquet, plus the sample submission
# (its first CSV column becomes the index).
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
# NOTE(review): train.csv/test.csv were read from the `open/` subfolder, while this
# file is read from its parent folder — confirm the path is intentional.
sample_submission = pd.read_csv('/content/drive/MyDrive/항공기/sample_submission.csv', index_col = 0)

# 2.Data Pre-Processing

## 1) shape 확인

In [13]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

((1000000, 19), (1000000, 18))

## 2) 결측치 개수 확인

In [14]:
# Missing-value count per train column, then the same expressed as a fraction of all rows.
print(train.isna().sum())
print('----------------------------')
print(train.isna().sum() / train.shape[0])

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                109015
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64
----------------------------
ID                          0.000000
Month                       0.000000
Day_of_Month                0.000000
Estimated_Departure_Time    0.109019
Estimated_Arrival_Time      0.109040
Cancelled                   0.000000
Diverted                    0.000000
Origin_Airport              0.0000

In [15]:
# Name of the label column, and how many train rows have no label.
target = 'Delay'
train[target].isna().sum()

744999

## 3) 모두 같은 값을 가지는 열 제거

In [16]:
# Every value in both columns is 0 (see the counts below).
train['Cancelled'].value_counts(), train['Diverted'].value_counts()

(0    1000000
 Name: Cancelled, dtype: int64,
 0    1000000
 Name: Diverted, dtype: int64)

In [None]:
# Both columns hold a single value, so they are removed from train and test alike.
constant_cols = ['Cancelled', 'Diverted']
train = train.drop(columns=constant_cols)
test = test.drop(columns=constant_cols)

### 결측치 채우기

In [None]:
# Fill missing Airline values from other rows that share the same Tail_Number.
#
# The original loop compared against a frame `a` that is not defined anywhere
# before this cell (NameError on a fresh kernel) and scanned it row by row for
# every missing value. The lookup is now built from `train` itself: for each
# Tail_Number, the Airline of the first row where it is known.
# NOTE(review): assumes `a` was meant to be the rows of `train` with a known
# Airline — confirm.
airline_by_tail = (
    train.dropna(subset=['Airline', 'Tail_Number'])
    .drop_duplicates(subset='Tail_Number')
    .set_index('Tail_Number')['Airline']
)
# fillna() only touches missing entries; tail numbers without a known airline stay missing.
train['Airline'] = train['Airline'].fillna(train['Tail_Number'].map(airline_by_tail))

In [None]:
# Row count (via ID) per (origin state, origin airport ID, destination state, destination airport ID) combination.
train_state = train.groupby(by = ['Origin_State', 'Origin_Airport_ID', 'Destination_State', 'Destination_Airport_ID'], as_index = False)[['ID']].count()

In [None]:
# Fill missing categorical values in `train` from columns that identify them
# (airport ID -> state, carrier ID <-> airline), then impute what is left.
#
# Rewritten from row-by-row Python loops because the original cell
#   * contained its whole body twice; the second copy read the ID columns after
#     they had been dropped and dropped 'Cancelled'/'Diverted' a second time,
#     so it raised KeyError on a fresh run;
#   * started the Destination_State loops at 1, silently skipping the first row;
#   * left gaps in the index after dropping rows, which breaks any later code
#     that walks the frame with range(len(train)) or re-joins on the index.
from sklearn.impute import KNNImputer

state_keys = ['Origin_State', 'Origin_Airport_ID', 'Destination_State', 'Destination_Airport_ID']

# Origin_State <- first state listed for the same Origin_Airport_ID.
train_state = train.groupby(by=state_keys, as_index=False)[['ID']].count()
origin_state_by_airport = (
    train_state.drop_duplicates(subset='Origin_Airport_ID')
    .set_index('Origin_Airport_ID')['Origin_State']
)
train['Origin_State'] = train['Origin_State'].fillna(
    train['Origin_Airport_ID'].map(origin_state_by_airport)
)

# Airline <-> Carrier_ID(DOT): fill each one from the other using the pairs
# observed together in the data (first pair wins if a key is ambiguous).
tail = train.groupby(['Airline', 'Carrier_ID(DOT)'], as_index=False)[['ID']].count()
airline_by_carrier = (
    tail.drop_duplicates(subset='Carrier_ID(DOT)')
    .set_index('Carrier_ID(DOT)')['Airline']
)
carrier_by_airline = (
    tail.drop_duplicates(subset='Airline')
    .set_index('Airline')['Carrier_ID(DOT)']
)
train['Airline'] = train['Airline'].fillna(train['Carrier_ID(DOT)'].map(airline_by_carrier))
# Uses the Airline values filled on the previous line, as the original loops did.
train['Carrier_ID(DOT)'] = train['Carrier_ID(DOT)'].fillna(train['Airline'].map(carrier_by_airline))

# Destination_State <- first state listed for the same Destination_Airport_ID.
# The lookup is rebuilt because the Origin_State fill above adds usable rows.
train_state = train.groupby(by=state_keys, as_index=False)[['ID']].count()
destination_state_by_airport = (
    train_state.drop_duplicates(subset='Destination_Airport_ID')
    .set_index('Destination_Airport_ID')['Destination_State']
)
train['Destination_State'] = train['Destination_State'].fillna(
    train['Destination_Airport_ID'].map(destination_state_by_airport)
)

# Drop the columns that map 1:1 onto columns we keep. errors='ignore' because
# 'Cancelled' and 'Diverted' are already removed by an earlier cell.
train = train.drop(
    columns=['Origin_Airport_ID', 'Destination_Airport_ID', 'Carrier_Code(IATA)', 'Cancelled', 'Diverted'],
    errors='ignore',
)

# Rows with neither a departure nor an arrival time are discarded; the index is
# rebuilt so later positional code and index joins see a gap-free 0..n-1 index.
both_times_missing = train['Estimated_Departure_Time'].isna() & train['Estimated_Arrival_Time'].isna()
train = train.loc[~both_times_missing].reset_index(drop=True)

# Impute the remaining single missing time per row, one column at a time.
# NOTE(review): each imputer sees only one feature column, so there are no
# other features to measure neighbour distance on — this presumably degenerates
# to a column-mean fill at KNN cost; confirm and consider SimpleImputer.
imputer = KNNImputer(n_neighbors=2)
for time_col in ['Estimated_Departure_Time', 'Estimated_Arrival_Time']:
    train[time_col] = imputer.fit_transform(train[[time_col]]).ravel()

# Whatever is still missing (label column excluded) gets the column's most
# frequent value. Origin_State is included so a leftover gap cannot reach the
# encoders; it is a no-op when the lookup above filled everything.
NaN_col = ['Origin_State', 'Airline', 'Carrier_ID(DOT)', 'Destination_State']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)


In [None]:
# Fill missing categorical values in `test` from columns that identify them
# (airport ID -> state, carrier ID <-> airline), then impute what is left.
#
# Rewritten from row-by-row Python loops because the original cell
#   * contained its whole body twice; the second copy read the ID columns after
#     they had been dropped and dropped 'Cancelled'/'Diverted' a second time,
#     so it raised KeyError on a fresh run;
#   * started the Destination_State loops at 1, silently skipping the first row;
#   * dropped test rows, although the submission at the end of the notebook is
#     built with sample_submission.index and therefore needs one prediction per
#     original test row.
from sklearn.impute import KNNImputer

state_keys = ['Origin_State', 'Origin_Airport_ID', 'Destination_State', 'Destination_Airport_ID']

# Origin_State <- first state listed for the same Origin_Airport_ID.
test_state = test.groupby(by=state_keys, as_index=False)[['ID']].count()
origin_state_by_airport = (
    test_state.drop_duplicates(subset='Origin_Airport_ID')
    .set_index('Origin_Airport_ID')['Origin_State']
)
test['Origin_State'] = test['Origin_State'].fillna(
    test['Origin_Airport_ID'].map(origin_state_by_airport)
)

# Airline <-> Carrier_ID(DOT): fill each one from the other using the pairs
# observed together in the data (first pair wins if a key is ambiguous).
tail = test.groupby(['Airline', 'Carrier_ID(DOT)'], as_index=False)[['ID']].count()
airline_by_carrier = (
    tail.drop_duplicates(subset='Carrier_ID(DOT)')
    .set_index('Carrier_ID(DOT)')['Airline']
)
carrier_by_airline = (
    tail.drop_duplicates(subset='Airline')
    .set_index('Airline')['Carrier_ID(DOT)']
)
test['Airline'] = test['Airline'].fillna(test['Carrier_ID(DOT)'].map(airline_by_carrier))
# Uses the Airline values filled on the previous line, as the original loops did.
test['Carrier_ID(DOT)'] = test['Carrier_ID(DOT)'].fillna(test['Airline'].map(carrier_by_airline))

# Destination_State <- first state listed for the same Destination_Airport_ID.
# The lookup is rebuilt because the Origin_State fill above adds usable rows.
test_state = test.groupby(by=state_keys, as_index=False)[['ID']].count()
destination_state_by_airport = (
    test_state.drop_duplicates(subset='Destination_Airport_ID')
    .set_index('Destination_Airport_ID')['Destination_State']
)
test['Destination_State'] = test['Destination_State'].fillna(
    test['Destination_Airport_ID'].map(destination_state_by_airport)
)

# Drop the columns that map 1:1 onto columns we keep. errors='ignore' because
# 'Cancelled' and 'Diverted' are already removed by an earlier cell.
test = test.drop(
    columns=['Origin_Airport_ID', 'Destination_Airport_ID', 'Carrier_Code(IATA)', 'Cancelled', 'Diverted'],
    errors='ignore',
)

# Impute missing times one column at a time. Rows missing BOTH times are kept
# (see the header comment) and are imputed like any other missing value.
# NOTE(review): each imputer sees only one feature column, so there are no
# other features to measure neighbour distance on — this presumably degenerates
# to a column-mean fill at KNN cost; confirm and consider SimpleImputer.
imputer = KNNImputer(n_neighbors=2)
for time_col in ['Estimated_Departure_Time', 'Estimated_Arrival_Time']:
    test[time_col] = imputer.fit_transform(test[[time_col]]).ravel()

# Whatever is still missing gets the column's most frequent value, computed on
# the test frame itself (the original comment said "training data", but the
# code has always used test). Origin_State is included so a leftover gap cannot
# reach the encoders; it is a no-op when the lookup above filled everything.
NaN_col = ['Origin_State', 'Airline', 'Carrier_ID(DOT)', 'Destination_State']

for col in NaN_col:
    mode = test[col].mode()[0]
    test[col] = test[col].fillna(mode)


### 파생변수 생성

In [None]:
# Scheduled duration: arrival minus departure on the HHMM clock values, with
# 2400 added when the arrival falls on the next day (departure > arrival).
#
# Fixes over the original loops:
#   * the overnight branch computed 2400 - (arrival - departure), i.e.
#     2400 + (departure - arrival), which is always larger than a full day;
#   * rows were addressed by label with range(len(frame)), which raises
#     KeyError (or creates new rows) once the index has gaps;
#   * the column was initialised with None and ended up as object dtype.
# NOTE(review): this is still a difference of HHMM numbers, not true minutes
# (e.g. 0959 -> 1001 gives 42) — kept to preserve the feature's scale.
for frame in (train, test):
    departure = frame['Estimated_Departure_Time']
    arrival = frame['Estimated_Arrival_Time']
    elapsed = arrival - departure
    # Same-day flights keep `elapsed`; overnight flights wrap around midnight.
    frame['total_time'] = elapsed.where(departure <= arrival, elapsed + 2400)

In [None]:
# Split each HHMM clock value into an hour column and a minute column.
clock_columns = (
    ('Estimated_Departure_Time', 'hour_de', 'minute_de'),
    ('Estimated_Arrival_Time', 'hour_arrival', 'minute_arrival'),
)
for source_col, hour_col, minute_col in clock_columns:
    for frame in (train, test):
        clock = frame[source_col]
        frame[hour_col] = clock.apply(lambda hhmm: int(hhmm / 100))
        frame[minute_col] = clock.apply(lambda hhmm: int(hhmm % 100))


In [None]:
# Bucket the departure/arrival hour into eight 3-hour bands:
# 1 = [0, 3), 2 = [3, 6), ..., 7 = [18, 21), 8 = 21 and later.
hour_edges = [0, 3, 6, 9, 12, 15, 18, 21, np.inf]
hour_to_band = (
    ('hour_de', 'departure_time'),
    ('hour_arrival', 'arrival_time'),
)

for hour_col, band_col in hour_to_band:
    for frame in (train, test):
        # labels=False yields the 0-based interval number; hours below 0 fall
        # outside every interval and stay missing.
        band_index = pd.cut(frame[hour_col], bins=hour_edges, right=False, labels=False)
        # + 1.0 shifts to 1..8 and keeps the column floating point, so a later
        # astype(str) produces labels of the form '1.0'.
        frame[band_col] = band_index + 1.0


In [None]:
# Drop the raw and intermediate columns that are not used as model features.
unused_cols = [
    'Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Day_of_Month', 'Tail_Number',
    'hour_de', 'hour_arrival', 'minute_de', 'minute_arrival',
]
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)


## 4) 범주화
이산형변수 값 많은 것 > 빈도형 더미화\
아닌변수 > get_dummies

### 1) 빈도형 더미화
'Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_ID(DOT)'

In [None]:
# Separate the feature columns (X) from the label column (y).
target = 'Delay'
X = train.drop(target, axis = 1)
y = train[target]

In [None]:
# Frequency encoding: replace each category by how often it occurs in
# train and test combined.
freq_cols = ['Origin_Airport', 'Origin_State',
             'Destination_Airport', 'Destination_State', 'Airline',
             'Carrier_ID(DOT)']

df = pd.concat([X, test], axis=0)
FEdf = df.loc[:, freq_cols]

# One value_counts() table per encoded column, shared by train and test.
counts_by_col = {name: FEdf[name].value_counts() for name in freq_cols}

x_train_fe = X.loc[:, freq_cols].apply(lambda column: column.map(counts_by_col[column.name]))
test_fe = test.loc[:, freq_cols].apply(lambda column: column.map(counts_by_col[column.name]))

In [None]:
# Put the numeric pass-through columns next to the frequency-encoded ones.
#
# The original did this with reset_index() + merge(on='index') and stored the
# merged frame back into x_train_fe, so running the cell a second time failed
# because 'index' already existed. x_train_fe / test_fe share their index with
# X / test, so a column-wise concat gives the same rows in the same order
# without touching the inputs.
passthrough_cols = ['Month', 'Distance', 'total_time', 'departure_time', 'arrival_time']

test_z = pd.concat([test[passthrough_cols], test_fe], axis=1).reset_index(drop=True)
x_train_z = pd.concat([X[passthrough_cols], x_train_fe], axis=1).reset_index(drop=True)

# Guard against accidental row duplication/loss from a mismatched index.
assert len(test_z) == len(test), 'test_z must have one row per test row'
assert len(x_train_z) == len(X), 'x_train_z must have one row per train row'

### 2) one-hot encoding
'Month','departure_time', 'arrival_time'

In [None]:
# One-hot encode the low-cardinality columns and attach them to the
# frequency-encoded frames.
#
# Fixes over the original:
#   * the dummies were joined onto the raw X / test frames, which still hold
#     'ID' and the un-encoded string columns, so the scaler in the next section
#     cannot convert them; the frequency-encoded x_train_z / test_z are used;
#   * train and test were encoded independently, so a category missing from one
#     side produced different columns; test is now aligned to train's columns.
onehot_cols = ['Month', 'departure_time', 'arrival_time']

train_dummy = pd.get_dummies(X[onehot_cols].astype(str), drop_first=True)
# Encode every test category, then keep exactly the columns train produced
# (categories unseen in test become all-zero columns).
test_dummy = (
    pd.get_dummies(test[onehot_cols].astype(str))
    .reindex(columns=train_dummy.columns, fill_value=0)
)

# x_train_z / test_z keep the row order of X / test, so the pieces are joined
# by position after resetting both sides to a 0..n-1 index.
x_train_k = pd.concat(
    [x_train_z.drop(columns=onehot_cols).reset_index(drop=True), train_dummy.reset_index(drop=True)],
    axis=1,
)
test_k = pd.concat(
    [test_z.drop(columns=onehot_cols).reset_index(drop=True), test_dummy.reset_index(drop=True)],
    axis=1,
)

assert list(x_train_k.columns) == list(test_k.columns), 'train/test feature columns differ'

## 5) 정규화

### 1) MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features; the scaler learns its ranges from train only and
# the same ranges are applied to test.
columns_train = x_train_k.columns.tolist()
columns_test = test_k.columns.tolist()

scaler = MinMaxScaler()
scaler.fit(x_train_k)
x_train_s = scaler.transform(x_train_k)
test_s = scaler.transform(test_k)

# The scaler returns bare arrays; wrap them back into frames with the column names.
x_train_k = pd.DataFrame(x_train_s, columns=columns_train)
test_k = pd.DataFrame(test_s, columns=columns_test)

# 3. target 나누기

In [None]:
# Re-attach the label to the scaled features.
#
# x_train_k was rebuilt from the scaler's array and has a fresh 0..n-1 index,
# while y still carries the index of the frame it was sliced from. Merging the
# two on 'index' silently drops or mismatches rows whenever those indexes
# differ. Both hold the same rows in the same order, so attach y by position.
assert len(x_train_k) == len(y), 'features and labels must have the same number of rows'
train = x_train_k.copy()
train['Delay'] = y.to_numpy()

In [None]:
target = 'Delay'

# Split the rebuilt train frame into features and label again.
X = train.drop(target, axis = 1)
y = train[target]
#y= np.where((y == 'Not_Delayed'), 0 , 1)

In [None]:

# Split train by whether the row carries one of the two Delay labels.
has_label = train['Delay'].isin(['Not_Delayed', 'Delayed'])

train_a = train.loc[has_label]
train_b = train.loc[~has_label]

X_labeled = train_a.drop(columns=[target])
y_labeled = train_a[target]

X_unlabeled = train_b.drop(columns=[target])

# 4. Machine Learning


### 1) 준지도학습 (self-training)

import

In [None]:
! pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [None]:
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.svm import SVC


#### 1. RandomForest

In [None]:

# Self-training wrapper around a random forest.
# NOTE(review): fit() receives only the labeled rows, so the wrapper has no
# unlabeled samples to pseudo-label — presumably this behaves like a plain
# RandomForestClassifier and max_iter/threshold have no effect; confirm.
base_forest = RandomForestClassifier()
model = SelfTrainingClassifier(base_forest, max_iter=50, threshold=0.9, verbose=True)

# Train on the rows that already have a label.
model.fit(X_labeled, y_labeled)



In [None]:
# Predict a label for every unlabeled row.
# 'Delay_pred' is excluded from the model input: it is not a feature, and
# leaving it in makes this cell fail on re-run and feeds one model's output to
# the next model's predict().
unlabeled_features = X_unlabeled.drop(columns='Delay_pred', errors='ignore')
X_unlabeled['Delay_pred'] = model.predict(unlabeled_features)


In [None]:
# Write the predicted labels into train['Delay'] for the rows that had none.
# X_unlabeled keeps the index labels of the train rows it was cut from, so one
# label-based assignment replaces the per-row Python loop.
train.loc[X_unlabeled.index, 'Delay'] = X_unlabeled['Delay_pred'].to_numpy()

#### CatBoost

###### gridsearch

In [None]:

# Grid-search the self-training hyper-parameters around a CatBoost base model.
# verbose=0 keeps CatBoost from printing a training log for every fit.
model = SelfTrainingClassifier(CatBoostClassifier(verbose=0))

# The original 'threshold' entry was the tuple (0.3, 1.1, 0.1) — the arguments
# of an arange typed as candidate values — so only 0.3, 1.1 and 0.1 were tried,
# and 1.1 is not a valid probability threshold. The intended 0.3..0.9 grid is
# spelled out; max_iter is stepped by 10 to keep the number of fits tractable.
param_grid = {
    'max_iter': list(range(30, 100, 10)),
    'threshold': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',  # macro-averaged F1 treats both classes equally
    cv=5
)

# NOTE(review): only labeled rows are passed in, so self-training has nothing
# to pseudo-label and both grid parameters presumably do not change the fitted
# model — confirm before spending compute on this search.
grid_search.fit(X_labeled, y_labeled)


In [None]:
# Predict a label for every unlabeled row with the best grid-search model.
# 'Delay_pred' is excluded from the model input: it is not a feature, and
# leaving it in makes this cell fail once any earlier prediction cell has run.
unlabeled_features = X_unlabeled.drop(columns='Delay_pred', errors='ignore')
X_unlabeled['Delay_pred'] = grid_search.predict(unlabeled_features)


In [None]:
from sklearn.semi_supervised import SelfTrainingClassifier
from catboost import CatBoostClassifier


# Self-training wrapper around a CatBoost classifier.
# NOTE(review): fit() receives only the labeled rows, so the wrapper has no
# unlabeled samples to pseudo-label — presumably this behaves like a plain
# CatBoostClassifier and max_iter/threshold have no effect; confirm.
base_catboost = CatBoostClassifier()
model = SelfTrainingClassifier(base_catboost, max_iter=50, threshold=0.9, verbose=True)

# Train on the rows that already have a label.
model.fit(X_labeled, y_labeled)


In [None]:
# Predict a label for every unlabeled row.
# 'Delay_pred' is excluded from the model input: it is not a feature, and
# leaving it in makes this cell fail once any earlier prediction cell has run.
unlabeled_features = X_unlabeled.drop(columns='Delay_pred', errors='ignore')
X_unlabeled['Delay_pred'] = model.predict(unlabeled_features)


In [None]:
# Write the predicted labels into train['Delay'] for the rows that had none.
# X_unlabeled keeps the index labels of the train rows it was cut from, so one
# label-based assignment replaces the per-row Python loop.
train.loc[X_unlabeled.index, 'Delay'] = X_unlabeled['Delay_pred'].to_numpy()

#### SVC

In [None]:
# Self-training wrapper around a support vector classifier.
# SelfTrainingClassifier needs class probabilities from its base estimator;
# SVC only provides predict_proba when constructed with probability=True, so
# the default SVC() used originally cannot be wrapped.
# NOTE(review): fit() receives only the labeled rows, so there is nothing to
# pseudo-label and max_iter/threshold presumably have no effect — confirm.
model = SelfTrainingClassifier(
    SVC(probability=True),
    max_iter=50,
    threshold=0.9,
    verbose=True
)

# Train on the rows that already have a label.
model.fit(X_labeled, y_labeled)


In [None]:
# Predict a label for every unlabeled row.
# 'Delay_pred' is excluded from the model input: it is not a feature, and
# leaving it in makes this cell fail once any earlier prediction cell has run.
unlabeled_features = X_unlabeled.drop(columns='Delay_pred', errors='ignore')
X_unlabeled['Delay_pred'] = model.predict(unlabeled_features)


In [None]:
# Write the predicted labels into train['Delay'] for the rows that had none.
# X_unlabeled keeps the index labels of the train rows it was cut from, so one
# label-based assignment replaces the per-row Python loop.
train.loc[X_unlabeled.index, 'Delay'] = X_unlabeled['Delay_pred'].to_numpy()

### 2) oversampling

In [None]:
# Features and label for the supervised stage, taken from train after the
# missing labels were filled with predictions.
x_train = train.drop('Delay', axis = 1)
y_train = train['Delay']

In [None]:

from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integer codes; `le` is kept to map codes back to names.
le = LabelEncoder()

y_train = le.fit_transform(y_train)


In [None]:
# Number of rows per encoded class.
pd.Series(y_train).value_counts()# 0 : Delayed, 1: Not_Delayed

1    890310
0     98002
dtype: int64

In [None]:
# Oversample with SMOTE and report the per-class counts before and after
# (the recorded output shows both classes ending at the same count).
from imblearn.over_sampling import SMOTE
smote = SMOTE()
s_x_train, s_y_train = smote.fit_resample(x_train, y_train)


# '전' = before resampling, '후' = after resampling.
print('전 : ', np.bincount(y_train))
print('후 : ', np.bincount(s_y_train))

전 :  [ 98002 890310]
후 :  [890310 890310]


## 3) pca

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with all components on the resampled features to inspect how much
# variance each component explains.
pca = PCA()
pca.fit(s_x_train)

variance_ratio = pca.explained_variance_ratio_

print(variance_ratio)



In [None]:
# Project train and test onto the first two principal components.
#
# Fixes over the original:
#   * test was projected from the raw `test` frame instead of the encoded and
#     scaled `test_k`, which is the frame that matches s_x_train's features;
#   * `test_k` was overwritten with the 2-column projection, while every model
#     in the next section is fitted on the full-width s_x_train and then asked
#     to predict on test_k. The projection now lives only in test_pca, so
#     test_k keeps the feature columns the models are trained on (and the cell
#     can be re-run).
pca = PCA(n_components=2)
train_pca = pd.DataFrame(pca.fit_transform(s_x_train))

# Apply the projection learned on train to the test features.
test_pca = pd.DataFrame(pca.transform(test_k))

# NOTE(review): X is kept for compatibility with the original cell; nothing
# later in the notebook reads it — confirm whether the PCA features are needed.
X = train_pca.values

## 4) 지도학습

### 1) Cat Boost

In [None]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# CatBoost with default settings, trained on the SMOTE-resampled data.
cat = CatBoostClassifier()

cat.fit(s_x_train, s_y_train)

In [None]:
# Class probabilities for the test set; test_k must hold the same feature
# columns the model was fitted on.
y_pred_cat = cat.predict_proba(test_k)

### 2)  Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the SMOTE-resampled data.
lr = LogisticRegression()
lr.fit(s_x_train, s_y_train)

In [None]:
# Class probabilities for the test set; these are the values written to the submission.
y_pred_lr = lr.predict_proba(test_k)

### 3) gridsearch(cat_boost)

In [None]:
# Hyper-parameter grid for CatBoost: 7 depths x 4 learning rates x 10 iteration
# counts = 280 combinations, each evaluated with 2-fold cross-validation.
parameters = {'depth'         : [4,5,6,7,8,9, 10],
                'learning_rate' : [0.01,0.02,0.03,0.04],
                'iterations'    : [10, 20,30,40,50,60,70,80,90, 100]
                 }
# Note: this rebinds `cat`, replacing the model fitted earlier under that name.
cat = CatBoostClassifier()

cat_grid = GridSearchCV(cat, param_grid = parameters, cv = 2, n_jobs=-1)
cat_grid.fit(s_x_train, s_y_train)

In [None]:
# Class probabilities for the test set from the grid-search result.
y_pred_cat_grid = cat_grid.predict_proba(test_k)

# 제출

In [None]:
# Build the submission from the logistic-regression probabilities.
#
# predict_proba returns one column per entry of lr.classes_ (the integer codes
# produced by the LabelEncoder), which is not necessarily the column order of
# the sample submission. Labelling the columns by position can therefore swap
# the two probabilities without any error. The codes are translated back to
# label names and the columns are then selected by name.
# NOTE(review): assumes sample_submission's columns are named after the Delay
# labels ('Delayed' / 'Not_Delayed'); a KeyError here means they are not.
proba_columns = le.inverse_transform(lr.classes_)
submission = pd.DataFrame(data=y_pred_lr, columns=proba_columns, index=sample_submission.index)
submission = submission[sample_submission.columns]

In [None]:
# Write the submission to the working directory, keeping the index as the first CSV column.
submission.to_csv('./submission.csv', index=True)