In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [2]:
df = sm.datasets.get_rdataset('flights', 'nycflights13').data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute    

In [3]:
df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [4]:
df.isnull().sum()

year                 0
month                0
day                  0
dep_time          8255
sched_dep_time       0
dep_delay         8255
arr_time          8713
sched_arr_time       0
arr_delay         9430
carrier              0
flight               0
tailnum           2512
origin               0
dest                 0
air_time          9430
distance             0
hour                 0
minute               0
time_hour            0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

year              0
month             0
day               0
dep_time          0
sched_dep_time    0
dep_delay         0
arr_time          0
sched_arr_time    0
arr_delay         0
carrier           0
flight            0
tailnum           0
origin            0
dest              0
air_time          0
distance          0
hour              0
minute            0
time_hour         0
dtype: int64

In [7]:
target = 'arr_delay'
y = df[target]
X = df.drop(columns=[target, 'time_hour', 'year', 'dep_time', 'sched_dep_time', 'arr_time', 'sched_arr_time', 'dep_delay'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1066)
X_train.dtypes

month         int64
day           int64
carrier      object
flight        int64
tailnum      object
origin       object
dest         object
air_time    float64
distance      int64
hour          int64
minute        int64
dtype: object

In [8]:
encoder = ce.LeaveOneOutEncoder(return_df=True)

In [9]:
X_train_loo = encoder.fit_transform(X_train, y_train)

In [10]:
X_train.head()

Unnamed: 0,month,day,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
65014,11,11,B6,2180,N337JB,EWR,BOS,41.0,200,6,25
295187,8,16,EV,5268,N754EV,LGA,CLT,88.0,544,19,15
193684,5,1,UA,595,N539UA,EWR,LAS,293.0,2227,9,30
329635,9,23,WN,1517,N492WN,LGA,BNA,108.0,764,11,25
306355,8,28,EV,4361,N12166,EWR,TYS,83.0,631,19,45


In [11]:
y_train.head()

65014    -15.0
295187   -18.0
193684     5.0
329635    -4.0
306355   -30.0
Name: arr_delay, dtype: float64

In [12]:
X_train_loo.head()

Unnamed: 0,month,day,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
65014,11,11,9.616335,2180,10.95339,9.057682,2.69312,41.0,200,6,25
295187,8,16,15.667612,5268,1.217391,5.787209,7.406107,88.0,544,19,15
193684,5,1,3.465949,595,0.222222,9.057469,0.572689,293.0,2227,9,30
329635,9,23,9.560669,1517,-0.272727,5.787036,11.420625,108.0,764,11,25
306355,8,28,15.667905,4361,8.408602,9.057842,25.850446,83.0,631,19,45


In [13]:
X_test_loo = encoder.transform(X_test)

In [14]:
X_test_loo

Unnamed: 0,month,day,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
292988,8,14,7.514331,3400,2.346939,5.560707,13.750583,114.0,745,16,15
196665,5,4,0.331912,1850,-10.854545,5.560707,2.691649,36.0,187,12,45
12477,1,15,10.851902,4670,10.418327,5.786915,11.417495,132.0,764,9,59
68633,11,14,3.465982,1212,3.864407,9.057426,2.634729,352.0,2565,17,29
90245,12,8,7.514331,2917,3.195122,5.560707,5.433753,104.0,509,16,4
...,...,...,...,...,...,...,...,...,...,...,...
103582,12,23,9.615767,683,5.027397,5.560707,5.636966,140.0,944,7,10
147580,3,13,3.465982,443,3.310811,5.560707,0.419967,322.0,2475,8,40
142936,3,8,1.985603,2165,3.550847,5.786915,9.075735,54.0,214,8,0
54857,10,30,10.851902,3374,3.547297,5.560707,9.829630,60.0,427,19,40


In [15]:
linhas_EWR = list(X_train.query("origin=='EWR'").index)

In [16]:
X_train_loo[X_train_loo.index.isin(linhas_EWR)].origin.mean()

9.057425911510103