In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [12]:
data = pd.read_csv('airlines_flights_data.csv',encoding="latin1")

print(data.head())

   index   airline   flight source_city departure_time stops   arrival_time  \
0      0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1      1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2      2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3      3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4      4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  
0           Mumbai  Economy      2.17          1   5953  
1           Mumbai  Economy      2.33          1   5953  
2           Mumbai  Economy      2.17          1   5956  
3           Mumbai  Economy      2.25          1   5955  
4           Mumbai  Economy      2.33          1   5955  


In [17]:
num_features = ['duration', 'days_left']
num_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])

cat_features = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
cat_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [20]:
preprocessor = ColumnTransformer(
transformers=[
("num", num_transformer, ['duration', 'days_left']),
("cat", cat_transformer, cat_features)
])
preprocessor.set_output(transform="pandas")
data_preprocessed = preprocessor.fit_transform(data)
print(data_preprocessed)
preprocessor_Out = ColumnTransformer(
transformers=[
("num", num_transformer, ['price'])
])
preprocessor_Out.set_output(transform="pandas")
data_preprocessed_Out = preprocessor_Out.fit_transform(data)
print(data_preprocessed_Out)

        num__duration  num__days_left  cat__airline_AirAsia  \
0           -1.397531       -1.843875                   0.0   
1           -1.375284       -1.843875                   0.0   
2           -1.397531       -1.843875                   1.0   
3           -1.386407       -1.843875                   0.0   
4           -1.375284       -1.843875                   0.0   
...               ...             ...                   ...   
300148      -0.297695        1.695692                   0.0   
300149      -0.250421        1.695692                   0.0   
300150       0.223718        1.695692                   0.0   
300151      -0.308819        1.695692                   0.0   
300152      -0.297695        1.695692                   0.0   

        cat__airline_Air_India  cat__airline_GO_FIRST  cat__airline_Indigo  \
0                          0.0                    0.0                  0.0   
1                          0.0                    0.0                  0.0   
2        

In [22]:
X = data_preprocessed
y = data_preprocessed_Out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.head())
print(y_train.head())

        num__duration  num__days_left  cat__airline_AirAsia  \
148417       1.000972       -1.475170                   0.0   
36879       -0.725950       -0.958983                   0.0   
274531       1.244299        1.326987                   0.0   
166397      -0.274058       -1.106465                   0.0   
272722       1.985402       -1.548911                   0.0   

        cat__airline_Air_India  cat__airline_GO_FIRST  cat__airline_Indigo  \
148417                     1.0                    0.0                  0.0   
36879                      0.0                    0.0                  1.0   
274531                     1.0                    0.0                  0.0   
166397                     0.0                    0.0                  0.0   
272722                     0.0                    0.0                  0.0   

        cat__airline_SpiceJet  cat__airline_Vistara  \
148417                    0.0                   0.0   
36879                     0.0             