# 0. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from my_krml_25246568.data.sets import pop_target
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from joblib import dump

# 1. Load Data

In [2]:
# Load the interim cleaned dataset. low_memory=False makes pandas parse the
# file in a single pass instead of internal chunks, which avoids mixed-dtype
# inference on the text columns at the cost of higher peak memory.
cleaned_csv_path = '../data/interim/cleaned_data_1.csv'
df = pd.read_csv(cleaned_csv_path, low_memory=False)

In [3]:
# Print a structural summary of the loaded frame (row count, column names,
# dtypes and memory footprint) to confirm the file was read as expected.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13519999 entries, 0 to 13519998
Data columns (total 15 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   startingAirport           object 
 1   destinationAirport        object 
 2   totalFare                 float64
 3   totalTravelDistance       float64
 4   travelDurationMins        float64
 5   cabinTypeSegmentSegment1  object 
 6   cabinTypeSegmentSegment2  object 
 7   cabinTypeSegmentSegment3  object 
 8   cabinTypeSegmentSegment4  object 
 9   numStops                  int64  
 10  flightMonth               int64  
 11  flightDay                 int64  
 12  flightHour                int64  
 13  flightMinute              int64  
 14  days_difference           int64  
dtypes: float64(3), int64(6), object(6)
memory usage: 1.5+ GB


In [4]:
# Display (n_rows, n_columns) as a quick sanity check on the load.
df.shape

(13519999, 15)

In [5]:
# Preview the first five rows to eyeball the raw (not yet encoded) values.
df.head()

Unnamed: 0,startingAirport,destinationAirport,totalFare,totalTravelDistance,travelDurationMins,cabinTypeSegmentSegment1,cabinTypeSegmentSegment2,cabinTypeSegmentSegment3,cabinTypeSegmentSegment4,numStops,flightMonth,flightDay,flightHour,flightMinute,days_difference
0,OAK,ATL,103.98,2150.0,472.0,coach,coach,No cabin,No cabin,1,5,21,11,58,32
1,OAK,ATL,216.58,2412.0,375.0,coach,coach,No cabin,No cabin,1,5,21,1,56,32
2,OAK,ATL,216.58,2412.0,546.0,coach,coach,No cabin,No cabin,1,5,20,23,5,31
3,OAK,ATL,237.58,2412.0,377.0,coach,coach,No cabin,No cabin,1,5,21,13,56,32
4,OAK,ATL,307.21,2850.0,852.0,coach,coach,No cabin,No cabin,1,5,21,14,41,32


# 2. Data Transformation

## [2.1] Ordinal Encoder

In [6]:
# Work on an independent deep copy so the frame loaded into `df` is left
# untouched by the in-place column overwrites performed during encoding.
df_encoded = df.copy(deep=True)

In [7]:
# Encoder that maps each category of a column to an integer code; dtype=int
# makes the transformed output integer-typed rather than the float default.
# NOTE(review): no handle_unknown policy is set, so transforming a category
# that was not seen during fit will presumably raise - confirm this is the
# behaviour wanted wherever the saved encoder is reused.
oe = OrdinalEncoder(dtype=int)

In [8]:
# Text-valued (object dtype) columns that must become numeric before scaling:
# the two airport codes plus the cabin type of each of the four segments.
categorical_cols = [
    'startingAirport', 'destinationAirport',
    'cabinTypeSegmentSegment1', 'cabinTypeSegmentSegment2',
    'cabinTypeSegmentSegment3', 'cabinTypeSegmentSegment4',
]

# Fit the encoder on these columns and overwrite them with their integer
# codes in a single step; all other columns of df_encoded are left as they are.
df_encoded[categorical_cols] = oe.fit_transform(df_encoded[categorical_cols])

## [2.2] Standard Scaler

### [2.2.1] Extracting target variable

In [9]:
# Separate the regression target ('totalFare') from the features so that the
# scaler below is applied to the feature columns only.
# NOTE(review): pop_target is a project helper whose source is not shown here;
# presumably it returns (features without the target column, target column).
# Verify whether it copies df_encoded or mutates it in place.
df_extracted, target = pop_target(df_encoded, 'totalFare')

To bring all features onto a comparable order of magnitude, standard scaling (zero mean, unit variance per feature) will be applied.

### [2.2.2] Scaling features

In [10]:
# Standardiser with default settings: each feature is centred on its mean and
# divided by its standard deviation, both learned when the scaler is fitted.
scaler = StandardScaler()

In [11]:
# Learn per-column mean/std from the feature frame and standardise it in the
# same pass.
# NOTE(review): the scaler is fitted on every row of df_extracted, and the
# integer-encoded categorical columns are standardised together with the
# genuinely numeric ones. If a train/test split happens downstream, fitting
# here would leak test-set statistics - confirm this is intended.
scaled = scaler.fit_transform(df_extracted)

# fit_transform returns a plain NumPy array, so the frame has to be rebuilt.
# Restore BOTH the column labels and the original row index: without
# `index=` the new frame would get a fresh 0..n-1 RangeIndex, and any later
# label-aligned operation against data that still carries the original index
# (such as assigning the popped target column back) would silently misalign
# or produce NaNs whenever the input index is not a default RangeIndex.
df_scaled = pd.DataFrame(
    scaled,
    columns=df_extracted.columns,
    index=df_extracted.index,
)

### [2.2.3] Adding target back

In [12]:
# Re-attach the target as the last column; it was held out of the scaler, so
# 'totalFare' stays in its original units.
# NOTE(review): if `target` is a pandas Series this assignment aligns on index
# labels, not on row position - df_scaled and target must share the same index.
df_scaled['totalFare'] = target

# 3. Saving Transformers

In [13]:
# Persist the fitted ordinal encoder so the same category-to-code mapping can
# be reloaded later instead of being re-fitted.
encoder_path = '../models/transformers/ordinal_encoder_final.joblib'
dump(oe, encoder_path)

['../models/transformers/ordinal_encoder_final.joblib']

In [14]:
# Persist the fitted scaler (its learned per-feature mean and scale) next to
# the encoder so the identical standardisation can be re-applied later.
scaler_path = '../models/transformers/scaler_final.joblib'
dump(scaler, scaler_path)

['../models/transformers/scaler_final.joblib']

# 4. Saving Transformed CSV

In [15]:
# Write the encoded + scaled features together with the unscaled target to the
# processed-data folder; index=False keeps the row index out of the file.
processed_csv_path = '../data/processed/processed_data_final.csv'
df_scaled.to_csv(processed_csv_path, index=False)