In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from geopy.distance import geodesic
import numpy as np
from tabulate import tabulate
import joblib

In [2]:
#Making a preprocessing pipeline

# Data loading
df_flightdata = pd.read_csv('data/Train.csv')
# index_col=0: the first column of the airport file becomes the frame's index
df_airportdata = pd.read_csv('data/airportdata.csv', index_col=0)

# Helper / identifier columns that the last pipeline step removes again.
columns_to_drop = ['id', 'std', 'sta', 'fltid', 'arr_iata', 'dep_iata', 'ac']

# Categorical features that are label encoded (one-hot encoding is not used).
# Columns that appear in columns_to_drop ('fltid', 'ac', 'dep_iata', 'arr_iata') are
# intentionally NOT listed here: their encoded values would be thrown away by the drop
# step anyway, and LabelEncoder.transform raises on labels it has not seen during fit,
# so encoding them would make the fitted pipeline fail on new flight ids or aircraft
# for no benefit.
categorical_columns = [
    'depstn',
    'arrstn',
    'status',
    'arr_country',
    'dep_country',
    'season',
    'airline_code',
    'international_flight',
]

class ColumnNameFixer(BaseEstimator, TransformerMixin):
    """Normalise column labels: spaces and hyphens become underscores, letters become lower case.

    NOTE: the labels are rewritten on the frame that is passed in (no copy is made),
    so the caller's DataFrame is modified as well.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Rename the columns of ``X`` in place and return the same frame."""
        labels = X.columns.str.replace(' ', '_')
        labels = labels.str.lower()
        X.columns = labels.str.replace('-', '_')
        return X

class CalculateDistance(BaseEstimator, TransformerMixin):
    """Add a ``flight_distance_in_km`` column with the geodesic airport-to-airport distance.

    Reads the ``dep_lat``, ``dep_lon``, ``arr_lat`` and ``arr_lon`` columns.
    NOTE: the new column is written onto the frame that is passed in (no copy is made).
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Compute the distance for every row, store it on ``X`` and return ``X``."""

        def km_between_airports(flight):
            # Coordinates are handed to geopy as (latitude, longitude) pairs
            origin = (flight['dep_lat'], flight['dep_lon'])
            destination = (flight['arr_lat'], flight['arr_lon'])
            kilometres = geodesic(origin, destination).kilometers
            # Rounded to whole kilometres and stored as an integer
            return int(round(kilometres, 0))

        distances = X.apply(km_between_airports, axis=1)
        X['flight_distance_in_km'] = distances
        return X

class CustomFeaturesAdder(BaseEstimator, TransformerMixin):
    """Derive time-of-day, duration, speed, route and calendar features.

    Expects the columns ``sta``, ``std``, ``datop``, ``arr_elevation``, ``dep_elevation``,
    ``flight_distance_in_km``, ``arr_country``, ``dep_country`` and ``fltid``.
    NOTE: columns are added to (and ``sta``/``std``/``datop`` overwritten on) the frame
    that is passed in; no copy is made.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Add the derived feature columns to ``X`` and return ``X``."""
        # Parse the timestamp columns. 'sta' separates the time parts with dots,
        # 'std' with colons, so each column needs its own format string.
        timestamp_formats = (
            ('sta', '%Y-%m-%d %H.%M.%S'),
            ('std', '%Y-%m-%d %H:%M:%S'),
            ('datop', '%Y-%m-%d'),
        )
        for column, pattern in timestamp_formats:
            X[column] = pd.to_datetime(X[column], format=pattern)

        # Time of day as an integer built from the 'HH:MM:SS' text with the colons removed
        for source, target in (('std', 'std_time'), ('sta', 'sta_time')):
            clock_text = X[source].dt.time.astype(str)
            X[target] = clock_text.str.replace(':', '').astype(int)

        # Elevation difference, scheduled duration and the resulting average speed
        X['elevation_dif'] = X['arr_elevation'] - X['dep_elevation']
        scheduled_duration = X['sta'] - X['std']
        X['flight_time_in_min'] = scheduled_duration.dt.total_seconds() / 60
        km_per_hour = X['flight_distance_in_km'] * 60 / X['flight_time_in_min']
        X['average_flight_speed_km_h'] = km_per_hour.round().astype(int)

        # 1 when departure and arrival country differ, 0 when they are equal
        X['international_flight'] = np.where(X['arr_country'] == X['dep_country'], 0, 1)
        # The first two characters of the flight id are used as the airline code
        X['airline_code'] = X['fltid'].str[:2]

        # Calendar components of the operation date, then the date itself as a YYYYMMDD integer
        for part in ('year', 'month', 'day'):
            X[part] = getattr(X['datop'].dt, part)
        X['datop'] = X['datop'].astype(str).str.replace('-', '').astype(int)

        # Season derived from the month (Dec-Feb winter, Mar-May spring, Jun-Aug summer, Sep-Nov autumn)
        months_by_season = (
            ('winter', [12, 1, 2]),
            ('spring', [3, 4, 5]),
            ('summer', [6, 7, 8]),
            ('autumn', [9, 10, 11]),
        )
        for season_name, months in months_by_season:
            X.loc[X['month'].isin(months), 'season'] = season_name

        return X

class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Label-encode a fixed set of DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str
        Names of the columns to encode.

    Attributes
    ----------
    label_encoders : dict
        Maps each column name to the ``LabelEncoder`` fitted on it; rebuilt by every ``fit``.
    """

    def __init__(self, columns):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per configured column and return the transformer itself.

        The mapping is rebuilt from scratch on every call so that refitting after
        ``columns`` has changed does not leave encoders for columns that are no
        longer configured (``transform`` iterates over this mapping).
        """
        self.label_encoders = {
            col: LabelEncoder().fit(X[col]) for col in self.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every fitted column is replaced by its codes.

        The input frame is left untouched. ``LabelEncoder.transform`` raises if a
        column contains a label that was not present during ``fit``.
        """
        X_encoded = X.copy()
        for col, label_encoder in self.label_encoders.items():
            X_encoded[col] = label_encoder.transform(X_encoded[col])
        return X_encoded

class DropColumns(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list of str
        Names of the columns to remove.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns.

        A new DataFrame is returned instead of dropping in place, so the frame
        handed in by the caller keeps all of its columns. A ``KeyError`` is raised
        if one of the configured columns is missing.
        """
        return X.drop(columns=self.columns_to_drop)

# Preprocessing steps, executed in the order listed
preprocessing_steps = [
    ('column_name_fixer', ColumnNameFixer()),
    ('calculate_distance', CalculateDistance()),
    ('custom_features_adder', CustomFeaturesAdder()),
    ('label_encoding', LabelEncoderTransformer(categorical_columns)),
    ('drop_columns', DropColumns(columns_to_drop)),
]

# Wrap the steps in a single pipeline object
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

# Attach the airport attributes twice to every flight: once for the departure
# station (prefix 'dep_') and once for the arrival station (prefix 'arr_').
# The joins match DEPSTN / ARRSTN against the index of the airport frame.
airport_attributes = df_airportdata[['iata', 'country', 'elevation', 'lat', 'lon']]
departure_airports = airport_attributes.add_prefix('dep_')
arrival_airports = airport_attributes.add_prefix('arr_')

merged_data = df_flightdata.join(departure_airports, how='left', on='DEPSTN')
merged_data = merged_data.join(arrival_airports, how='left', on='ARRSTN')

# Fit the pipeline on the merged frame and keep the transformed result
df_processed = preprocessing_pipeline.fit_transform(merged_data)

# Now df_processed contains the preprocessed data


In [3]:
# Persist the fitted preprocessing pipeline so it can be loaded again later
pipeline_path = 'data/preprocessing_pipeline.joblib'
joblib.dump(preprocessing_pipeline, pipeline_path)

['preprocessing_pipeline.joblib']

In [77]:
# Side-by-side preview of df_processed and merged_data, rendered with tabulate

# Column names of merged_data
headers_merged = merged_data.columns.tolist()

# Headers used in the combined table, in column order
headers_list = []

# Only the first ten rows of each frame are shown
merged_preview = merged_data.head(10)
processed_preview = df_processed.head(10)

# Walk over the columns of merged_data; each cell of the combined table shows
# "<processed value> | <merged value>" for one of the previewed rows
combined_cells = {}
for index, header_merged in enumerate(headers_merged):
    row_merged = merged_preview.iloc[:, index]
    if header_merged in df_processed.columns:
        # Column exists in both frames: show both values
        header_processed = header_merged
        row_processed = processed_preview.iloc[:, df_processed.columns.get_loc(header_merged)]
    else:
        # Column is missing from df_processed: mark it and show NaN on the processed side
        header_processed = f'{header_merged} (dropped)'
        row_processed = pd.Series([np.nan] * 10)

    combined_row = [
        f'{val_processed} | {val_merged}'
        for val_processed, val_merged in zip(row_processed, row_merged)
    ]
    headers_list.append(header_processed)
    combined_cells[header_processed] = combined_row

combined_df = pd.DataFrame(combined_cells)

# Render the combined frame as a text table and print it
combined_table = tabulate(combined_df, headers='keys', tablefmt='pretty')
print("\nTable Representation of Combined DataFrame:")
print(combined_table)


Table Representation of Combined DataFrame:
+---+------------------+---------------------+-----------------+-----------+-----------+---------------------------+---------------------------+---------+-----------------+---------------+--------------------+-------------+---------------+-------------------------------+-------------------------------+--------------------+-------------+---------------+-------------------------------+-------------------------------+-----------------------+-----------------+-----------------+-----------------+--------------------+---------------------------+----------------------+--------------+-------------+-------+---------+------------+
|   |   id (dropped)   |        datop        | fltid (dropped) |  depstn   |  arrstn   |       std (dropped)       |       sta (dropped)       | status  |  ac (dropped)   |    target     | dep_iata (dropped) | dep_country | dep_elevation |            dep_lat            |            dep_lon            | arr_iata (dropped) | a