In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from geopy.distance import geodesic
import joblib


In [3]:
# Importing data
df = pd.read_csv('data/labelencoded_data.csv',index_col=0)


In [4]:
# Importing data
df = pd.read_csv('data/labelencoded_data.csv',index_col=0)
# df.drop(['id', 'std', 'sta', 'fltid','arr_iata','dep_iata','ac'], axis=1,inplace=True)


In [17]:
#Making a preprocessing pipeline

# Data loading
df_flightdata = pd.read_csv('data/Train.csv')
df_airportdata = pd.read_csv('data/airportdata.csv', index_col=0)

# Defining categorical data and features to drop. Here we use label encoding. Hot encoding is not used.
columns_to_drop = ['id', 'std', 'sta', 'fltid', 'arr_iata', 'dep_iata', 'ac','status']
categorical_columns = ['depstn', 'arrstn',  'arr_country', 'dep_country', 'season', 'airline_code', 'international_flight']

# Join the DataFrames first and then apply the preprocessing pipeline
df = df_flightdata.join(df_airportdata[['iata', 'country', 'elevation', 'lat', 'lon']].add_prefix('dep_'), how='left', on='DEPSTN') \
    .join(df_airportdata[['iata', 'country', 'elevation', 'lat', 'lon']].add_prefix('arr_'), how='left', on='ARRSTN')
    

In [18]:
df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,dep_iata,dep_country,dep_elevation,dep_lat,dep_lon,arr_iata,arr_country,arr_elevation,arr_lat,arr_lon
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0,CMN,MA,656.0,33.3675,-7.58997,TUN,TN,22.0,36.851002,10.2272
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0,MXP,IT,768.0,45.6306,8.72811,TUN,TN,22.0,36.851002,10.2272
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0,TUN,TN,22.0,36.851002,10.2272,IST,TR,325.0,41.275333,28.752
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0,DJE,TN,19.0,33.875,10.7755,NTE,FR,90.0,47.153198,-1.61073
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0,TUN,TN,22.0,36.851002,10.2272,ALG,DZ,82.0,36.691002,3.21541


In [19]:
class FixColumnNames(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X.columns = X.columns.str.replace(' ', '_').str.lower().str.replace('-', '_')
        return X
    
    def get_state(self):
        # Return a dictionary with any essential attributes
        return {}

    @classmethod
    def from_state(cls, state):
        # Create an instance of the class using the state dictionary
        return cls()

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X.drop(self.columns_to_drop, axis=1, inplace=True)
        return X

    def get_state(self):
        # Return a dictionary with any essential attributes
        return {'columns_to_drop': self.columns_to_drop}

    @classmethod
    def from_state(cls, state):
        # Create an instance of the class using the state dictionary
        return cls(columns_to_drop=state['columns_to_drop'])
    
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        for col in self.columns:
            label_encoder = LabelEncoder()
            label_encoder.fit(X[col])
            self.label_encoders[col] = label_encoder
        return self

    def transform(self, X):
        X_encoded = X.copy()
        for col, label_encoder in self.label_encoders.items():
            X_encoded[col] = label_encoder.transform(X_encoded[col])
        return X_encoded
    
    def get_state(self):
        state = {
            'columns': self.columns,
            'label_encoders': {col: label_encoder.classes_.tolist() for col, label_encoder in self.label_encoders.items()}
        }
        return state

    @classmethod
    def from_state(cls, state):
        columns = state['columns']
        label_encoders = {col: LabelEncoder() for col in columns}

        for col, classes in state['label_encoders'].items():
            label_encoder = label_encoders[col]
            label_encoder.classes_ = classes

        instance = cls(columns=columns)
        instance.label_encoders = label_encoders

        return instance
    
class CalculateFlightDistance(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        def calculate_distance(row):
            dep_coords = (row['dep_lat'], row['dep_lon'])
            arr_coords = (row['arr_lat'], row['arr_lon'])
            distance = geodesic(dep_coords, arr_coords).kilometers
            return int(round(distance, 0))

        X['flight_distance_in_km'] = X.apply(calculate_distance, axis=1)
        return X

    def get_state(self):
        # Return a dictionary with any essential attributes
        return {}

    @classmethod
    def from_state(cls, state):
        # Create an instance of the class using the state dictionary
        return cls()

class AddAdditionalFlightDataFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['sta'] = pd.to_datetime(X['sta'], format='%Y-%m-%d %H.%M.%S')
        X['std'] = pd.to_datetime(X['std'], format='%Y-%m-%d %H:%M:%S')
        X['datop'] = pd.to_datetime(X['datop'], format='%Y-%m-%d')
        X['std_time'] = X['std'].dt.time
        X['sta_time'] = X['sta'].dt.time
        X['std_time'] = X['std_time'].astype(str).str.replace(':', '').astype(int)
        X['sta_time'] = X['sta_time'].astype(str).str.replace(':', '').astype(int)
        
        X['elevation_dif'] = (X['arr_elevation'] - X['dep_elevation'])
        X['flight_time_in_min'] = (X['sta'] - X['std']).dt.total_seconds() / 60
        X['average_flight_speed_km_h'] = (X['flight_distance_in_km'] * 60 / X['flight_time_in_min']).round().astype(int)
        X['international_flight'] = np.where(X['arr_country'] != X['dep_country'], 'international', 'domestic')
        X['airline_code'] = X['fltid'].str[:2]
        # Extract year, month, and day components
        X['year'] = X['datop'].dt.year
        X['month'] = X['datop'].dt.month
        X['day'] = X['datop'].dt.day
        X['datop'] = X['datop'].astype(str).str.replace('-', '').astype(int)
        
        # Create the seasons column
        X.loc[(X['month'] < 3) | (X['month'] == 12), 'season'] = 'winter'
        X.loc[(X['month'] >= 3) & (X['month'] < 6), 'season'] = 'spring' 
        X.loc[(X['month'] >= 6) & (X['month'] < 9), 'season'] = 'summer' 
        X.loc[(X['month'] >= 9) & (X['month'] < 12), 'season'] = 'autumn'
        
        return X
    
    def get_state(self):
        # Return a dictionary with any essential attributes
        return {}

    @classmethod
    def from_state(cls, state):
        # Create an instance of the class using the state dictionary
        return cls()


# Define the preprocessing steps in the pipeline
preprocessing_steps = [
    ('column_name_fixer', FixColumnNames()),
    ('calculate_flight_distance', CalculateFlightDistance()),
    ('add_additional_flight_data_features', AddAdditionalFlightDataFeatures()),
    ('drop_columns', DropColumns(columns_to_drop)),
    ('encode_labels', LabelEncoderTransformer(categorical_columns))

]

# Create the pipeline
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)
# df_processed contains the preprocessed data
df_processed = preprocessing_pipeline.fit_transform(df)

In [20]:
# Splitting data
# X = df[['arrstn','depstn','std','sta']]
X = df_processed.drop(['target'],axis=1)
y = df_processed['target'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [22]:
X_train.head()

Unnamed: 0,datop,depstn,arrstn,dep_country,dep_elevation,dep_lat,dep_lon,arr_country,arr_elevation,arr_lat,...,sta_time,elevation_dif,flight_time_in_min,average_flight_speed_km_h,international_flight,airline_code,year,month,day,season
76952,20180801,81,74,51,9.0,35.758099,10.7547,44,1273.0,46.223701,...,154000,1264.0,120.0,602,1,14,2018,8,1,2
39411,20170524,123,110,51,22.0,36.851002,10.2272,17,505.0,48.5383,...,153000,483.0,140.0,564,1,14,2017,5,24,1
18120,20160219,81,87,51,9.0,35.758099,10.7547,17,12.0,43.658401,...,84500,3.0,105.0,530,1,14,2016,2,19,3
100964,20181026,37,41,51,19.0,33.875,10.7755,11,147.0,51.289501,...,93500,128.0,180.0,654,1,14,2018,10,26,0
44997,20170412,79,119,17,2001.0,40.4936,-3.56676,48,22.0,36.851002,...,193500,-1979.0,130.0,584,1,14,2017,4,12,1


In [23]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)


RMSE: 128.43504471511554


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Save the model to a file
model_filename = './data/api_test_logistic_regression_model.joblib'
joblib.dump(lr, model_filename)

['./data/api_test_logistic_regression_model.joblib']