In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import os

In [2]:
# Read data
train_df = pd.read_excel('data/Data_Train.xlsx')
test_df = pd.concat([pd.read_excel('data/Test_set.xlsx'),
                     pd.read_excel('data/Sample_submission.xlsx')],
                    axis=1)

# Drop duplicates
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)
train_df.dropna(inplace = True)

y_train = train_df[['Price']]
train_df.drop('Price', axis = 1, inplace = True)

y_test = test_df[['Price']]
test_df.drop('Price', axis = 1, inplace = True)

# Feature Engineering: Extract Day Name from the Date of Journey
train_df['Week_Day'] = pd.to_datetime(train_df['Date_of_Journey'], format='%d/%m/%Y').dt.day_name()
test_df['Week_Day'] = pd.to_datetime(test_df['Date_of_Journey'], format='%d/%m/%Y').dt.day_name()

# Transforming Duration Column
# Changing Duration feature from HH:MM to total minutes
hour = pd.to_numeric(train_df['Duration'].str.replace(r'\D+', ' ', regex=True).str.split(' ').str[0])*60
mins = pd.to_numeric(train_df['Duration'].str.replace(r'\D+', ' ', regex=True).str.split(' ').str[1])
# some rows dont have mins. hence will become NaN if add hr and min columns tgt. 
train_df['Duration'] = np.where(mins.isnull(), hour, hour + mins )

hour = pd.to_numeric(test_df['Duration'].str.replace(r'\D+', ' ', regex=True).str.split(' ').str[0])*60
mins = pd.to_numeric(test_df['Duration'].str.replace(r'\D+', ' ', regex=True).str.split(' ').str[1])
test_df['Duration'] = np.where(mins.isnull(), hour, hour + mins )

# Define categorical and numerical columns
categorical_columns = ['Airline', 'Week_Day']
ordinal_categorical_columns = ['Total_Stops']  # Only Total_Stops is ordinal categorical
numerical_columns = ['Duration']

# Create transformers for preprocessing
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder())
])

ordinal_categorical_transformer = Pipeline([
    ('ordinal', OrdinalEncoder(categories=[['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']]))
])

numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# ColumnTransformer for different types of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
        ('ordinal_cat', ordinal_categorical_transformer, ordinal_categorical_columns),
        ('num', numerical_transformer, numerical_columns)
    ],
    remainder='drop'  
)

# Transform the test data using the pipeline
transformed_train_df = preprocessor.fit_transform(train_df)
transformed_train_df = pd.DataFrame(transformed_train_df.toarray(), columns=preprocessor.get_feature_names_out())


In [3]:
transformed_test_df = preprocessor.transform(test_df)
transformed_test_df = pd.DataFrame(transformed_test_df.toarray(), columns=preprocessor.get_feature_names_out())


In [4]:
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elastic Net': ElasticNet(),
    'Support Vector Regressor': SVR(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Naive Bayes Regressor': GaussianNB()
}

In [5]:
def model_evaluate(models, X_train, X_test, y_train, y_test):
    report = {}
    import numpy as np
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    for model in models:
        model_name = model
        model_obj = models[model_name]
        model_obj.fit(X_train, y_train)
        y_predict = model_obj.predict(X_test)
        mae = mean_absolute_error(y_test, y_predict)
        report[model_name] = mae
    return report

In [6]:
accuracy_report = model_evaluate(models, transformed_train_df, transformed_test_df, y_train, y_test)

In [7]:
accuracy_report

{'Linear Regression': 11127.289217521527,
 'Lasso': 11127.230965342907,
 'Ridge': 11120.45775798096,
 'Elastic Net': 11088.592297907571,
 'Support Vector Regressor': 12021.337128630083,
 'Decision Tree Regressor': 11329.1286944678,
 'Random Forest Regressor': 11257.373703491046,
 'Naive Bayes Regressor': 10660.581804567577}

In [8]:
soreted_report = list(sorted(accuracy_report.items(), key = lambda x: x[1]))

In [9]:
best_model = list(soreted_report)[0][0]
best_model

'Naive Bayes Regressor'

In [10]:
model_obj = models[best_model]
model_obj

In [12]:
train_df['Week_Day'].unique()


array(['Sunday', 'Wednesday', 'Friday', 'Monday', 'Tuesday', 'Saturday',
       'Thursday'], dtype=object)

In [2]:
import pandas as pd

In [6]:
day = (pd.to_datetime('23/03/2021', format = '%d/%m/%Y')).strftime('%A')

In [7]:
day

'Tuesday'