In [12]:
!pip install pandas joblib scikit-learn



Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/fe/6b/db949ed5ac367987b1f250f070f340b7715d22f0c9c965bdf07de6ca75a3/scikit_learn-1.3.2-cp312-cp312-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/c6/a1/357e4cd43af2748e1e0407ae0e9a5ea8aaaa6b702833c81be11670dcbad8/scipy-1.11.4-cp312-cp312-win_amd64.whl.metadata
  Downloading scipy-1.11.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.4 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.4 kB ? eta -:--:--
     -------------------------------- ----- 51.2/60.4 kB 518.5 kB/


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import pandas as pd
import joblib
import os

# Use the project root as the working directory. The original machine-specific
# location remains the default; set FLIGHT_PRICE_PROJECT_ROOT to run this
# notebook from another checkout without editing the cell.
os.chdir(os.environ.get(
    'FLIGHT_PRICE_PROJECT_ROOT',
    'C:/Users/Saadiq Mahmood/OneDrive/UA92/FlightPricePrediction',
))

# Load the trained model and the fitted preprocessing objects.
# NOTE(review): these paths begin with '/', so they are resolved from the
# filesystem (drive) root rather than from the working directory set above,
# while other cells load from 'holidaytest/data/...'. Confirm which location
# is intended before changing them.
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts that come from a trusted source.
model = joblib.load('/data/random_forest_model.pkl')
label_encoders = joblib.load('/data/label_encoder.pkl')
scaler = joblib.load('/data/min_max_scaler.pkl')
encoder_mappings = joblib.load('/data/label_encoder_mappings.pkl')


def predict_flight_price(input_data):
    """Predict flight prices for a frame of raw feature rows.

    Every column named in the module-level ``label_encoders`` is passed
    through its encoder, the ``duration`` and ``days_left`` columns are passed
    through the module-level ``scaler``, and the resulting frame is handed to
    the module-level ``model``.

    All preprocessing happens on a copy, so the caller's frame is left
    untouched and can be passed in again. (Working in place overwrote the raw
    labels with their codes on the first call.)

    Args:
        input_data: pandas DataFrame with one row per flight, containing every
            column named in ``label_encoders`` plus ``duration`` and
            ``days_left``.

    Returns:
        The value returned by ``model.predict`` for the preprocessed rows, or
        ``None`` when a ``ValueError`` is raised during preprocessing or
        prediction (the error message is printed in that case).
    """
    # Work on a private copy so the raw input survives the call unchanged.
    prepared = input_data.copy()
    try:
        # Encode the categorical columns.
        for column in label_encoders:
            prepared[column] = label_encoders[column].transform(prepared[column])

        # Scale the numerical columns; 'duration' and 'days_left' are the two
        # columns this function hands to the scaler.
        numeric_columns = ['duration', 'days_left']
        prepared[numeric_columns] = scaler.transform(prepared[numeric_columns])

        return model.predict(prepared)
    except ValueError as e:
        # Best-effort behaviour kept from the original: report and return None.
        print("Error during prediction:", e)
        return None

# Feature columns, in the order used to build the model's input frame.
features = [
    'airline', 'flight', 'source_city', 'departure_time', 'stops',
    'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
]

# Smoke test: a single hand-written flight, with the values listed in the same
# order as `features` so that zip() pairs each value with its column.
sample_data = pd.DataFrame(
    [dict(zip(features, [
        'Vistara',
        'UK-951',
        'Delhi',
        'Afternoon',
        'zero',
        'Evening',
        'Mumbai',
        'Economy',
        2.17,  # duration, in hours
        1,     # days until the flight
    ]))],
    columns=features,
)

# Run the sample through the prediction helper and show the result.
predicted_price = predict_flight_price(sample_data)
print("Predicted Price:", predicted_price)


Predicted Price: [6656.52]


In [13]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

class CustomLabelEncoder:
    """Map labels to integer codes, with one fallback code for unseen labels.

    Codes are handed out in order of first appearance in the fitted data, so
    fitting the same data always produces the same mapping. Enumerating a
    ``set`` instead can order string labels differently from one interpreter
    run to the next, because string hashing is randomized per process.

    The class name and the ``label_map`` attribute are unchanged.
    NOTE(review): the pickled encoders loaded in this notebook presumably
    depend on both - confirm before renaming either.
    """

    def __init__(self):
        # label -> integer code; filled in by fit().
        self.label_map = {}

    def fit(self, data):
        """Learn the label -> code mapping from an iterable of hashable labels."""
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_labels = list(dict.fromkeys(data))
        self.label_map = {label: idx for idx, label in enumerate(unique_labels)}
        # Reserved fallback entry, one past the last real code. If the data
        # itself contains the label 'unknown', it shares this fallback code.
        self.label_map['unknown'] = len(unique_labels)

    def transform(self, data):
        """Return the list of codes for ``data``; unseen labels get the fallback code.

        Raises KeyError if called with labels before ``fit`` has created the
        'unknown' entry.
        """
        return [self.label_map.get(label, self.label_map['unknown']) for label in data]

    def fit_transform(self, data):
        """Fit on ``data`` and return its codes in a single call."""
        self.fit(data)
        return self.transform(data)

# Restore the persisted estimator and preprocessing artifacts (loaded in order).
model, label_encoders, scaler, encoder_mappings = map(joblib.load, (
    'holidaytest/data/random_forest_model.pkl',
    'holidaytest/data/label_encoder.pkl',
    'holidaytest/data/min_max_scaler.pkl',
    'holidaytest/data/label_encoder_mappings.pkl',
))

# Test features and their targets.
X_test, y_test = map(pd.read_csv, (
    'holidaytest/data/X_test.csv',
    'holidaytest/data/y_test.csv',
))

# Score the test rows with the trained model.
y_pred = model.predict(X_test)

# Error metrics:
#   MAE  - mean absolute difference between actual and predicted values
#   MSE  - mean of the squared errors
#   RMSE - square root of MSE, i.e. the error in the target's own units
#   R²   - share of the target's variance explained by the predictions
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report every metric under a common heading.
print("Model Evaluation Metrics:")
for caption, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error(RMSE): ", rmse),
    ("R-squared (R²):", r2),
):
    print(caption, value)

Model Evaluation Metrics:
Mean Absolute Error (MAE): 902.9512318267175
Mean Squared Error (MSE): 5871784.485875207
Root Mean Squared Error(RMSE):  2423.1765280051736
R-squared (R²): 0.9886091437340363


In [18]:
import pandas as pd

# Read the raw flight records.
data = pd.read_csv('data/flight_dataset.csv')

# Discard the leftover index column when the CSV carries one.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Holiday windows, expressed as values of the days_left column.
# Author's stated assumption: February 11th is day 50 and March 31st is day 1.
# NOTE(review): counting back from March 31st = day 1 puts March 14th-16th on
# days 16-18, not 17-19 as encoded below - confirm the intended window.
holiday_dates = {
    'Manipur Holiday': range(17, 20)  # March 14th to 16th
    # Add more holidays as needed
}

# Holiday indicator: 1 when days_left falls inside any window above, else 0.
all_holiday_days = [day for days in holiday_dates.values() for day in days]
data['is_holiday'] = data['days_left'].isin(all_holiday_days).astype('int64')

# Show five random rows to eyeball the new column.
print(data.sample(5))



          airline  flight source_city departure_time stops arrival_time  \
231337    Vistara  UK-958      Mumbai      Afternoon   one      Evening   
74720     Vistara  UK-970      Mumbai        Morning   one      Evening   
197438  Air_India  AI-430     Chennai        Morning   one    Afternoon   
224753    Vistara  UK-995       Delhi        Morning   one        Night   
246763  Air_India  AI-610   Bangalore        Evening   one        Night   

       destination_city     class  duration  days_left  price  is_holiday  
231337        Bangalore  Business      6.42         17  62560           1  
74720         Hyderabad   Economy     11.17         32   4099           0  
197438        Bangalore   Economy     26.58         45   6308           0  
224753          Chennai  Business     12.75         46  57992           0  
246763            Delhi  Business     26.08         26  42521           0  


In [19]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

class CustomLabelEncoder:
    """Map labels to integer codes, with one fallback code for unseen labels.

    Codes are handed out in order of first appearance in the fitted data, so
    fitting the same data always produces the same mapping. Enumerating a
    ``set`` instead can order string labels differently from one interpreter
    run to the next, because string hashing is randomized per process.

    The class name and the ``label_map`` attribute are unchanged.
    NOTE(review): the pickled encoders loaded in this notebook presumably
    depend on both - confirm before renaming either.
    """

    def __init__(self):
        # label -> integer code; filled in by fit().
        self.label_map = {}

    def fit(self, data):
        """Learn the label -> code mapping from an iterable of hashable labels."""
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_labels = list(dict.fromkeys(data))
        self.label_map = {label: idx for idx, label in enumerate(unique_labels)}
        # Reserved fallback entry, one past the last real code. If the data
        # itself contains the label 'unknown', it shares this fallback code.
        self.label_map['unknown'] = len(unique_labels)

    def transform(self, data):
        """Return the list of codes for ``data``; unseen labels get the fallback code.

        Raises KeyError if called with labels before ``fit`` has created the
        'unknown' entry.
        """
        return [self.label_map.get(label, self.label_map['unknown']) for label in data]

    def fit_transform(self, data):
        """Fit on ``data`` and return its codes in a single call."""
        self.fit(data)
        return self.transform(data)

# Restore the persisted estimator and preprocessing artifacts (loaded in order).
model, label_encoders, scaler, encoder_mappings = (
    joblib.load(artifact_path)
    for artifact_path in (
        'holidaytest/data/random_forest_model.pkl',
        'holidaytest/data/label_encoder.pkl',
        'holidaytest/data/min_max_scaler.pkl',
        'holidaytest/data/label_encoder_mappings.pkl',
    )
)

# Holiday test split.
# NOTE(review): these are the same paths the earlier evaluation cell reads, so
# the "holiday" variant is presumably whatever is currently stored under
# holidaytest/data - confirm the files were regenerated with the holiday flag.
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'holidaytest/data/X_test.csv',
        'holidaytest/data/y_test.csv',
    )
)

# Score the test rows with the trained model.
y_pred = model.predict(X_test)

# Same four metrics as the baseline evaluation: MAE, MSE, RMSE and R².
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report every metric under a common heading.
print("Model Evaluation Metrics:")
for caption, value in [
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error(RMSE): ", rmse),
    ("R-squared (R²):", r2),
]:
    print(caption, value)

Model Evaluation Metrics:
Mean Absolute Error (MAE): 901.3443370743238
Mean Squared Error (MSE): 5897951.150950245
Root Mean Squared Error(RMSE):  2428.5697747749073
R-squared (R²): 0.9885583822114453


In [21]:
import joblib

# Read back the persisted label -> code tables, keyed by feature name.
encoder_mappings = joblib.load('backend/label_encoder_mappings.pkl')

# One section per feature: a header line, one indented "label: code" line per
# entry, then a blank gap before the next feature.
for feature, table in encoder_mappings.items():
    section = [f"Feature: {feature}"]
    section.extend(f"  {label}: {code}" for label, code in table.items())
    print("\n".join(section))
    print("\n")


Feature: airline
  AirAsia: 0
  Vistara: 1
  Air_India: 2
  Indigo: 3
  GO_FIRST: 4
  SpiceJet: 5
  unknown: 6


Feature: source_city
  Bangalore: 0
  Chennai: 1
  Delhi: 2
  Kolkata: 3
  Hyderabad: 4
  Mumbai: 5
  unknown: 6


Feature: departure_time
  Morning: 0
  Early_Morning: 1
  Night: 2
  Late_Night: 3
  Afternoon: 4
  Evening: 5
  unknown: 6


Feature: stops
  zero: 0
  one: 1
  two_or_more: 2
  unknown: 3


Feature: destination_city
  Bangalore: 0
  Chennai: 1
  Delhi: 2
  Kolkata: 3
  Hyderabad: 4
  Mumbai: 5
  unknown: 6


Feature: class
  Business: 0
  Economy: 1
  unknown: 2


