The Flight Fare Prediction project aims to develop a machine learning model that can predict flight ticket prices based on various features such as airline, source, destination, date of journey, and flight duration. This project utilizes regression techniques to forecast continuous values (i.e., flight prices) and involves several steps, including data preprocessing, exploratory data analysis (EDA), model training, and evaluation.




In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the training spreadsheet (adjust the path if the file lives elsewhere)
data = pd.read_excel("Data_Train.xlsx")

# Preview the leading rows to get a feel for the table layout
print("Initial Data:", data.head(), sep="\n")

# Show the column labels exactly as they were read from the file
print("Columns in the DataFrame:", data.columns)

# Cleaning, step 1: discard every row that holds at least one missing value
data.dropna(axis=0, how="any", inplace=True)

# Cleaning, step 2: trim stray whitespace surrounding the column labels
data.columns = data.columns.str.strip()

def _clock_parts(column):
    """Return ``(hour, minute)`` Series parsed from a column of clock times.

    Only the first ``H:MM`` / ``HH:MM`` token of each value is parsed, so an
    entry that carries extra text after the time (for example a date suffix
    or a seconds component) still yields its hour and minute instead of being
    silently coerced to NaN. Values without such a token become NaN in both
    returned Series.
    """
    token = column.astype(str).str.extract(r'(?<!\d)(\d{1,2}:\d{2})', expand=False)
    parsed = pd.to_datetime(token, format='%H:%M', errors='coerce')
    return parsed.dt.hour, parsed.dt.minute


# Derive day/month features from 'Date_of_Journey' if it exists.
# The column is parsed once and both components are read from that result.
if 'Date_of_Journey' in data.columns:
    journey_date = pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y', dayfirst=True)
    data['Journey_day'] = journey_date.dt.day
    data['Journey_month'] = journey_date.dt.month

# Derive hour/minute features from 'Dep_Time' and 'Arrival_Time' if they exist
if 'Dep_Time' in data.columns:
    data['Dep_hour'], data['Dep_min'] = _clock_parts(data['Dep_Time'])

if 'Arrival_Time' in data.columns:
    data['Arrival_hour'], data['Arrival_min'] = _clock_parts(data['Arrival_Time'])

# Handle missing values in 'Total_Stops' by dropping the affected rows.
# The membership check mirrors the other optional-column guards in this
# script, so a dataset without 'Total_Stops' does not fail with a KeyError.
if 'Total_Stops' in data.columns:
    data.dropna(subset=['Total_Stops'], inplace=True)

# Alternative (not enabled): instead of dropping rows, fill the gaps with the
# mode (most common value):
# mode_value = data['Total_Stops'].mode()[0]
# data['Total_Stops'] = data['Total_Stops'].fillna(mode_value)

# Prepare features and target variable.
# Besides the target and the raw date/time text columns, also exclude any
# 'Unnamed: N' columns: pandas assigns that label to header-less columns,
# which here looks like a saved row index (0, 1, 2, ...) and would otherwise
# be fed to the model as a spurious feature.
index_like_cols = [col for col in data.columns if str(col).startswith('Unnamed:')]
X = data.drop(
    ['Price', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time', *index_like_cols],
    axis=1,
    errors='ignore',  # tolerate datasets where some of these columns are absent
)
y = data['Price']

# Check for missing values in features
print("Missing values in features before train-test split:")
print(X.isnull().sum())

# Split the dataset into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training split (seeded for repeatability)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict prices for the held-out split
y_pred = model.predict(X_test)

# Score the predictions against the true prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R² Score: {r2:.2f}")
print(f"Mean Squared Error: {mse:.2f}")

# Optional: rank the input columns by the importance the fitted forest
# assigned to them, most influential first
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importances:", feature_importance_df, sep="\n")

Initial Data:
   Unnamed: 0  Total_Stops  Price  Journey_day  Journey_month  Dep_hour  \
0           0            0   3897           24              3        22   
1           1            2   7662            1              5         5   
2           2            2  13882            9              6         9   
3           3            1   6218           12              5        18   
4           4            1  13302            1              3        16   

   Dep_min  Arrival_hour  Arrival_min  Duration_hours  ...  \
0       20             1           10               2  ...   
1       50            13           15               7  ...   
2       25             4           25              19  ...   
3        5            23           30               5  ...   
4       50            21           35               4  ...   

   Airline_Vistara Premium economy  Source_Chennai  Source_Delhi  \
0                            False           False         False   
1                         