In [None]:
import pandas as pd

import os

# Workbook this notebook reads its training data from.
DATA_FILE = 'Data_Train.xlsx'

# Upload the file manually only if it is not already in the working directory,
# so re-running this cell does not prompt for the file again.
# `google.colab` is imported lazily: it is only needed when an upload happens.
if os.path.exists(DATA_FILE):
    uploaded = {}  # nothing was uploaded during this run
else:
    from google.colab import files
    uploaded = files.upload()

# Read the Excel file
df = pd.read_excel(DATA_FILE)
df.head()

In [None]:
# List every column label of the loaded frame.
print("Column names:", list(df.columns))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Drop rows with missing values (simplest way to clean for now)
df = df.dropna()

# Parse each timestamp column once and reuse the parsed Series for every
# component taken from it.
# NOTE(review): no explicit `format`/`dayfirst` is passed, so pandas infers how
# to read 'Date_of_Journey'. If the sheet stores day-first dates, confirm that
# day and month are not swapped for ambiguous values such as 1/05.
journey_date = pd.to_datetime(df['Date_of_Journey'])
dep_time = pd.to_datetime(df['Dep_Time'])
arrival_time = pd.to_datetime(df['Arrival_Time'])

# Append the numeric components, then remove the raw text columns they came from.
df = df.assign(
    Journey_day=journey_date.dt.day,
    Journey_month=journey_date.dt.month,
    Dep_hour=dep_time.dt.hour,
    Dep_min=dep_time.dt.minute,
    Arrival_hour=arrival_time.dt.hour,
    Arrival_min=arrival_time.dt.minute,
).drop(columns=['Date_of_Journey', 'Dep_Time', 'Arrival_Time'])

# Convert Duration to total minutes
def duration_to_minutes(duration):
    """Convert a duration string such as "2h 50m", "19h" or "45m" to minutes.

    Args:
        duration: Text made of an hour part ending in "h" and/or a minute
            part ending in "m"; either part may be absent.

    Returns:
        int: hours * 60 + minutes, with a missing part counted as 0.

    Raises:
        ValueError: If the text in front of an "h" or "m" is not an integer.
    """
    # Put a space after every unit letter so "2h50m" also splits into "2h", "50m".
    spaced = duration.strip().replace("h", "h ").replace("m", "m ")
    hours = minutes = 0
    for token in spaced.split():
        # After the spacing step a unit letter can only be a token's last character.
        if token.endswith('h'):
            hours = int(token[:-1])
        elif token.endswith('m'):
            minutes = int(token[:-1])
    return 60 * hours + minutes

# Replace the textual duration with its length in minutes.
df['Duration'] = df['Duration'].map(duration_to_minutes)

# 'Route' and 'Additional_Info' are left out of the feature set for simplicity.
df.drop(columns=['Route', 'Additional_Info'], inplace=True)

# Integer-encode each categorical column with its own freshly fitted encoder.
for col in ('Airline', 'Source', 'Destination', 'Total_Stops'):
    df[col] = LabelEncoder().fit_transform(df[col])


In [None]:
# Features and target
X = df.drop(columns='Price')
y = df['Price']

# Split data (seeded, so the same rows land in train/test on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model. The forest is seeded like the split; without a seed its
# bootstrap sampling differs per run and the scores below would not repeat.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Candidate regressors, keyed by the label shown in the results table.
# The tree-based models are seeded so the comparison repeats from run to run.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}

results = []

# The loop uses its own names (`candidate`, `candidate_pred`) so it does not
# overwrite the `model` / `y_pred` objects created by the training cell.
for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)

    r2 = r2_score(y_test, candidate_pred)
    rmse = np.sqrt(mean_squared_error(y_test, candidate_pred))
    mae = mean_absolute_error(y_test, candidate_pred)

    # One row per model; rounded only for display.
    results.append({
        'Model': name,
        'R2 Score': round(r2, 3),
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2)
    })

# Show results
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Build the results table from the collected per-model metric rows.
results_df = pd.DataFrame(results)

# Set the plot style
sns.set(style="whitegrid")

# One bar chart per metric: (column, palette, title, y-axis label, y-limits).
# Only the R² chart pins its y-axis, to the 0..1 range.
metric_plots = [
    ("R2 Score", "Blues_d", "Model Comparison - R² Score (Higher is Better)", "R² Score", (0, 1)),
    ("RMSE", "Oranges_d", "Model Comparison - RMSE (Lower is Better)", "Root Mean Squared Error", None),
    ("MAE", "Greens_d", "Model Comparison - MAE (Lower is Better)", "Mean Absolute Error", None),
]

for metric, palette_name, title, y_label, y_limits in metric_plots:
    plt.figure(figsize=(10, 4))
    sns.barplot(x="Model", y=metric, data=results_df, palette=palette_name)
    plt.title(title)
    plt.ylabel(y_label)
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.show()


In [None]:
# Predict the price of a single, manually described flight.

# Step 1: Re-load the dataset to recover the original string labels
# (the working frame `df` holds integer codes after preprocessing).
# Rows with missing values are dropped here as well, so the encoders below are
# fitted on the same rows as the ones used during preprocessing.
df_raw = pd.read_excel('Data_Train.xlsx').dropna()

# Step 2: Re-fit one LabelEncoder per categorical column on the original strings
categorical_cols = ['Airline', 'Source', 'Destination', 'Total_Stops']
encoders = {col: LabelEncoder().fit(df_raw[col]) for col in categorical_cols}

# Step 3: Provide manual input
manual_input = {
    'Airline': 'Air India',
    'Source': 'Mumbai',
    'Destination': 'Hyderabad',
    'Total_Stops': 'non-stop',
    'Journey_day': 20,
    'Journey_month': 6,
    'Dep_hour': 14,
    'Dep_min': 30,
    'Arrival_hour': 16,
    'Arrival_min': 15,
    'Duration': 105  # Example: 1h 45m = 105 mins
}

# Step 4: Encode manual input. Numeric fields are copied unchanged; each
# categorical field is replaced by the integer code of its label.
manual_input_encoded = dict(manual_input)
for col, encoder in encoders.items():
    label = manual_input[col]
    if label not in encoder.classes_:
        # Fail with the list of valid labels instead of the encoder's generic error.
        raise ValueError(
            f"Unknown {col} {label!r}; expected one of {list(encoder.classes_)}"
        )
    manual_input_encoded[col] = encoder.transform([label])[0]

# Step 5: Convert to DataFrame and Predict
# Select the columns by the training frame's names so the order matches too.
input_df = pd.DataFrame([manual_input_encoded])[X_train.columns]
# `model` is whichever fitted estimator an earlier cell last bound to that name.
predicted_price = model.predict(input_df)[0]

print(f"\n🎯 Predicted Flight Price: ₹{round(predicted_price, 2)}")
