In [None]:
import pandas as pd
import numpy as np

# Read the flights table from disk and show it as the cell's output.
df = pd.read_csv(filepath_or_buffer='flights_updated.csv')
df

In [None]:
# A notebook cell only renders the value of its final expression, so the
# airline counts are sent through display() explicitly; otherwise they are
# computed and silently thrown away.
display(df['Airline'].value_counts())

# Frequency of each departure city (rendered as the cell's result).
df['Departure City'].value_counts()

In [None]:
# Turn literal 'N/A' strings into real missing values so that isnull()
# counts them.
df = df.replace('N/A', np.nan)

# Per-column missing-value counts. display() is required because only the
# last expression of a cell is rendered automatically.
display(df.isnull().sum())

# Remove the two columns named below; errors='ignore' keeps the cell
# re-runnable once they are already gone.
df = df.drop(columns=['Flight Number', 'Return Date'], errors='ignore')
df.head()


In [None]:
# Force Price and Duration to numeric dtype; any value that cannot be
# parsed becomes NaN (errors='coerce') instead of raising.
for numeric_col in ('Price', 'Duration'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

print(df.head())

In [None]:
import pandas as pd
from datetime import datetime

# Define the time categorization function
def get_time_of_day(time_str):
    """Bucket a timestamp string into a coarse time-of-day label.

    Parameters
    ----------
    time_str : str or missing value
        Timestamp formatted as "%Y-%m-%d %H:%M" (e.g. "2024-01-31 06:45").
        Missing values (None / NaN / NaT) are accepted.

    Returns
    -------
    str or None
        "early morning" for hours 0-6, "morning" for 7-11, "afternoon" for
        12-16, "evening" for 17-19, "night" for 20-23. None when the input
        is missing.

    Raises
    ------
    ValueError
        If a non-missing string does not match the expected format.
    """
    # pandas represents empty cells as NaN (a float), which strptime rejects
    # with a TypeError; pass missing values through as None instead.
    if pd.isna(time_str):
        return None

    hour = datetime.strptime(time_str, "%Y-%m-%d %H:%M").hour

    # hour is always within 0..23, so each branch only needs its upper bound.
    if hour < 7:
        return "early morning"
    if hour < 12:
        return "morning"
    if hour < 17:
        return "afternoon"
    if hour < 20:
        return "evening"
    return "night"

# Read the dataset from disk again and preview it.
# NOTE(review): this rebinds df, so modifications made to the previous df
# object by earlier cells are not carried over - confirm that is intended.
df = pd.read_csv(filepath_or_buffer='flights_updated.csv')
print(df.head())

# Raw date/time columns to discard; names that are not present in the frame
# are skipped silently because of errors='ignore'.
raw_time_columns = ['Departure Date', 'Arrival Time', 'Departure Time', 'Return Date']
df.drop(columns=raw_time_columns, inplace=True, errors='ignore')

# Write the trimmed frame to a new CSV without the index column.
df.to_csv(path_or_buf='flights_updated2.csv', index=False)

# Preview the result as the cell's output.
df.head()


In [None]:
# One-hot encode the categorical columns (airline, cities, time-of-day labels).

# 'Flight Number' is removed before encoding. errors='ignore' matches every
# other drop in this notebook and avoids a KeyError when the column has
# already been removed.
df = df.drop(columns=['Flight Number'], errors='ignore')

# Source column -> prefix used for the generated indicator columns.
# Dict order is the order in which the indicator groups are appended.
categorical_prefixes = {
    'Airline': 'airline',
    'Departure City': 'source',
    'Arrival City': 'dest',
    'Arrival Time of Day': 'arrival',
    'Departure Time of Day': 'departure',
}

# A single get_dummies call replaces each listed column with its indicator
# columns (named "<prefix>_<value>") appended after the untouched columns.
# A missing source column still raises KeyError, so a lost feature is not
# silently skipped.
df = pd.get_dummies(
    df,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
)

print(df.head())

In [None]:

from sklearn.preprocessing import MinMaxScaler

# New hour-of-day column -> raw column it is parsed from. Each raw column is
# optional; unparseable values become NaT (errors='coerce') and therefore a
# missing hour.
# NOTE(review): 'Departure Hour' is read from 'Departure Date' - confirm that
# column actually carries a time component.
hour_sources = {
    'Departure Hour': 'Departure Date',
    'Arrival Hour': 'Arrival Time',
}
for hour_col, raw_col in hour_sources.items():
    if raw_col in df.columns:
        df[hour_col] = pd.to_datetime(df[raw_col], errors='coerce').dt.hour

# Discard the raw columns; names that are absent are ignored.
obsolete_columns = ['Departure Time', 'Arrival Time', 'Departure Date', 'Return Date', 'Flight Number']
df.drop(columns=obsolete_columns, inplace=True, errors='ignore')

df


In [None]:
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fix the Python, NumPy and TensorFlow seeds so that training is repeatable,
# in line with the fixed random_state used for the split below.
tf.keras.utils.set_random_seed(42)

print(df.head())

# X is every column except the target; y is the target column (Price).
# The features are cast to float32 because on pandas >= 2.0 get_dummies
# yields bool columns, and a frame mixing bool and float columns converts to
# an object array that Keras cannot turn into a tensor.
X = df.drop(columns=['Price']).astype('float32')
y = df['Price']
print(X.dtypes)

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fully connected regression network: 128 -> 64 -> 32 -> 1.
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
# This layer must sit on its own line: when it trails the previous statement
# after a '#', Python treats it as comment text and the layer is never added.
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Output layer: single unbounded value

# MSE is the training loss; MAE is tracked as an additional metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# validation_split=0.2 reserves a fifth of the training rows for the
# per-epoch validation loss stored in `history`.
history = model.fit(X_train, y_train, epochs=75, batch_size=32, validation_split=0.2)

# Evaluate error metrics on the held-out test rows.
y_pred_nn = model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Neural Network - Mean Squared Error: {mse_nn:.2f}, R2 Score: {r2_nn:.2f}")





In [None]:
# Plain-text preview of the first five rows of the current frame.
print(df.head(n=5))

In [None]:
import matplotlib.pyplot as plt

# Draw the per-epoch training and validation loss recorded by model.fit
# on a single set of axes.
fig, ax = plt.subplots()
loss_series = (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
)
for history_key, legend_label in loss_series:
    ax.plot(history.history[history_key], label=legend_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss (MSE)')
ax.legend()
plt.show()