# In [None]:
import pandas as pd
import seaborn as sns   
import matplotlib.pyplot as plt 
import xgboost as xgb 
import pickle 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, silhouette_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.cluster import KMeans

# Read the EDA output into a DataFrame.
fly = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Projects/Flight Delay Prediction/df_EDA.csv')
# Preview of the first rows; the result is discarded when this is not the
# last expression of a notebook cell.
fly.head(3)

# Discard the columns that are not part of the feature/target set chosen
# further down in this script.
fly = fly.drop(
    columns=[
        'Month_Str',
        'DayOfWeek_Str',
        'ArrDelay',
        'FlightDate',
        'Unnamed: 0',
        'Quarter',
        'DayofMonth',
    ],
)

# Define functions for handling categorical values
def plane(value):
    """Return *value* if it is one of the four retained airlines, else None.

    The retained names are 'American Airlines', 'Delta Airlines',
    'Southwest Airlines' and 'United Airlines'. Any other input (including
    None or an empty string) yields None.
    """
    retained_airlines = (
        'American Airlines',
        'Delta Airlines',
        'Southwest Airlines',
        'United Airlines',
    )
    return value if value in retained_airlines else None

def heli(value):
    """Return *value* if it is one of the five retained city names, else None.

    The retained names are 'Chicago, IL', 'Atlanta, GA', 'New York, NY',
    'Denver, CO' and 'Dallas/Fort Worth, TX'.
    """
    retained_cities = (
        'Chicago, IL',
        'Atlanta, GA',
        'New York, NY',
        'Denver, CO',
        'Dallas/Fort Worth, TX',
    )
    if value in retained_cities:
        return value
    return None

def ship(value):
    """Pass through one of the five retained city names; map anything else to None.

    The retained names are 'Chicago, IL', 'Atlanta, GA', 'New York, NY',
    'Denver, CO' and 'Dallas/Fort Worth, TX'.
    """
    allowed = (
        'Chicago, IL',
        'Atlanta, GA',
        'New York, NY',
        'Denver, CO',
        'Dallas/Fort Worth, TX',
    )
    return None if value not in allowed else value

# Run each categorical column through its filter function; values outside the
# retained set come back as None.
for column, keep_if_retained in (
    ('Airlines', plane),
    ('OriginCityName', heli),
    ('DestCityName', ship),
):
    fly[column] = fly[column].apply(keep_if_retained)

# Remove every row that now has a missing value in any column.
fly = fly.dropna()

# Helper for cleaning DepDelay: negative values are mapped to None so that dropna() can remove those rows
def refine_dep_delay(value):
    """Return *value* unchanged unless it is negative, in which case return None.

    Zero counts as non-negative and is passed through.
    """
    return None if value < 0 else value

# Replace negative departure delays with missing values, then drop every row
# that contains a missing value.
fly = fly.assign(DepDelay=fly['DepDelay'].apply(refine_dep_delay)).dropna()

# Define features and target variable
categorical_columns = ['Airlines', 'OriginCityName', 'DestCityName']
numeric_columns = ['Month', 'DayOfWeek', 'DepDelay', 'AirTime', 'Distance']
target_column = 'Flight_Status'

X = fly[categorical_columns + numeric_columns]
y = fly[target_column]

# Split the raw rows BEFORE fitting any preprocessing. Fitting the scaler and
# encoder on the full data set would let statistics of the test rows leak into
# the features the model is trained on. test_size and random_state are
# unchanged, so the row partition is the same as when the split was done on
# the already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),  # Standardize numeric columns
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),  # One-hot encode categorical columns
    ])

# Learn scaling parameters and categories from the training rows only, then
# apply that fitted transformer to the held-out rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so that any code that still refers to the full transformed matrix
# continues to work; it is produced by the training-fitted transformer.
X_preprocessed = preprocessor.transform(X)

# 5-fold cross-validation on the training split, scored by accuracy. The
# estimator passed here is used for validation only; the model that gets
# evaluated and saved is fitted separately below.
cv_results = cross_validate(
    estimator=xgb.XGBClassifier(eval_metric='logloss'),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)

# Fit a fresh XGBoost classifier on the whole training split.
model = xgb.XGBClassifier(eval_metric='logloss').fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Score the held-out predictions. Precision, recall and F1 use the 'weighted'
# average across classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    weighted_metric(y_test, y_pred, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the scalar metrics with three decimals, then the tabular summaries.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.3f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Mean train/validation accuracy over the cross-validation folds.
print("Cross-Validation Results:")
print(f"Mean Training Accuracy: {cv_results['train_score'].mean():.3f}")
print(f"Mean Validation Accuracy: {cv_results['test_score'].mean():.3f}")

# Transform all rows with the already-fitted preprocessor so the clustering
# sees the same feature representation as the classifier.
X_clustering = preprocessor.transform(X)

# Partition the rows into five K-Means clusters (seeded for repeatability).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit(X_clustering).labels_

# Store each row's cluster label on the DataFrame for the plots below.
fly['Cluster'] = clusters

# Mean silhouette coefficient over all rows.
# NOTE(review): this is computed on every row; if the data set is large this
# may be slow - silhouette_score has a sample_size option worth considering.
silhouette_avg = silhouette_score(X=X_clustering, labels=clusters)
print(f"Silhouette Score: {silhouette_avg:.3f}")

# Pairwise correlations between the numeric feature columns, drawn as an
# annotated heatmap.
corr_matrix = fly[numeric_columns].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

# Scatter of Distance against AirTime, coloured by cluster label.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=fly,
    x='Distance',
    y='AirTime',
    hue='Cluster',
    palette='tab10',
    marker='o',
)
plt.title('Clusters of Flights')
plt.xlabel('Distance')
plt.ylabel('AirTime')
plt.legend(title='Cluster')
plt.show()

# Persist the trained classifier and the fitted preprocessor as separate
# pickle files, in that order, in the current working directory.
model_filename = 'Delay.pkl'
for pickle_path, fitted_object in (
    (model_filename, model),
    ('Pre-process.pkl', preprocessor),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)
