In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from docx import Document
from scipy.stats import randint

from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [None]:
# Data preparation: load the flight/weather table, derive the binary
# `delayed` target, one-hot encode the categorical columns, split into
# train/test, and balance the classes in the TRAINING partition only.
df = pd.read_csv('Data/trainer2.csv')

# Target: 1 when the recorded departure delay is positive, else 0.
# A missing delay compares False against 0 and is therefore labelled 0,
# exactly as the previous row-wise lambda did.
df['delayed'] = (df['departure.delay'] > 0).astype(int)

# Columns passed through to the model without encoding.
correlColumns = [
    'Temperature (°F) Max', 'flight.number',
    'Temperature (°F) Avg', 'Temperature (°F) Min',
    'Dew Point (°F) Max', 'Dew Point (°F) Avg', 'Dew Point (°F) Min', 'Humidity (%) Max',
    'Humidity (%) Avg', 'Humidity (%) Min', 'Wind Speed (mph) Max', 'Wind Speed (mph) Avg',
    'Wind Speed (mph) Min', 'Pressure (in) Max', 'Pressure (in) Avg', 'Pressure (in) Min',
    'hour', 'day', 'month', 'year'
]
# Columns that are one-hot encoded.
categorical = ['type', 'status', 'departure.iataCode', 'departure.icaoCode', 'arrival.iataCode', 'arrival.icaoCode']

encoder = OneHotEncoder(drop = 'first', handle_unknown = 'ignore', sparse_output = False)
encodedTrain = encoder.fit_transform(df[categorical])
# Reuse df's index so the axis=1 concat below aligns row-for-row even if
# df ever stops carrying a default RangeIndex.
encData = pd.DataFrame(
    encodedTrain,
    columns=encoder.get_feature_names_out(categorical),
    index=df.index,
)

df = df.drop(columns=categorical)
finalData = pd.concat([df[correlColumns + ['delayed']], encData], axis=1)
finalData = finalData.dropna(subset=['delayed'])

# X / y hold the full, un-resampled data set.
y = finalData['delayed']
X = finalData.drop(columns=['delayed'])

# Split BEFORE oversampling. Resampling with replacement first would place
# copies of the same row in both partitions and inflate every test metric.
# Stratifying keeps the class ratio the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample class 1 (with replacement) up to the size of class 0, using
# training rows only; the test set keeps the original class distribution.
trainData = X_train.assign(delayed=y_train)
class0 = trainData[trainData['delayed'] == 0]
class1 = trainData[trainData['delayed'] == 1]
mapping = resample(class1, replace = True, n_samples = len(class0), random_state = 42)

# Shuffle so the two classes are interleaved rather than stacked.
balanced = pd.concat([class0, mapping])
balanced = balanced.sample(frac=1, random_state=42).reset_index(drop=True)

y_train = balanced['delayed']
X_train = balanced.drop(columns=['delayed'])

In [None]:
# Baseline comparison of several classifiers on the same train/test split,
# followed by a small grid search over the SVM hyper-parameters.
#
# Logistic regression, SVM and KNN are sensitive to feature scale, so each is
# wrapped in a pipeline that standardises the features first. Tree-based
# models and Gaussian NB are left unwrapped. Stochastic estimators get a
# fixed random_state so the reports are reproducible across runs.
models = {
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': make_pipeline(StandardScaler(), SVC()),
    'KNN': make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# Fit each model and print its per-class precision/recall/F1 on the test set.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))

# Grid search over the scaled SVM. The scaler lives inside the pipeline so it
# is re-fit on each CV training fold. make_pipeline names the SVC step 'svc',
# hence the 'svc__' prefix on the parameter names (best_params_ uses the same
# prefixed keys).
# NOTE(review): X_train contains rows duplicated by oversampling, so the same
# row can appear in several folds and the CV accuracy is likely optimistic.
param_grid = {'svc__C': [0.1, 1, 10], 'svc__kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(
    make_pipeline(StandardScaler(), SVC()),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)
print("Best Params:", grid_search.best_params_)