# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from itertools import cycle
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# CSV files to analyse. Every file in this list is loaded (the previous loop
# only ever read the first entry and silently ignored the rest).
file_names = ['batch2_combined.csv']

# Read each file and stack the rows into one frame with a fresh 0..n-1 index.
df1 = pd.concat(
    (pd.read_csv(file_name) for file_name in file_names),
    ignore_index=True,
)

# Keep only the three signals that the rule-based labelling works from.
selected_df2 = df1[['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']]

# Cast everything to text so that clean_and_convert() receives uniform input
# regardless of the dtype pandas inferred while parsing the CSV.
selected_df2 = selected_df2.astype(str)

def clean_and_convert(value):
    """Parse *value* as a float, returning ``None`` when it cannot be parsed.

    Args:
        value: Anything ``float()`` may accept (typically a string cell).

    Returns:
        The parsed float, or ``None`` for empty strings, ``None`` and any
        other input that ``float()`` rejects with ``ValueError``/``TypeError``.

    Note:
        A numeric zero is a valid reading and is returned as ``0.0``; a
        truthiness check here would wrongly report it as missing.
        The text ``'nan'`` parses to a float NaN, which pandas also counts
        as a missing value.
    """
    try:
        # float() already rejects '' (ValueError) and None (TypeError), so no
        # separate emptiness test is needed.
        return float(value)
    except (ValueError, TypeError):
        return None

# Columns that must hold a usable number for a row to be kept.
columns_to_check = ['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']

# Parse each signal column; unparseable cells become missing values.
for signal in columns_to_check:
    selected_df2[signal] = selected_df2[signal].apply(clean_and_convert)

# Report how many cells per column failed to parse.
missing_value_counts = {
    signal: selected_df2[signal].isnull().sum() for signal in columns_to_check
}
for signal, n_missing in missing_value_counts.items():
    print(f"Missing values in '{signal}': {n_missing}")

# Rows that repeat an earlier row exactly (reported only, not removed).
duplicate_rows = selected_df2[selected_df2.duplicated()]
print("Number of duplicated rows:", len(duplicate_rows))

# Snapshot taken before dropping, used to report how many rows were removed.
original_df = selected_df2.copy()
selected_df2.dropna(subset=columns_to_check, inplace=True)

dropped_rows = original_df.shape[0] - selected_df2.shape[0]
print("Number of rows dropped:", dropped_rows)

def classify_maneuver(row):
    """Return the rule-based maneuver label for one record.

    Args:
        row: Mapping-like object with 'Groundspeed', 'Vert. Speed' and
            'Altitude(AGL)' entries.

    Returns:
        One of 'Air Taxi', 'Climb/Ascent', 'Descent', 'Hover Taxi' or
        'Unknown'. Rules are tried in that order, so a record matching
        several rules gets the earliest label.
    """
    ground_speed = row['Groundspeed']
    vertical_speed = row['Vert. Speed']
    altitude = row['Altitude(AGL)']

    # Both taxi rules require the aircraft to be moving over the ground.
    moving = ground_speed > 0.6

    if moving and ground_speed < 18 and 25 <= altitude <= 100:
        return 'Air Taxi'

    if altitude >= 100:
        if vertical_speed > 90:
            return 'Climb/Ascent'
        if vertical_speed < -90:
            return 'Descent'

    if moving and ground_speed <= 18 and 2 <= altitude <= 25:
        return 'Hover Taxi'

    return 'Unknown'

# Label every row with its rule-based maneuver.
selected_df2['Maneuver'] = selected_df2.apply(classify_maneuver, axis=1)
print(selected_df2[['Groundspeed', 'Vert. Speed', 'Altitude(AGL)', 'Maneuver']])

# Rows per maneuver label as a two-column table ('Maneuver', 'Count').
# The tally loop below reads this table; it was previously never built,
# so the loop failed with a NameError.
maneuver_counts = (
    selected_df2['Maneuver']
    .value_counts()
    .rename_axis('Maneuver')
    .reset_index(name='Count')
)

air_taxi_count = 0
hover_taxi_count = 0
climb_count = 0
descent_count = 0

# Substring matching, so 'Climb/Ascent' is counted under 'Climb'.
# Labels matching none of the four keywords (e.g. 'Unknown') are not counted.
for maneuver, count in zip(maneuver_counts['Maneuver'], maneuver_counts['Count']):
    if 'Air Taxi' in maneuver:
        air_taxi_count += count
    elif 'Hover Taxi' in maneuver:
        hover_taxi_count += count
    elif 'Climb' in maneuver:
        climb_count += count
    elif 'Descent' in maneuver:
        descent_count += count

# Sum of the four tallied maneuvers only; uncounted labels are excluded.
total_rows = sum([air_taxi_count, hover_taxi_count, climb_count, descent_count])
print(f'Air Taxi Count: {air_taxi_count}')
print(f'Hover Taxi Count: {hover_taxi_count}')
print(f'Climb Count: {climb_count}')
print(f'Descent Count: {descent_count}')
print(f'Total Rows: {total_rows}')

def is_close_to_zero(value, tolerance=1e-6):
    """Return whether the magnitude of *value* is strictly below *tolerance*."""
    magnitude = abs(value)
    return magnitude < tolerance

def identify_phase(Vert_Speed, Groundspeed, Altitude_AGL):
    """Return the rule-based flight-phase label for one sample.

    Rules are evaluated top to bottom and the first match wins; a sample
    matching no rule is labelled "LandingOrTakeOff".

    Args:
        Vert_Speed: Vertical speed of the sample.
        Groundspeed: Ground speed of the sample.
        Altitude_AGL: Altitude above ground level of the sample.

    Returns:
        One of "Standing", "Surface Taxi", "Hover Taxi", "Air Taxi",
        "Hover In Ground Effect", "Hover", "Hover Lift", "Hover Descent",
        "Climb", "Cruise", "Descent" or "LandingOrTakeOff".
    """
    # Named predicates shared by several rules. Each is an explicit
    # comparison (never a negation of another one) so that NaN inputs make
    # every predicate false.
    level = is_close_to_zero(Vert_Speed)
    gentle = -90 <= Vert_Speed <= 90
    climbing = Vert_Speed > 90
    sinking = Vert_Speed < -90
    stationary = Groundspeed <= 0.6
    on_surface = 0 <= Altitude_AGL < 2
    high = Altitude_AGL >= 100
    below_high = Altitude_AGL < 100

    # Rules that require an (almost exactly) zero vertical speed.
    if level:
        if Groundspeed < 0.6 and on_surface:
            return "Standing"
        if 0.6 <= Groundspeed < 20 and on_surface:
            return "Surface Taxi"
        if 0.6 <= Groundspeed < 30 and 2 <= Altitude_AGL < 25:
            return "Hover Taxi"
        if 0.6 <= Groundspeed < 50 and 25 <= Altitude_AGL < 100:
            return "Air Taxi"

    # Rules for a (nearly) stationary aircraft.
    if gentle and stationary and Altitude_AGL < 2:
        return "Hover In Ground Effect"
    if gentle and stationary and high:
        return "Hover"
    if climbing and stationary and below_high:
        return "Hover Lift"
    if sinking and stationary and below_high:
        return "Hover Descent"

    # Rules at or above the 100 altitude threshold.
    if climbing and high:
        return "Climb"
    if gentle and Groundspeed > 0.6 and high:
        return "Cruise"
    if sinking and high:
        return "Descent"

    return "LandingOrTakeOff"

# Label each row with its flight phase, walking the three signal columns in
# lockstep instead of materialising a row object per record.
phase_inputs = zip(
    selected_df2['Vert. Speed'],
    selected_df2['Groundspeed'],
    selected_df2['Altitude(AGL)'],
)
selected_df2['Phase'] = [
    identify_phase(vert_speed, ground_speed, altitude)
    for vert_speed, ground_speed, altitude in phase_inputs
]
print(selected_df2[['Vert. Speed', 'Groundspeed', 'Altitude(AGL)','Phase']])


features_to_scale = ['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']

# Split on the raw values first so the scaler can be fitted on the training
# rows only. Fitting it on the full dataset would leak the test set's min/max
# into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    selected_df2[features_to_scale],
    selected_df2['Phase'],
    test_size=0.2,
    random_state=42,
)

scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-fitted scaling to every row; later steps read the scaled
# columns straight from selected_df2. Held-out values can fall slightly
# outside [0, 1] because they did not take part in the fit.
selected_df2[features_to_scale] = scaler.transform(selected_df2[features_to_scale])

X = selected_df2[['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']]
y = selected_df2['Phase']

# Same partition as above, now carrying the scaled values.
# Assumes selected_df2 has unique index labels — TODO confirm.
X_train = X.loc[X_train.index]
X_test = X.loc[X_test.index]

# Baseline models, all seeded for reproducible results.
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# Fit each model on the training split and report held-out performance.
for label, estimator in classifiers.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    print(f"Classifier: {label}")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("=" * 40)



# Palette choice does not depend on the classifier, so set it once.
sns.set_palette('dark')

# For each fitted classifier, plot where in (Groundspeed, Vert. Speed) space
# its test-set errors fall.
for name, clf in classifiers.items():
    y_pred = clf.predict(X_test)

    # Positional boolean mask of wrong predictions, computed once and reused.
    wrong = np.asarray(y_pred) != y_test.to_numpy()

    misclassified = X_test[wrong].reset_index(drop=True)
    misclassified_actual = y_test[wrong].reset_index(drop=True)
    misclassified_pred = pd.Series(y_pred[wrong])

    # Nothing to draw for a classifier that made no errors on the test split.
    if misclassified.empty:
        print(f'No misclassifications for {name}')
        continue

    # Cap the number of plotted points; the sample is seeded like every other
    # random step in this script so the figure is reproducible.
    sample_size = min(1000, len(misclassified))
    misclassified_sample = misclassified.sample(n=sample_size, random_state=42)

    # Collapse identical (Groundspeed, Vert. Speed) pairs into one marker
    # whose size and hue encode how many errors share that point.
    misclassified_counts = misclassified_sample.groupby(['Groundspeed', 'Vert. Speed']).size().reset_index(name='counts')

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=misclassified_counts, x='Groundspeed', y='Vert. Speed', size='counts', hue='counts', alpha=0.6, legend=False, color = 'black')

    plt.xlabel('Groundspeed')
    plt.ylabel('Vertical Speed')
    plt.title(f'Misclassifications of {name}')
    plt.show()


# Fresh, seeded instances of the three baseline models.
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# Fit each model and record its test accuracy, keyed by display name.
accuracy_by_classifier = {}
for label, estimator in classifiers.items():
    estimator.fit(X_train, y_train)
    accuracy_by_classifier[label] = accuracy_score(y_test, estimator.predict(X_test))

classifier_names = list(accuracy_by_classifier)
accuracies = list(accuracy_by_classifier.values())

# Side-by-side accuracy comparison on a fixed 0..1 axis.
plt.figure(figsize=(8, 6))
plt.bar(classifier_names, accuracies, color='skyblue')
plt.title('Classifier Performance Comparison')
plt.xlabel('Classifier')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# One indicator column per distinct phase label (one-vs-rest targets).
y_bin = label_binarize(y, classes=np.unique(y))
n_classes = y_bin.shape[1]

logreg_clf = OneVsRestClassifier(LogisticRegression(random_state=42))
X_train, X_test, y_train_bin, y_test_bin = train_test_split(X, y_bin, test_size=0.2, random_state=42)

logreg_clf.fit(X_train, y_train_bin)

# Score the test split once; column i holds the score for class i.
y_score = logreg_clf.predict_proba(X_test)

# Per-class ROC curve and area under it, keyed by class column index.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

for i in range(n_classes):
    print(f"Class {i} AUC: {roc_auc[i]:.2f}")
print(f"Micro-average AUC: {roc_auc['micro']:.2f}")


# Plot ROC curves
plt.figure(figsize=(10, 8))

# One colour per class from a 20-entry qualitative palette, so that curves
# stay distinguishable when there are more than a handful of classes.
palette = plt.get_cmap('tab20')
colors = [palette(i % palette.N) for i in range(n_classes)]
for i, color in enumerate(colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic to Multi-class')
plt.legend(loc="lower right")
plt.show()

# selected_df2 was built from the three signal columns only, so the timestamp
# columns used from here on have to be brought back from the raw frame.
# Assignment aligns on index labels; this assumes selected_df2 still carries
# df1's row labels at this point — TODO confirm if the index was reset earlier.
for column in ('Date', 'System UTC Time'):
    if column not in selected_df2.columns and column in df1.columns:
        selected_df2[column] = df1[column]

selected_df2['System UTC Time'] = pd.to_datetime(selected_df2['System UTC Time'])

# Order rows by phase, then chronologically within each phase. A plain
# multi-key sort avoids groupby().apply(), whose handling of the grouping
# column is deprecated in recent pandas.
selected_df2 = selected_df2.sort_values(['Phase', 'System UTC Time']).reset_index(drop=True)

# Gap to the previous sample of the same phase (NaT for each phase's first row).
selected_df2['Time Difference'] = selected_df2.groupby('Phase')['System UTC Time'].diff()

# A gap longer than the threshold marks the start of a new event (1), else 0.
event_threshold = pd.Timedelta(minutes=5)
selected_df2['Event'] = (selected_df2['Time Difference'] > event_threshold).astype(int)
print(selected_df2.head())

def map_maneuver_to_event(row):
    """Return the event name for a row flagged as an event, else ``None``.

    Args:
        row: Mapping-like object with an 'Event' entry and, when that entry
            equals 1, a 'Maneuver' string.

    Returns:
        'Air Taxi Event', 'Hover Taxi Event', 'Climb Event' or
        'Descent Event' for the first keyword found in the maneuver text;
        ``None`` when the row is not an event or no keyword matches.
    """
    if row['Event'] != 1:
        return None

    maneuver = row['Maneuver']
    # Checked in this order; the first keyword contained in the label wins.
    keyword_to_event = (
        ('Air Taxi', 'Air Taxi Event'),
        ('Hover Taxi', 'Hover Taxi Event'),
        ('Climb', 'Climb Event'),
        ('Descent', 'Descent Event'),
    )
    for keyword, event_name in keyword_to_event:
        if keyword in maneuver:
            return event_name
    return None

# Name each flagged event after the maneuver of its row (None for non-events).
event_types = selected_df2.apply(map_maneuver_to_event, axis=1)
selected_df2['Event Type'] = event_types

report_columns = [
    'Groundspeed',
    'Vert. Speed',
    'Altitude(AGL)',
    'Date',
    'System UTC Time',
    'Maneuver',
    'Event Type',
]
print(selected_df2[report_columns])

# Neural Network

from keras.models import Sequential
from keras.layers import Dense, Dropout

batch_size = 32

# The network consumes the three signals, so the input width is the number
# of feature columns, not a count derived from however many bookkeeping
# columns selected_df2 happens to have.
feature_columns = ['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']
n_features = len(feature_columns)

# One output unit per phase label actually present in the data.
num_classes = selected_df2['Phase'].nunique()

# Architecture preview: 64 -> 128 -> dropout -> 64 -> softmax over phases.
# Only input_shape is given; passing batch_input_shape as well is redundant.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Accuracy of NN and Loss

X = selected_df2[['Vert. Speed', 'Groundspeed', 'Altitude(AGL)']]
y = selected_df2['Phase']

# Phase names -> integer ids -> one-hot rows for the softmax output.
label_encoder = LabelEncoder()
label_encoder.fit(y)
num_classes = len(label_encoder.classes_)
y_encoded = to_categorical(label_encoder.transform(y), num_classes)

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

n_features = X_train.shape[1]

# Same stack as a layer list: 64 -> 128 -> dropout -> 64 -> softmax.
model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The returned history holds per-epoch loss/accuracy for both splits.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

plt.figure(figsize=(14, 5))

curves = history.history

# (training key, validation key, training legend, validation legend, title, y-axis label)
panels = [
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Model Loss Over Epochs', 'Loss'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Model Accuracy Over Epochs', 'Accuracy'),
]

# Left panel: loss; right panel: accuracy. Each shows both splits per epoch.
for slot, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(curves[train_key], label=train_label)
    plt.plot(curves[val_key], label=val_label)
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend()

plt.show()

# Metrics after the last epoch.
final_train_accuracy = curves['accuracy'][-1]
final_val_accuracy = curves['val_accuracy'][-1]
print(f"Final Training Accuracy: {final_train_accuracy:.4f}")
print(f"Final Validation Accuracy: {final_val_accuracy:.4f}")