In [1]:
import itertools

import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['thesis_db']  # Replace with your database name
collection = db['research_data']  # Replace with your collection name

# Retrieve data from MongoDB
data = list(collection.find())

# Convert to DataFrame
df = pd.DataFrame(data)

# Fail early with a clear message: every later step needs at least one row
# and the 'Class/ASD' column.
if df.empty:
    raise ValueError("No documents were returned from the MongoDB collection; nothing to process.")

# Drop the MongoDB '_id' field if it exists
df = df.drop(columns=['_id'], errors='ignore')

# Data Cleaning
# Rows without a label cannot be used for supervised training. Removing them
# first also guarantees the label itself is never mean-imputed below.
if 'Class/ASD' in df.columns:
    df = df.dropna(subset=['Class/ASD'])

# Impute missing numeric values with the column mean, then drop exact duplicate rows.
df = df.fillna(df.mean(numeric_only=True)).drop_duplicates()

# Numeric feature columns used later for outlier filtering and scaling.
# The label is excluded so that, if it is stored as a number, it is not
# filtered or standardized like a feature.
numeric_cols = df.select_dtypes(include=[float, int]).columns.drop('Class/ASD', errors='ignore')

def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels preserved) whose value in
        ``column`` falls within the fences. Rows where the value is NaN fail
        the comparison and are therefore excluded.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    # ``between`` is inclusive on both ends by default.
    within_fences = values.between(first_quartile - whisker, third_quartile + whisker)
    return df[within_fences]

# Filter IQR outliers one numeric column at a time; each pass works on the
# rows that survived the previous pass.
for col in numeric_cols:
    df = remove_outliers_iqr(df, col)
# Take an explicit copy so the column assignments below write to an
# independent frame rather than to the result of a boolean filter.
df = df.copy()

# Decide train/test membership BEFORE fitting the scaler so that test-set
# statistics never influence the transformation. Splitting row positions with
# the same test_size/random_state selects the same rows as splitting X and y
# directly would.
train_pos, test_pos = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Fit the scaler on training rows only, then apply it to every row.
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Encode the target labels as integers.
label_encoder = LabelEncoder()
df['Class/ASD'] = label_encoder.fit_transform(df['Class/ASD'])

# One-hot encode the remaining categorical columns. dtype=float keeps the
# indicator columns numeric on every pandas version (newer releases default to
# bool, which select_dtypes(np.number) below would silently drop).
df_encoded = pd.get_dummies(df, drop_first=True, dtype=float)
df_selected = df_encoded.select_dtypes(include=[np.number])

# PCA
# The encoded label is removed from the PCA input so the components describe
# the features only.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_selected.drop(columns=['Class/ASD']))
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df.to_csv('pca_transformed_data.csv', index=False)

# Split the data into features and target
X = df_encoded.drop(columns=['Class/ASD'])
y = df['Class/ASD']

# Ensure the data is float32 for TensorFlow compatibility
X = X.astype('float32')
y = y.astype('float32')

# Split the data into training and testing sets using the positions chosen above
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Reshape data for LSTM model (LSTM expects 3D input: samples, time steps, features)
# Each row becomes a sequence with a single time step.
X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define the LSTM model
def create_lstm_model(optimizer='adam', activation='relu', dropout_rate=0.2, neurons=50, input_shape=None):
    """Build and compile a single-layer LSTM binary classifier.

    Architecture: LSTM(neurons) -> Dropout(dropout_rate) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    optimizer : str
        Optimizer passed to ``model.compile``.
    activation : str
        Activation passed to the LSTM layer.
    dropout_rate : float
        Rate passed to the Dropout layer that follows the LSTM.
    neurons : int
        Number of units in the LSTM layer.
    input_shape : tuple, optional
        ``(time_steps, features)`` of one sample. When omitted, it is taken
        from the notebook-level ``X_train`` array (axes 1 and 2), which is the
        original behaviour; passing it explicitly removes that dependency on
        global state.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    if input_shape is None:
        # Backward-compatible fallback: read the shape from the global training tensor.
        input_shape = (X_train.shape[1], X_train.shape[2])

    model = Sequential()
    model.add(LSTM(neurons, activation=activation, input_shape=input_shape))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Hyperparameter tuning manually
param_grid = {
    'neurons': [50],
    'activation': ['relu'],
    'optimizer': ['adam'],
    'dropout_rate': [0.2],
    'batch_size': [32],
    'epochs': [5]  # Using fewer epochs for quicker tuning
}

# Carve a validation set out of the training data for model selection. The
# test set is reserved for the final evaluation only; choosing hyperparameters
# on it would make the reported test metrics optimistically biased.
X_tune, X_val, y_tune, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Start below any attainable accuracy so the first evaluated combination is
# always recorded, even if its accuracy is 0.
best_accuracy = float('-inf')
best_params = {}
param_names = list(param_grid)
# itertools.product walks the grid in the same order as nested loops over the
# keys in their declared order (the last key varies fastest).
for combination in itertools.product(*(param_grid[name] for name in param_names)):
    params = dict(zip(param_names, combination))
    model = create_lstm_model(optimizer=params['optimizer'],
                              activation=params['activation'],
                              dropout_rate=params['dropout_rate'],
                              neurons=params['neurons'])
    model.fit(X_tune, y_tune, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0)
    _, accuracy = model.evaluate(X_val, y_val, verbose=0)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

# Nothing is recorded when the grid is empty or every evaluation returned NaN.
if not best_params:
    raise RuntimeError("Hyperparameter search did not produce a valid configuration; check param_grid and the training data.")

print(f"Best parameters: {best_params}")

# Train the LSTM model with the best parameters for more epochs
# (on the full training set; Keras holds out 20% of it for validation metrics).
best_model = create_lstm_model(optimizer=best_params['optimizer'],
                               activation=best_params['activation'],
                               dropout_rate=best_params['dropout_rate'],
                               neurons=best_params['neurons'])
history = best_model.fit(X_train, y_train, epochs=20, batch_size=best_params['batch_size'], validation_split=0.2, verbose=1)

# Save the training history
history_df = pd.DataFrame(history.history)
history_df['epoch'] = range(1, len(history_df) + 1)
history_df.to_csv('lstm_training_history.csv', index=False)

# Make predictions
# The sigmoid output is thresholded at 0.5 to obtain class labels.
y_pred = (best_model.predict(X_test) > 0.5).astype("int32")

# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score of the LSTM model :")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Best parameters: {'neurons': 50, 'activation': 'relu', 'optimizer': 'adam', 'dropout_rate': 0.2, 'batch_size': 32, 'epochs': 5}
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98      1721
         1.0       0.95      0.91      0.93       586

    accuracy                           0.96      2307
   macro avg       0.96      0.95      0.95      2307
weighted avg       0.96      0.96      0.96      2307


Accuracy Score of the LSTM model :
Accuracy: 96.49%
