In [3]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['thesis_db']  # Replace with your database name
collection = db['research_data']  # Replace with your collection name

# Retrieve data from MongoDB
data = list(collection.find())

# Convert to DataFrame
df = pd.DataFrame(data)

# Drop the MongoDB '_id' field if it exists
if '_id' in df.columns:
    df = df.drop('_id', axis=1)

# Data Cleaning and Preprocessing
df.fillna(df.mean(numeric_only=True), inplace=True)
df.drop_duplicates(inplace=True)
numeric_cols = df.select_dtypes(include=[float, int]).columns

def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

for col in numeric_cols:
    df = remove_outliers_iqr(df, col)

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
label_encoder = LabelEncoder()
df['Class/ASD'] = label_encoder.fit_transform(df['Class/ASD'])
df_encoded = pd.get_dummies(df, drop_first=True)
df_selected = df_encoded.select_dtypes(include=[np.number])
 
# PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_selected)
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df.to_csv('pca_transformed_data.csv', index=False)

# Split the data into features and target
X = df_encoded.drop(columns=['Class/ASD'])
y = df['Class/ASD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using RandomizedSearchCV with fewer iterations and simpler search space
param_distributions = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['tanh', 'relu'],
    'learning_rate': ['constant'],
    'alpha': [0.0001, 0.001]
}

random_search = RandomizedSearchCV(
    MLPClassifier(solver='adam', max_iter=200, random_state=42, early_stopping=True),
    param_distributions,
    n_iter=10,  # Reduce n_iter for quicker results
    n_jobs=-1,
    cv=3,
    scoring='accuracy',
    random_state=42
)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_

# Train the MLP model with the best parameters
mlp_best = MLPClassifier(**best_params, solver='adam', max_iter=200, random_state=42, early_stopping=True)
mlp_best.fit(X_train, y_train)

# Save accuracy and loss history
history_df = pd.DataFrame({
    'epoch': np.arange(len(mlp_best.loss_curve_)),
    'loss': mlp_best.loss_curve_,
    'accuracy': mlp_best.score(X_train, y_train)
})
history_df.to_csv('mlp_training_history.csv', index=False)

# Evaluate the model
y_pred = mlp_best.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score of the MLP model:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

df_selected['Predicted'] = mlp_best.predict(X)
df_selected['Actual'] = df['Class/ASD']
df_selected.to_csv('MLP_model.csv', index=False)





Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1721
           1       0.93      0.94      0.94       586

    accuracy                           0.97      2307
   macro avg       0.96      0.96      0.96      2307
weighted avg       0.97      0.97      0.97      2307


Accuracy Score of the MLP model:
Accuracy: 96.71%
