In [2]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D
from keras.utils import to_categorical

# --- Load -------------------------------------------------------------------
# Open a client against the local MongoDB instance and pick the source
# database / collection.
client = MongoClient('mongodb://localhost:27017/')
db = client['thesis_db']  # Replace with your database name
collection = db['research_data']  # Replace with your collection name

# Pull every document of the collection into memory as a list of dicts.
data = list(collection.find())

# Build the working frame; MongoDB's '_id' key is not a feature, so it is
# discarded when present (errors='ignore' makes the drop a no-op otherwise).
df = pd.DataFrame(data).drop(columns='_id', errors='ignore')

# --- Clean ------------------------------------------------------------------
# 1. Fill missing values in numeric columns with that column's mean
#    (non-numeric columns are left untouched because the mean Series only
#    carries numeric column labels).
# 2. Remove fully duplicated rows, keeping the first occurrence.
df = df.fillna(df.mean(numeric_only=True)).drop_duplicates()

# Column labels of the float/int columns; used by the later preprocessing steps.
numeric_cols = df.select_dtypes(include=[float, int]).columns

def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where ``Q1``/``Q3`` are the 25th/75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    column : label
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the classic
        Tukey rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences, with their original index labels.

    Raises
    ------
    ValueError
        If ``factor`` is negative.

    Notes
    -----
    * Rows whose ``column`` value is NaN never satisfy the range test and are
      therefore dropped.
    * When ``Q1 == Q3`` (IQR of zero, e.g. a heavily skewed binary column) both
      fences collapse onto that value, so every row with a different value is
      removed.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = factor * (q3 - q1)

    # between() is inclusive on both ends by default, matching >= / <=.
    return df[values.between(q1 - spread, q3 + spread)]

# ---------------------------------------------------------------------------
# Preprocessing -> PCA export -> 1-D CNN training and evaluation.
#
# Uses `df`, `numeric_cols` and `remove_outliers_iqr` defined above.
# Writes 'pca_transformed_data.csv' and 'cnn_training_history.csv'.
# ---------------------------------------------------------------------------
TARGET_COL = 'Class/ASD'
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found in the loaded data")

# The label is not a feature: keep it out of outlier filtering and scaling
# even when it happens to be stored as a number.
feature_numeric_cols = numeric_cols.drop([TARGET_COL], errors='ignore')

# Outlier removal: apply the IQR rule column by column (rows removed for one
# column are gone before the next column's quartiles are computed).
for col in feature_numeric_cols:
    df = remove_outliers_iqr(df, col)

if df.empty:
    raise ValueError("No rows left after IQR outlier removal; cannot train a model")

# Filtering returns a derived frame; take an explicit copy so the column
# assignments below cannot raise SettingWithCopyWarning.
df = df.copy()

# Encode the target as consecutive integers 0..k-1.
label_encoder = LabelEncoder()
df[TARGET_COL] = label_encoder.fit_transform(df[TARGET_COL])
num_classes = len(label_encoder.classes_)

# Choose the train/test rows BEFORE fitting the scaler so that test-set
# statistics never influence the preprocessing (no data leakage).
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Standardise numeric features using mean/std estimated on training rows only,
# then apply that same transform to every row.
scaler = StandardScaler()
if len(feature_numeric_cols) > 0:
    scaler.fit(df.iloc[train_pos][feature_numeric_cols])
    df[feature_numeric_cols] = scaler.transform(df[feature_numeric_cols])

# One-hot encode categorical columns. An explicit numeric dtype keeps the
# dummy columns inside the `np.number` selection below on every pandas
# version (pandas >= 2.0 would otherwise emit bool dummies).
df_encoded = pd.get_dummies(df, drop_first=True, dtype=np.uint8)
df_selected = df_encoded.select_dtypes(include=[np.number])

# PCA (exploratory export only; not fed to the CNN). The label column is
# excluded so the components describe the features alone.
pca_features = df_selected.drop(columns=[TARGET_COL], errors='ignore')
pca = PCA(n_components=2)
principal_components = pca.fit_transform(pca_features)
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df.to_csv('pca_transformed_data.csv', index=False)

# Split the data into features and target
X = df_encoded.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Materialise the train/test sets from the row positions chosen above
# (get_dummies preserves row order, so positions line up across df, X and y).
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Reshape to (samples, features, 1): Conv1D expects a trailing channel axis.
X_train_cnn = X_train.to_numpy(dtype='float32').reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.to_numpy(dtype='float32').reshape((X_test.shape[0], X_test.shape[1], 1))

# One-hot encode the target. `num_classes` is passed explicitly so both
# matrices have the same width even if a class is absent from one split.
y_train_cnn = to_categorical(y_train, num_classes=num_classes).astype('float32')
y_test_cnn = to_categorical(y_test, num_classes=num_classes).astype('float32')

# Build the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Dropout(0.5))
cnn_model.add(Flatten())
cnn_model.add(Dense(50, activation='relu'))
cnn_model.add(Dense(num_classes, activation='softmax'))

# Compile the model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model and capture the history.
# NOTE: the held-out test set doubles as validation data here; it is only
# monitored (no callbacks act on it), but a separate validation split would be
# needed before adding early stopping or checkpoint selection.
history = cnn_model.fit(X_train_cnn, y_train_cnn, epochs=50, batch_size=32, validation_data=(X_test_cnn, y_test_cnn))

# Save the training history (one row per epoch)
history_df = pd.DataFrame(history.history)
history_df['epoch'] = history.epoch
history_df.to_csv('cnn_training_history.csv', index=False)

# Evaluate the model: highest-probability class per test sample
y_pred_cnn = cnn_model.predict(X_test_cnn)
y_pred_cnn_classes = np.argmax(y_pred_cnn, axis=1)
y_test_cnn_classes = np.argmax(y_test_cnn, axis=1)

# Print classification report and accuracy score
print("\nClassification Report:")
print(classification_report(y_test_cnn_classes, y_pred_cnn_classes))
print("\nAccuracy Score of the CNN model :")
accuracy = accuracy_score(y_test_cnn_classes, y_pred_cnn_classes)
print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1721
           1       0.94      0.96      0.95       586

    accuracy                           0.97      2307
   macro avg       0.96      0.97      0.96      2307
weighted avg       0.97      0.97      0.97      2307


Accuracy Score of the CNN model :
Accuracy: 97.23%
