In [56]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.1, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('quantile', QuantileTransformer(n_quantiles=300, random_state=69))
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Gaussian Naive Bayes classifier for training
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

# Define parameter grid for Gaussian Naive Bayes
param_grid = {
    'classifier__var_smoothing': np.logspace(5,-9, num=10) # Varying var_smoothing parameter
}

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=35)

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the testing set
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy)

# Additional imports
from sklearn.metrics import precision_score, f1_score, roc_auc_score, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

# Evaluate the best model on the testing set
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

# Calculate precision
precision = precision_score(y_test, y_pred_test, average='macro')
print("Precision:", precision)

# Calculate F1-score
f1 = f1_score(y_test, y_pred_test, average='macro')
print("F1-score:", f1)

# Calculate ROC AUC
y_prob_test = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

# Calculate recall
recall = recall_score(y_test, y_pred_test, average='macro')
print("Recall:", recall)

# Calculate Kappa score
kappa_score = cohen_kappa_score(y_test, y_pred_test)
print("Kappa Score:", kappa_score)

# Calculate log loss
logloss = log_loss(y_test, best_model.predict_proba(X_test))
print("Log Loss:", logloss)

# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred_test)
print("Matthews Correlation Coefficient:", mcc)



Best Parameters: {'classifier__var_smoothing': 2.154434690031882}
Best Score: 0.9535834266517359
Testing Set Accuracy: 0.9811320754716981
Precision: 0.967741935483871
F1-score: 0.9767543859649122
ROC AUC: 0.9977608598298253
Recall: 0.987012987012987
Kappa Score: 0.9535291538798772
Log Loss: 0.16175233379085582
Matthews Correlation Coefficient: 0.9545604164247246




In [95]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.09, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('power', PowerTransformer(method='yeo-johnson', standardize=True))
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Gaussian Naive Bayes classifier for training
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

# Define parameter grid for Gaussian Naive Bayes
param_grid = {
    'classifier__var_smoothing': np.logspace(1,-1, num=10) # Varying var_smoothing parameter
}

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=35)

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the testing set
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy)

# Evaluate other metrics
# Calculate precision
precision = precision_score(y_test, y_pred_test, average='macro')
print("Precision:", precision)

# Calculate F1-score
f1 = f1_score(y_test, y_pred_test, average='macro')
print("F1-score:", f1)

# Calculate ROC AUC
y_prob_test = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

# Calculate recall
recall = recall_score(y_test, y_pred_test, average='macro')
print("Recall:", recall)

# Calculate Kappa score
kappa_score = cohen_kappa_score(y_test, y_pred_test)
print("Kappa Score:", kappa_score)

# Calculate log loss
logloss = log_loss(y_test, best_model.predict_proba(X_test))
print("Log Loss:", logloss)

# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred_test)
print("Matthews Correlation Coefficient:", mcc)


Best Parameters: {'classifier__var_smoothing': 2.154434690031884}
Best Score: 0.9655811403508773
Testing Set Accuracy: 0.9894736842105263
Precision: 0.9827586206896552
F1-score: 0.9874686716791979
ROC AUC: 1.0
Recall: 0.9925373134328358
Kappa Score: 0.9749406489053021
Log Loss: 0.15181935596283097
Matthews Correlation Coefficient: 0.975246910420175




In [189]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.1, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Gaussian Naive Bayes classifier for training
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

# Define parameter grid for Gaussian Naive Bayes
param_grid = {
    'classifier__var_smoothing': np.logspace(1,-2, num=10) # Varying var_smoothing parameter
}

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the testing set
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy)

# Evaluate other metrics
# Calculate precision
precision = precision_score(y_test, y_pred_test, average='macro')
print("Precision:", precision)

# Calculate F1-score
f1 = f1_score(y_test, y_pred_test, average='macro')
print("F1-score:", f1)

# Calculate ROC AUC
y_prob_test = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

# Calculate recall
recall = recall_score(y_test, y_pred_test, average='macro')
print("Recall:", recall)

# Calculate Kappa score
kappa_score = cohen_kappa_score(y_test, y_pred_test)
print("Kappa Score:", kappa_score)

# Calculate log loss
logloss = log_loss(y_test, best_model.predict_proba(X_test))
print("Log Loss:", logloss)

# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred_test)
print("Matthews Correlation Coefficient:", mcc)


Best Parameters: {'classifier__var_smoothing': 0.21544346900318845}
Best Score: 0.9209182530795073
Testing Set Accuracy: 0.9716981132075472
Precision: 0.953125
F1-score: 0.9654760612311366
ROC AUC: 0.9914912673533363
Recall: 0.9805194805194806
Kappa Score: 0.9310195227765726
Log Loss: 0.1067825093691721
Matthews Correlation Coefficient: 0.9332424971257783




In [190]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.1, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('max_abs_scaler', MaxAbsScaler())  # Replace Normalizer with MaxAbsScaler
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Gaussian Naive Bayes classifier for training
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

# Define parameter grid for Gaussian Naive Bayes
param_grid = {
    'classifier__var_smoothing': np.logspace(1,-2, num=10) # Varying var_smoothing parameter
}

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the testing set
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy)

# Evaluate other metrics
# Calculate precision
precision = precision_score(y_test, y_pred_test, average='macro')
print("Precision:", precision)

# Calculate F1-score
f1 = f1_score(y_test, y_pred_test, average='macro')
print("F1-score:", f1)

# Calculate ROC AUC
y_prob_test = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

# Calculate recall
recall = recall_score(y_test, y_pred_test, average='macro')
print("Recall:", recall)

# Calculate Kappa score
kappa_score = cohen_kappa_score(y_test, y_pred_test)
print("Kappa Score:", kappa_score)

# Calculate log loss
logloss = log_loss(y_test, best_model.predict_proba(X_test))
print("Log Loss:", logloss)

# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred_test)
print("Matthews Correlation Coefficient:", mcc)


Best Parameters: {'classifier__var_smoothing': 2.154434690031884}
Best Score: 0.9578163493840985
Testing Set Accuracy: 0.9716981132075472
Precision: 0.9600877192982455
F1-score: 0.9647723496178133
ROC AUC: 0.9986565158978953
Recall: 0.9697716077026421
Kappa Score: 0.9295525033229951
Log Loss: 0.1818012071681241
Matthews Correlation Coefficient: 0.9298088998906792


