In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)
print(features)
# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.2, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),n_estimators=64,learning_rate=0.5,algorithm='SAMME.R',random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('quantile', QuantileTransformer(n_quantiles=10, random_state=42))
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and AdaBoost classifier for training  
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation on training set
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Train the model on the entire training set
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_cv = pipeline_cv.predict(X_test)
test_accuracy_cv = accuracy_score(y_test, y_pred_cv)
print("Testing Set Accuracy  :", test_accuracy_cv)

# Create pipeline with preprocessing and AdaBoost classifier for testing without cross-validation
pipeline_test = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Train the model on the entire training set without cross-validation
pipeline_test.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_test.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import precision_score, f1_score, roc_curve, auc, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

print("printing precision")
print(precision_score(y_test, y_pred_test, average='macro'))
print("f1-score")

# # F1 Score
print(f1_score(y_test, y_pred_test,average='macro'))

import matplotlib.pyplot as plt

# ... (your existing code)

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import roc_auc_score

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

print('recall')
# # Recall
print(recall_score(y_test, y_pred_test,average='macro'))

print('kappa score')
# # Kappa Score
print(cohen_kappa_score(y_test, y_pred_test))

print('log loss')
# # Log Loss
print(log_loss(y_test, pipeline_cv.predict_proba(X_test)))

print('MCC')
# # Matthews Correlation Coefficient
print(matthews_corrcoef(y_test, y_pred_test))



      A1  A2  A3  A4  A5  A6  A7  A8  A9  A10  Age_Mons
0      0   0   0   0   0   0   1   1   0    1        28
1      1   1   0   0   0   1   1   0   0    0        36
2      1   0   0   0   0   0   1   1   0    1        36
3      1   1   1   1   1   1   1   1   1    1        24
4      1   1   0   1   1   1   1   1   1    1        20
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...       ...
1049   0   0   0   0   0   0   0   0   0    1        24
1050   0   0   1   1   1   0   1   0   1    0        12
1051   1   0   1   1   1   1   1   1   1    1        18
1052   1   0   0   0   0   0   0   1   0    1        19
1053   1   1   0   0   1   1   0   1   1    0        24

[1054 rows x 11 columns]
Cross-validation scores: [0.95294118 1.         0.97647059 0.98809524 1.         0.98809524
 0.97619048 0.96428571 0.98809524 0.96428571]
Mean CV accuracy: 0.9798459383753502
Testing Set Accuracy  : 0.990521327014218
printing precision
0.9892324964278425
f1-score
0.9892324964278425
ROC AUC: 0.99979587

In [6]:
# Train AdaBoost classifier without feature selection for obtaining feature importances
ada_boost.fit(X_train, y_train)

# Get feature importances
feature_importances = np.mean([
    tree.feature_importances_ for tree in ada_boost.estimators_
], axis=0)

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': result_df.columns,
                              'Importance': feature_importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("Feature Importances based on Information Gain:")
print(importance_df)


Feature Importances based on Information Gain:
                                            Feature  Importance
7                                                A8    0.114544
3                                                A4    0.102255
4                                                A5    0.099527
8                                                A9    0.097973
5                                                A6    0.096395
9                                               A10    0.094654
1                                                A2    0.092205
0                                                A1    0.085004
6                                                A7    0.074382
2                                                A3    0.060700
10                                         Age_Mons    0.021336
19                                  Ethnicity_asian    0.010722
24                                      Jaundice_no    0.009431
25                                     Jaundice_yes    0.

Cross-validation scores: [1.         0.98947368 1.         0.98947368 0.96842105 0.98947368
 0.98947368 0.98947368 1.         0.9893617 ]
Mean CV accuracy: 0.990515117581187
Testing Set Accuracy  : 0.9811320754716981
printing precision
0.9873417721518987
f1-score
0.9757326007326008
ROC AUC: 1.0
recall
0.9655172413793103
kappa score
0.951487414187643
log loss
0.02214772948335281
MCC
0.9526090433773056


In [7]:
print('INFO GAIN ATTRIBUTE EVALUATOR')
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.2, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),n_estimators=64,learning_rate=0.5,algorithm='SAMME.R',random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('quantile', QuantileTransformer(n_quantiles=10, random_state=42))
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and AdaBoost classifier for training  
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation on training set
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Train the model on the entire training set
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_cv = pipeline_cv.predict(X_test)
test_accuracy_cv = accuracy_score(y_test, y_pred_cv)
print("Testing Set Accuracy  :", test_accuracy_cv)

# Create pipeline with preprocessing and AdaBoost classifier for testing without cross-validation
pipeline_test = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Train the model on the entire training set without cross-validation
pipeline_test.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_test.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import precision_score, f1_score, roc_curve, auc, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

print("printing precision")
print(precision_score(y_test, y_pred_test, average='macro'))
print("f1-score")

# # F1 Score
print(f1_score(y_test, y_pred_test,average='macro'))

import matplotlib.pyplot as plt

# ... (your existing code)

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import roc_auc_score

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

print('recall')
# # Recall
print(recall_score(y_test, y_pred_test,average='macro'))

print('kappa score')
# # Kappa Score
print(cohen_kappa_score(y_test, y_pred_test))

print('log loss')
# # Log Loss
print(log_loss(y_test, pipeline_cv.predict_proba(X_test)))

print('MCC')
# # Matthews Correlation Coefficient
print(matthews_corrcoef(y_test, y_pred_test))

# Train AdaBoost classifier without feature selection for obtaining feature importances
ada_boost.fit(X_train, y_train)

# Get feature importances
feature_importances = np.mean([
    tree.feature_importances_ for tree in ada_boost.estimators_
], axis=0)

# Sum feature importances for each categorical column
categorical_importances = {}
for col in category_features.columns:
    cat_mask = result_df.columns.str.startswith(col)
    cat_importance = np.sum(feature_importances[cat_mask])
    categorical_importances[col] = cat_importance

# Print feature importances for each categorical column
print("\nFeature Importances for Categorical Columns:")
for col, importance in categorical_importances.items():
    print(f"{col}: {importance}")

# Sum feature importances for numerical columns
numerical_importances = {}
for col in ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']:
    num_importance = np.sum(feature_importances[result_df.columns == col])
    numerical_importances[col] = num_importance

# Print feature importances for each numerical column
print("\nFeature Importances for Numerical Columns:")
for col, importance in numerical_importances.items():
    print(f"{col}: {importance}")


Cross-validation scores: [0.95294118 1.         0.97647059 0.98809524 1.         0.98809524
 0.97619048 0.96428571 0.98809524 0.96428571]
Mean CV accuracy: 0.9798459383753502
Testing Set Accuracy  : 0.990521327014218
printing precision
0.9892324964278425
f1-score
0.9892324964278425
ROC AUC: 0.9997958767095325
recall
0.9892324964278425
kappa score
0.9784649928556848
log loss
0.3636137540542667
MCC
0.9784649928556848

Feature Importances for Categorical Columns:
Sex: 0.003348666145350046
Ethnicity: 0.035730864848746643
Jaundice: 0.016826863169141844
Family_mem_with_ASD: 0.005118422606748412
Who completed the test: 0.0

Feature Importances for Numerical Columns:
A1: 0.08500356826098951
A2: 0.09220547606161869
A3: 0.060699924065468605
A4: 0.10225539522989212
A5: 0.09952676701191547
A6: 0.0963954593766552
A7: 0.07438192171837442
A8: 0.11454351632626306
A9: 0.09797299533272968
A10: 0.09465370098958809
Age_Mons: 0.021336458856518196


In [2]:
print('GAIN RATIO ATTRIBUTE EVALUATOR -  not working')
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns from original features
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One-hot encoding for categorical features
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, pd.DataFrame(category_transformed, columns=category_encoded_columns)], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
base_estimator = DecisionTreeClassifier(max_depth=5, min_samples_split=3, criterion='entropy')
ada_boost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=300, learning_rate=0.1, algorithm='SAMME.R', random_state=42)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.5, 1.0]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Test the model on the separate testing set
y_pred_test = grid_search.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)

# Calculate precision and F1 score
precision = precision_score(y_test, y_pred_test, average='macro')
f1 = f1_score(y_test, y_pred_test, average='macro')

print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

# Retrieve the best model from GridSearchCV
best_model = grid_search.best_estimator_

# Calculate and print feature importances using Gain Ratio Attribute Evaluator
# Get base estimator (decision tree) from AdaBoost classifier
base_estimator = best_model.named_steps['classifier'].estimators_[0]

# Calculate feature importances
if hasattr(base_estimator, 'feature_importances_'):
    # Get feature importances
    feature_importances = base_estimator.feature_importances_

    # Get feature names from preprocessor
    num_feature_names = list(best_model.named_steps['preprocessor'].transformers_[0][2])
    cat_feature_names = list(best_model.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(category_encoded_columns))

    # Combine numerical and categorical feature names
    feature_names = num_feature_names + cat_feature_names

    # Print feature importances
    print("\nFeature Importances using Gain Ratio Attribute Evaluator:")
    for name, importance in zip(feature_names, feature_importances):
        print(f"{name}: {importance:.4f}")
else:
    print("Base estimator does not support feature importances.")




Best Parameters: {'classifier__learning_rate': 1.0, 'classifier__n_estimators': 150}
Testing Set Accuracy: 0.9751243781094527
Precision: 0.9823943661971831
F1 Score: 0.970714223271265

Feature Importances using Gain Ratio Attribute Evaluator:
A1: 0.1721
A2: 0.0692
A3: 0.0537
A4: 0.0559
A5: 0.1511
A6: 0.0118
A7: 0.0000
A8: 0.0933
A9: 0.3883
A10: 0.0000
Age_Mons: 0.0000
Sex_f_0.0: 0.0000
Sex_f_1.0: 0.0046
Sex_m_0.0: 0.0000
Sex_m_1.0: 0.0000
Ethnicity_Hispanic_0.0: 0.0000
Ethnicity_Hispanic_1.0: 0.0000
Ethnicity_Latino_0.0: 0.0000
Ethnicity_Latino_1.0: 0.0000
Ethnicity_Native Indian_0.0: 0.0000
Ethnicity_Native Indian_1.0: 0.0000
Ethnicity_Others_0.0: 0.0000
Ethnicity_Others_1.0: 0.0000
Ethnicity_Pacifica_0.0: 0.0000
Ethnicity_Pacifica_1.0: 0.0000
Ethnicity_White European_0.0: 0.0000
Ethnicity_White European_1.0: 0.0000
Ethnicity_asian_0.0: 0.0000
Ethnicity_asian_1.0: 0.0000
Ethnicity_black_0.0: 0.0000
Ethnicity_black_1.0: 0.0000
Ethnicity_middle eastern_0.0: 0.0000
Ethnicity_middle easte