In [4]:
print("RELIEF ATTRIBUTE EVALUATOR")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from skrebate import ReliefF

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)

# ReliefF feature selection
num_features_to_select = 10  # You can adjust this value
# Initialize the ReliefF feature selector
relief_selector = ReliefF(n_features_to_select=num_features_to_select)

# Fit ReliefF selector on training data
relief_selector.fit(X_train.to_numpy(), y_train.to_numpy())

# Get selected features
selected_features_indices = relief_selector.top_features_
selected_features_names = result_df.columns[selected_features_indices]

# Filter selected features in the dataset
X_train_selected = X_train[selected_features_names]
X_test_selected = X_test[selected_features_names]

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),
                               n_estimators=100, learning_rate=0.5, algorithm='SAMME.R', random_state=42)

# Fit AdaBoost classifier
ada_boost.fit(X_train_selected, y_train)

# Print the feature importance scores
print("Feature Importance Scores:")
for feature, importance_score in zip(selected_features_names, ada_boost.feature_importances_):
    print(f"{feature}: {importance_score:.4f}")

# Test the model on the separate testing set
y_pred_test = ada_boost.predict(X_test_selected)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)

RELIEF ATTRIBUTE EVALUATOR
Feature Importance Scores:
A9: 0.0912
A6: 0.0984
A5: 0.0972
A7: 0.0714
A4: 0.1090
A1: 0.0903
A2: 0.0882
A8: 0.1080
A3: 0.0596
Ethnicity_asian: 0.0045
A10: 0.0872
Ethnicity_middle eastern: 0.0025
Jaundice_no: 0.0101
Jaundice_yes: 0.0020
Age_Mons: 0.0375
Sex_m: 0.0061
Sex_f: 0.0000
Ethnicity_White European: 0.0000
Ethnicity_south asian: 0.0033
Family_mem_with_ASD_no: 0.0030
Family_mem_with_ASD_yes: 0.0111
Ethnicity_Hispanic: 0.0019
Ethnicity_mixed: 0.0023
Ethnicity_Latino: 0.0053
Who completed the test_Health Care Professional: 0.0000
Ethnicity_Native Indian: 0.0000
Who completed the test_Others: 0.0000
Who completed the test_Self: 0.0000
Who completed the test_Health care professional: 0.0000
Who completed the test_family member: 0.0000
Ethnicity_Pacifica: 0.0000
Ethnicity_black: 0.0038
Ethnicity_Others: 0.0064
Testing Set Accuracy: 0.9900497512437811


In [2]:
print("CORRELATION ATTRIBUTE EVALUATOR")
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)

# Correlation-based feature selection
num_features_to_select = 10  # You can adjust this value
selector = SelectKBest(score_func=f_classif, k=num_features_to_select)
selector.fit(X_train, y_train)

# Get selected features
selected_features_indices = selector.get_support(indices=True)
selected_features_names = result_df.columns[selected_features_indices]

# Filter selected features in the dataset
X_train_selected = X_train[selected_features_names]
X_test_selected = X_test[selected_features_names]

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),
                               n_estimators=100, learning_rate=0.5, algorithm='SAMME.R', random_state=42)

# Fit AdaBoost classifier
ada_boost.fit(X_train_selected, y_train)

feature_importance_scores = ada_boost.feature_importances_


# Sum the feature importance scores for categorical attributes
categorical_columns = category_features.columns
categorical_importance_scores = {}
for i, col in enumerate(categorical_columns):
    indices = [j for j in range(10) if col in X_train.columns[j]]
    categorical_importance_scores[col] = np.sum(feature_importance_scores[indices])

# Print the summed feature importance scores for categorical attributes
print("Summed Feature Importance Scores for Categorical Attributes:")
for col, importance_score in categorical_importance_scores.items():
    print(f"{col}: {importance_score:.4f}")


# Print the feature importance scores
print("Feature Importance Scores:")
for feature, importance_score in zip(selected_features_names, ada_boost.feature_importances_):
    print(f"{feature}: {importance_score:.4f}")

# Test the model on the separate testing set
y_pred_test = ada_boost.predict(X_test_selected)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)

CORRELATION ATTRIBUTE EVALUATOR
Summed Feature Importance Scores for Categorical Attributes:
Sex: 0.0000
Ethnicity: 0.0000
Jaundice: 0.0000
Family_mem_with_ASD: 0.0000
Who completed the test: 0.0000
Feature Importance Scores:
A1: 0.1080
A2: 0.0919
A3: 0.0714
A4: 0.1152
A5: 0.1055
A6: 0.1066
A7: 0.0940
A8: 0.1113
A9: 0.0866
A10: 0.1096
Testing Set Accuracy: 0.9950248756218906


In [None]:
print('INFO GAIN ATTRIBUTE EVALUATOR')
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.2, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),n_estimators=64,learning_rate=0.5,algorithm='SAMME.R',random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('quantile', QuantileTransformer(n_quantiles=10, random_state=42))
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and AdaBoost classifier for training  
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation on training set
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Train the model on the entire training set
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_cv = pipeline_cv.predict(X_test)
test_accuracy_cv = accuracy_score(y_test, y_pred_cv)
print("Testing Set Accuracy  :", test_accuracy_cv)

# Create pipeline with preprocessing and AdaBoost classifier for testing without cross-validation
pipeline_test = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Train the model on the entire training set without cross-validation
pipeline_test.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_test.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import precision_score, f1_score, roc_curve, auc, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef

print("printing precision")
print(precision_score(y_test, y_pred_test, average='macro'))
print("f1-score")

# # F1 Score
print(f1_score(y_test, y_pred_test,average='macro'))

import matplotlib.pyplot as plt

# ... (your existing code)

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

from sklearn.metrics import roc_auc_score

# Test the model on the separate testing set
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)

print('recall')
# # Recall
print(recall_score(y_test, y_pred_test,average='macro'))

print('kappa score')
# # Kappa Score
print(cohen_kappa_score(y_test, y_pred_test))

print('log loss')
# # Log Loss
print(log_loss(y_test, pipeline_cv.predict_proba(X_test)))

print('MCC')
# # Matthews Correlation Coefficient
print(matthews_corrcoef(y_test, y_pred_test))

# Train AdaBoost classifier without feature selection for obtaining feature importances
ada_boost.fit(X_train, y_train)

# Get feature importances
feature_importances = np.mean([
    tree.feature_importances_ for tree in ada_boost.estimators_
], axis=0)

# Sum feature importances for each categorical column
categorical_importances = {}
for col in category_features.columns:
    cat_mask = result_df.columns.str.startswith(col)
    cat_importance = np.sum(feature_importances[cat_mask])
    categorical_importances[col] = cat_importance

# Print feature importances for each categorical column
print("\nFeature Importances for Categorical Columns:")
for col, importance in categorical_importances.items():
    print(f"{col}: {importance}")

# Sum feature importances for numerical columns
numerical_importances = {}
for col in ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']:
    num_importance = np.sum(feature_importances[result_df.columns == col])
    numerical_importances[col] = num_importance

# Print feature importances for each numerical column
print("\nFeature Importances for Numerical Columns:")
for col, importance in numerical_importances.items():
    print(f"{col}: {importance}")


In [None]:
print('GAIN RATIO ATTRIBUTE EVALUATOR -  not working')
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns from original features
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One-hot encoding for categorical features
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, pd.DataFrame(category_transformed, columns=category_encoded_columns)], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
base_estimator = DecisionTreeClassifier(max_depth=5, min_samples_split=3, criterion='entropy')
ada_boost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=300, learning_rate=0.1, algorithm='SAMME.R', random_state=42)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.5, 1.0]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Test the model on the separate testing set
y_pred_test = grid_search.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)

# Calculate precision and F1 score
precision = precision_score(y_test, y_pred_test, average='macro')
f1 = f1_score(y_test, y_pred_test, average='macro')

print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

# Retrieve the best model from GridSearchCV
best_model = grid_search.best_estimator_

# Calculate and print feature importances using Gain Ratio Attribute Evaluator
# Get base estimator (decision tree) from AdaBoost classifier
base_estimator = best_model.named_steps['classifier'].estimators_[0]

# Calculate feature importances
if hasattr(base_estimator, 'feature_importances_'):
    # Get feature importances
    feature_importances = base_estimator.feature_importances_

    # Get feature names from preprocessor
    num_feature_names = list(best_model.named_steps['preprocessor'].transformers_[0][2])
    cat_feature_names = list(best_model.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(category_encoded_columns))

    # Combine numerical and categorical feature names
    feature_names = num_feature_names + cat_feature_names

    # Print feature importances
    print("\nFeature Importances using Gain Ratio Attribute Evaluator:")
    for name, importance in zip(feature_names, feature_importances):
        print(f"{name}: {importance:.4f}")
else:
    print("Base estimator does not support feature importances.")
