In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler

# Load dataset
df = pd.read_csv("D:\Sem 6\Mini Project\csv_result-Autism-Child-Data.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

# Drop categorical columns
features.drop(features.columns[-10:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.3, random_state=41)

# Perform random oversampling only on the training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2), n_estimators=32, learning_rate=0.1, algorithm='SAMME.R', random_state=35)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())  # Using Normalizer
        ]), ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier for training and testing
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Fit the pipeline on resampled training data
pipeline_cv.fit(X_train_resampled, y_train_resampled)

# Test the model on the separate testing set
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy  :", test_accuracy_test)

# Calculate feature importance using Info Gain Attribute Evaluator
# Since AdaBoost does not directly provide feature importance, we can use SelectKBest with mutual_info_classif
selector = SelectKBest(score_func=mutual_info_classif, k='all')
selector.fit(X_train_resampled, y_train_resampled)

# Get selected features and their scores
selected_features = result_df.columns[selector.get_support()]
feature_scores = selector.scores_

# Print feature importance scores
print("Feature Importance Scores:")
for feature, score in zip(selected_features, feature_scores):
    print(f"{feature}: {score}")


Testing Set Accuracy  : 0.9772727272727273
Feature Importance Scores:
A1_Score: 0.07220600483359552
A2_Score: 0.027542626452478736
A3_Score: 0.10835166682575825
A4_Score: 0.1614893705122702
A5_Score: 0.06447992248817047
A6_Score: 0.1019493287148372
A7_Score: 0.0
A8_Score: 0.10183904316889492
A9_Score: 0.07730431009874472
A10_Score: 0.07795845532766577
age_10: 0.000887356748143775
age_11: 0.018214551879077057
age_4: 0.0
age_5: 0.0
age_6: 0.0
age_7: 0.007419032497109068
age_8: 0.014782100374195473
age_9: 0.0874122201627816
age_?: 0.01025655063836517
gender_f: 0.006456573357079609
gender_m: 0.027611379091324384
ethnicity_?: 0.0398044759367393
ethnicity_Asian: 0.0
ethnicity_Black: 0.0
ethnicity_Hispanic: 0.03452271152835795
ethnicity_Latino: 0.0
ethnicity_Middle Eastern : 0.0
ethnicity_Others: 0.018291052481278713
ethnicity_Pasifika: 0.0
ethnicity_South Asian: 0.0
ethnicity_Turkish: 0.04975573266091193
ethnicity_White-European: 0.02174149421168159
jundice_no: 0.03165336514265671
jundice_ye

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import RandomOverSampler

# Load dataset
df = pd.read_csv("D:\Sem 6\Mini Project\csv_result-Autism-Child-Data.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

# Drop categorical columns
features.drop(features.columns[-10:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.3, random_state=41)

# Perform random oversampling only on the training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2), n_estimators=32, learning_rate=0.1, algorithm='SAMME.R', random_state=35)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())  # Using Normalizer
        ]), ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier for training and testing
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Fit the pipeline on resampled training data
pipeline_cv.fit(X_train_resampled, y_train_resampled)

# Test the model on the separate testing set
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy  :", test_accuracy_test)

# Calculate feature importance using Info Gain Attribute Evaluator
# Since AdaBoost does not directly provide feature importance, we can use SelectKBest with mutual_info_classif
selector = SelectKBest(score_func=mutual_info_classif, k='all')
selector.fit(X_train_resampled, y_train_resampled)

# Get selected features and their scores
selected_features = result_df.columns[selector.get_support()]
feature_scores = selector.scores_

# Print feature importance scores
print("Feature Importance Scores:")
for feature, score in zip(selected_features, feature_scores):
    print(f"{feature}: {score}")

# Sum up feature importances of one-hot encoded features by their original categorical features
feature_importance_dict = {}
for col in category_features.columns:
    relevant_columns = [column for column in result_df.columns if col in column]
    importance_sum = sum(selector.scores_[selected_features.get_loc(column)] for column in relevant_columns)
    feature_importance_dict[col] = importance_sum

# Print summed feature importances
print("Summed Feature Importances for One-Hot Encoded Features:")
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


Testing Set Accuracy  : 0.9772727272727273
Feature Importance Scores:
A1_Score: 0.05053117308599364
A2_Score: 0.007296426650204246
A3_Score: 0.07659475443503982
A4_Score: 0.1825474428438567
A5_Score: 0.1011718137071067
A6_Score: 0.13548521492543952
A7_Score: 0.0
A8_Score: 0.07789105743406521
A9_Score: 0.13676703381109245
A10_Score: 0.09218741549919796
age_10: 0.0
age_11: 0.0
age_4: 0.0
age_5: 0.0
age_6: 0.043989899324985204
age_7: 0.0
age_8: 0.024881839679542495
age_9: 0.0
age_?: 0.025797188608910115
gender_f: 0.03661436987001654
gender_m: 0.05775281086827033
ethnicity_?: 0.0029840075013360945
ethnicity_Asian: 0.0068790221440149235
ethnicity_Black: 0.0
ethnicity_Hispanic: 0.05840227822334554
ethnicity_Latino: 0.0
ethnicity_Middle Eastern : 0.0
ethnicity_Others: 0.02307148751872301
ethnicity_Pasifika: 0.03595544935128281
ethnicity_South Asian: 0.06358858792793476
ethnicity_Turkish: 0.05658476811144264
ethnicity_White-European: 0.0
jundice_no: 0.0
jundice_yes: 0.0
austim_no: 0.0686047639

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from skrebate import ReliefF

# Load dataset
df = pd.read_csv("D:\Sem 6\Mini Project\csv_result-Autism-Child-Data.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

# Drop categorical columns
features.drop(features.columns[-10:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.3, random_state=41)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2), n_estimators=32, learning_rate=0.1, algorithm='SAMME.R', random_state=35)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())  # Using Normalizer
        ]), slice(0, -10)),  # Numerical features
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), slice(-10, None))  # Categorical features
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier for training and testing
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Fit the pipeline on original training data
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy  :", test_accuracy_test)

# Calculate feature importance using ReliefF
reliefF = ReliefF(n_features_to_select='all', n_neighbors=100)
reliefF.fit(X_train.values, y_train.values)

# Get selected features and their scores
selected_features = result_df.columns[reliefF.top_features_]
feature_scores = reliefF.feature_importances_

# Print feature importance scores
print("Feature Importance Scores:")
for feature, score in zip(selected_features, feature_scores):
    print(f"{feature}: {score}")

# Sum up feature importances of one-hot encoded features by their original categorical features
feature_importance_dict = {}
for col in category_features.columns:
    relevant_columns = [column for column in result_df.columns if col in column]
    importance_sum = sum(reliefF.feature_importances_[selected_features.get_loc(column)] for column in relevant_columns)
    feature_importance_dict[col] = importance_sum

# Print summed feature importances for original categorical features
print("Summed Feature Importances for Original Categorical Features:")
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


Testing Set Accuracy  : 0.9772727272727273
Feature Importance Scores:
A4_Score: 0.10544875682231655
A9_Score: 0.03356882959369316
A8_Score: 0.15856428138265613
A3_Score: 0.3519178289872655
A6_Score: 0.1290378006872852
A10_Score: 0.15793662825955113
A5_Score: 0.08723266626238124
A1_Score: 0.17313068526379632
result_8: 0.19401202749140872
result_6: 0.149489589650293
result_5: -0.0006989084293511215
A7_Score: -0.0006372549019607859
result_7: -0.0010501313927632934
result_9: 0.0007797655144532045
result_4: 0.00033454618960986385
A2_Score: 0.0010026278552658172
result_3: -0.0004169193450576059
result_10: 0.0025005053567818875
contry_of_res_United States: 0.0014028704265211243
result_2: -0.004411764705882353
contry_of_res_India: -0.004411764705882353
ethnicity_South Asian: 0.0050692338791186575
contry_of_res_Jordan: -0.0021144127754194474
ethnicity_?: -0.0005392156862745111
relation_?: -0.0006862745098039211
relation_Parent: -0.0009743278754800878
result_1: 0.0002683444511825342
relation_Rel

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import f_classif
from imblearn.over_sampling import RandomOverSampler

# Load dataset
df = pd.read_csv("D:\Sem 6\Mini Project\csv_result-Autism-Child-Data.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

# Drop categorical columns
features.drop(features.columns[-10:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.3, random_state=41)

# Perform random oversampling only on the training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2), n_estimators=32, learning_rate=0.1, algorithm='SAMME.R', random_state=35)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())  # Using Normalizer
        ]), ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and AdaBoost classifier for training and testing
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Fit the pipeline on resampled training data
pipeline_cv.fit(X_train_resampled, y_train_resampled)

# Test the model on the separate testing set
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy  :", test_accuracy_test)

# Calculate feature importance using Corelation Attribute Evaluator
f_scores, _ = f_classif(X_train_resampled, y_train_resampled)

# Replace NaN with 0 in feature importances
f_scores = np.nan_to_num(f_scores, nan=0)

# Normalize feature importance scores between 0 and 1
f_scores_normalized = (f_scores - f_scores.min()) / (f_scores.max() - f_scores.min())

# Print normalized feature importance scores
print("Normalized Feature Importance Scores (Correlation with Target):")
for feature, score in zip(result_df.columns, f_scores_normalized):
    print(f"{feature}: {score}")

# Sum up feature importances of one-hot encoded features by their original categorical features
feature_importance_dict = {}
for col in category_features.columns:
    relevant_columns = [column for column in result_df.columns if col in column]
    importance_sum = sum(f_scores_normalized[result_df.columns.get_loc(column)] for column in relevant_columns)
    feature_importance_dict[col] = importance_sum

# Print summed feature importances
print("Summed Feature Importances for One-Hot Encoded Features:")
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


Testing Set Accuracy  : 0.9772727272727273
Normalized Feature Importance Scores (Correlation with Target):
A1_Score: 0.2155712534826406
A2_Score: 0.07715953073470126
A3_Score: 0.4269314046081445
A4_Score: 1.0
A5_Score: 0.3664031678259957
A6_Score: 0.4214406147034598
A7_Score: 0.1445611526447056
A8_Score: 0.39459236364123423
A9_Score: 0.4160365513392855
A10_Score: 0.3664031678259957
age_10: 0.003418928045186629
age_11: 0.014697925464527032
age_4: 0.0001649824018771272
age_5: 0.0026261082117705853
age_6: 0.0003683815357747539
age_7: 0.0015278616110623532
age_8: 0.007042315366456824
age_9: 0.014697925464527032
age_?: 0.0
gender_f: 0.0016479492187500802
gender_m: 0.0016479492187498018
ethnicity_?: 0.004794034090909089
ethnicity_Asian: 0.0011381519784172253
ethnicity_Black: 0.0024896056866952634
ethnicity_Hispanic: 0.033790958737864064
ethnicity_Latino: 0.015117866192084935
ethnicity_Middle Eastern : 0.0
ethnicity_Others: 0.006014634936635936
ethnicity_Pasifika: 0.0
ethnicity_South Asian: 0

  f = msb / msw
