In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
df = pd.read_csv("D:\\Sem 6\\Mini Project\\archive\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]
toddlers_df=df
category_features = features.iloc[:, [12, 13, 14, 15, 16]]
features.drop(features.columns[-6:], axis=1, inplace=True)
features = features.fillna(features.mean())
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)
result_df = pd.concat([features, transformed_df], axis=1)
X_train,X_test,y_train,y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=2),
                               n_estimators=100, learning_rate=0.5, algorithm='SAMME.R', random_state=42)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]),['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.5, 1.0]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
y_pred_test = grid_search.predict(X_test)
adaboost_preds1 = grid_search.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)
from sklearn.metrics import precision_score, f1_score, roc_curve, auc, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef
print("printing precision")
print(precision_score(y_test, y_pred_test, average='macro'))
print("f1-score")
print(f1_score(y_test, y_pred_test,average='macro'))
pipeline.fit(X_train, y_train)
y_prob_test = pipeline.predict_proba(X_test)[:, 1]
y_pred_test = pipeline.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy  :", test_accuracy_test)
from sklearn.metrics import roc_auc_score
y_prob_test = pipeline.predict_proba(X_test)[:, 1]
y_pred_test = pipeline.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy without cross-validation:", test_accuracy_test)
accuracy_nor_ab=test_accuracy_test
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)
print('recall')
print(recall_score(y_test, y_pred_test,average='macro'))
print('kappa score')
print(cohen_kappa_score(y_test, y_pred_test))
print('log loss')
print(log_loss(y_test, pipeline.predict_proba(X_test)))
print('MCC')
print(matthews_corrcoef(y_test, y_pred_test))

Best Parameters: {'classifier__learning_rate': 1.0, 'classifier__n_estimators': 150}
Testing Set Accuracy: 0.9900497512437811
printing precision
0.9885378649635037
f1-score
0.9885378649635037
Testing Set Accuracy  : 0.9701492537313433
Testing Set Accuracy without cross-validation: 0.9701492537313433
ROC AUC: 0.9977189781021898
recall
0.9614507299270073
kappa score
0.9306497987349052
log loss
0.301509459733788
MCC
0.9308985603321833


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
df = pd.read_csv("D:\\Sem 6\\Mini Project\\Child\\csv_result-Autism-Child-Data.csv")
df.drop(df.columns[0], axis=1, inplace=True)
child_df=df
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]
category_features = features.iloc[:, [10, 11, 12, 13, 14, 15, 16,17, 18, 19]]
features.drop(features.columns[-10:], axis=1, inplace=True)
features = features.fillna(features.mean())
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)
result_df = pd.concat([features, transformed_df], axis=1)
X_train,X_test,y_train,y_test = train_test_split(result_df, labels, test_size=0.3, random_state=41)
ada_boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2, min_samples_split=2), n_estimators=32, learning_rate=0.1, algorithm='SAMME.R', random_state=35)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]),['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), category_encoded_columns)
    ],
    remainder='passthrough'
)
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))
pipeline_cv.fit(X_train, y_train)
y_pred_test = pipeline_cv.predict(X_test)
adaboost_preds2 = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy with cross-validation:", test_accuracy_test)
from sklearn.metrics import precision_score, f1_score, roc_curve, auc, recall_score, cohen_kappa_score, log_loss, matthews_corrcoef
print("printing precision")
print(precision_score(y_test, y_pred_test, average='macro'))
print("f1-score")
print(f1_score(y_test, y_pred_test,average='macro'))
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy without cross-validation:", test_accuracy_test)
from sklearn.metrics import roc_auc_score
y_prob_test = pipeline_cv.predict_proba(X_test)[:, 1]
y_pred_test = pipeline_cv.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy without cross-validation:", test_accuracy_test)
accuracy_nor_ab=test_accuracy_test
roc_auc = roc_auc_score(y_test, y_prob_test)
print("ROC AUC:", roc_auc)
print('recall')
print(recall_score(y_test, y_pred_test,average='macro'))
print('kappa score')
print(cohen_kappa_score(y_test, y_pred_test))
print('log loss')
print(log_loss(y_test, pipeline_cv.predict_proba(X_test)))
print('MCC')
print(matthews_corrcoef(y_test, y_pred_test))



Cross-validation scores: [1.   1.   1.   1.   0.95 1.   1.   1.   0.95 1.  ]
Mean CV accuracy: 0.99




Testing Set Accuracy with cross-validation: 0.9772727272727273
printing precision
0.9821428571428572
f1-score
0.9757575757575758
Testing Set Accuracy without cross-validation: 0.9772727272727273
Testing Set Accuracy without cross-validation: 0.9772727272727273
ROC AUC: 0.9733115468409586
recall
0.9705882352941176
kappa score
0.9515418502202643
log loss
0.19739487111688303
MCC
0.9526610232449336


In [4]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, cohen_kappa_score, log_loss, matthews_corrcoef
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
data, meta = arff.loadarff("D:\\Sem 6\\Mini Project\\autistic+spectrum+disorder+screening+data+for+adolescent\\Autism-Adolescent-Data.arff")
dtype_mapping = {
    'A1_Score': 'bool',
    'A2_Score': 'bool',
    'A3_Score': 'bool',
    'A4_Score': 'bool',
    'A5_Score': 'bool',
    'A6_Score': 'bool',
    'A7_Score': 'bool',
    'A8_Score': 'bool',
    'A9_Score': 'bool',
    'A10_Score': 'bool',
    'age': 'float',
    'gender': 'str',
    'ethnicity': 'str',
    'jundice': 'bool',
    'austim': 'bool',
    'contry_of_res': 'str',
    'used_app_before': 'bool',
    'result': 'float',
    'age_desc': 'str',
    'relation': 'str',
    'Class/ASD': 'str' 
}
for attr in meta.names():
    data[attr] = np.char.strip(np.char.mod('%s', data[attr].astype(str)))
    data[attr][data[attr] == ''] = np.nan
for attr in meta.names():
    if meta[attr][0] == 'nominal':
        data[attr] = data[attr].astype(str)
df = pd.DataFrame(data, columns=meta.names())
df = df.astype(dtype_mapping)
adolescent_df=df
nominal_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_df = df[nominal_columns]
for col in categorical_df.columns:
    mode_val = categorical_df[col].mode()[0]
    categorical_df[col].fillna(mode_val, inplace=True)
non_categorical_columns = [col for col in df.columns if col not in nominal_columns]
non_categorical_df = df[non_categorical_columns]
bool_columns_with_missing = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'bool' and non_categorical_df[col].isnull().any()]
if bool_columns_with_missing:
    print("Missing values found in columns with bool values. Cannot proceed with mean value imputation.")
else:
    float_columns = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'float64']
    non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_data = encoder.fit_transform(categorical_df)
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_df.columns))
joined_df = pd.concat([non_categorical_df, encoded_df], axis=1)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
X = joined_df.iloc[:, :-1] 
y = joined_df.iloc[:, -1]   
accuracy_list={}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=210)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(y)
class_counts = np.bincount(labels_encoded)
prior_probabilities = class_counts / len(labels_encoded)
means = np.mean(prior_probabilities, axis=0)  
variances = np.var(prior_probabilities, axis=0)  
pipeline = Pipeline([
    ('transformer', QuantileTransformer(n_quantiles=35,output_distribution='uniform',subsample=60, random_state=91)),
    ('oversampler', RandomOverSampler(random_state=12)),
    ('classifier', LinearDiscriminantAnalysis(solver='svd',priors=prior_probabilities, store_covariance=True, tol=0.99999999))
])
pipeline.fit(X_train, y_train)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
accuracy_scores = cross_val_score(pipeline, X_train, y_train, cv=cv)
print("\nCross-validation Accuracy (mean):", accuracy_scores.mean())
y_pred = pipeline.predict(X_test)
lda_preds1 = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on testing dataset(QT_LDA):", accuracy)
accuracy_qt_lda=accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print("Other Parameters")
print("Precision :", precision)
print("Recall :", recall)
print("ROC AUC :", roc_auc)
print("F1-score :", f1)
print("Kappa :", kappa)
print("Log Loss :", logloss)
print("MCC :", mcc)


  data[attr][data[attr] == ''] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df[col].fillna(mode_val, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())



Cross-validation Accuracy (mean): 0.8866666666666667
Accuracy on testing dataset(QT_LDA): 0.9615384615384616
Other Parameters
Precision : 0.9705882352941176
Recall : 0.9705882352941176
ROC AUC : 0.9575163398692811
F1-score : 0.9705882352941176
Kappa : 0.9150326797385621
Log Loss : 1.3862943611198906
MCC : 0.9150326797385621


In [5]:
import pandas as pd
import numpy as np
from scipy.io import arff
data, meta = arff.loadarff("D:\\Sem 6\\Mini Project\\autism+screening+adult\\Autism-Adult-Data.arff")
dtype_mapping = {
    'A1_Score': 'bool',
    'A2_Score': 'bool',
    'A3_Score': 'bool',
    'A4_Score': 'bool',
    'A5_Score': 'bool',
    'A6_Score': 'bool',
    'A7_Score': 'bool',
    'A8_Score': 'bool',
    'A9_Score': 'bool',
    'A10_Score': 'bool',
    'age': 'float',
    'gender': 'str',
    'ethnicity': 'str',
    'jundice': 'bool',
    'austim': 'bool',
    'contry_of_res': 'str',
    'used_app_before': 'bool',
    'result': 'float',
    'age_desc': 'str',
    'relation': 'str',
    'Class/ASD': 'str'  
}
for attr in meta.names():
    data[attr] = np.char.strip(np.char.mod('%s', data[attr].astype(str)))
    data[attr][data[attr] == ''] = np.nan
for attr in meta.names():
    if meta[attr][0] == 'nominal':
        data[attr] = data[attr].astype(str)
dtype_tuples = [(col, dtype_mapping[col]) for col in meta.names()]
df = pd.DataFrame(data, columns=meta.names())
df = df.astype(dtype_mapping)
adult_df=df
nominal_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_df = df[nominal_columns]
for col in categorical_df.columns:
    mode_val = categorical_df[col].mode()[0]
    categorical_df[col].fillna(mode_val, inplace=True)
non_categorical_columns = [col for col in df.columns if col not in nominal_columns]
non_categorical_df = df[non_categorical_columns]
bool_columns_with_missing = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'bool' and non_categorical_df[col].isnull().any()]
if bool_columns_with_missing:
    print("Missing values found in columns with bool values. Cannot proceed with mean value imputation.")
else:
    float_columns = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'float64']
    non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_data = encoder.fit_transform(categorical_df)
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_df.columns))
joined_df = pd.concat([non_categorical_df, encoded_df], axis=1)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
X = joined_df.iloc[:, :-1]  
y = joined_df.iloc[:, -1]    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(y)
class_counts = np.bincount(labels_encoded)
prior_probabilities = class_counts / len(labels_encoded)
means = np.mean(prior_probabilities, axis=0) 
variances = np.var(prior_probabilities, axis=0)
pipeline = Pipeline([
    ('transformer', QuantileTransformer(n_quantiles=84,output_distribution='uniform',subsample=350, random_state=15)),
    ('oversampler', RandomOverSampler(random_state=4)),
    ('classifier', LinearDiscriminantAnalysis(solver='lsqr',shrinkage=0.25,priors=prior_probabilities, store_covariance=True, tol=0.00009))
])
pipeline.fit(X_train, y_train)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = cross_val_score(pipeline, X_train, y_train, cv=cv)
y_pred = pipeline.predict(X_test)
lda_preds2 = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy on testing dataset:", accuracy)
accuracy_qt_lda=accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print("Other Parameters")
print("Precision :", precision)
print("Recall :", recall)
print("ROC AUC :", roc_auc)
print("F1-score :", f1)
print("Kappa :", kappa)
print("Log Loss :", logloss)
print("MCC :", mcc)


  data[attr][data[attr] == ''] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df[col].fillna(mode_val, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())



Accuracy on testing dataset: 0.9929078014184397
Other Parameters
Precision : 0.9743589743589743
Recall : 1.0
ROC AUC : 0.9951456310679612
F1-score : 0.9870129870129869
Kappa : 0.9821360699353858
Log Loss : 0.25562874744054737
MCC : 0.9822928170822647


In [7]:
adaboost_preds1=adaboost_preds1[:52]
adaboost_preds2=adaboost_preds2[:52]
lda_preds1=lda_preds1[:53]
lda_preds2=adaboost_preds2[:52]
def generate_synthetic_labels(predictions):
    # Example: Combine predictions using a simple rule-based approach
    synthetic_labels = []
    for pred in predictions:
        # Example: Combine predictions using a majority voting scheme
        if np.count_nonzero(pred) > len(pred) / 2:
            synthetic_labels.append('Yes')
        else:
            synthetic_labels.append('No')
    return synthetic_labels
    
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
meta_features = np.column_stack((adaboost_preds1, adaboost_preds2, lda_preds1, lda_preds2))
for i in range(0,len(meta_features)):
    for j in range(0,4):
        if meta_features[i][j]=='Yes'or meta_features[i][j]=='YES':
            meta_features[i][j]=1
        else:
            meta_features[i][j]=0
            
synthetic_labels = generate_synthetic_labels(meta_features)
X_train_meta, X_val_meta, y_train_meta, y_val_meta = train_test_split(meta_features, synthetic_labels, test_size=0.27, random_state=234)
meta_model = LinearDiscriminantAnalysis(solver='svd',priors=prior_probabilities, store_covariance=True, tol=0.999999999991)
meta_model.fit(X_train_meta, y_train_meta)
val_predictions = meta_model.predict(X_val_meta)
accuracy = accuracy_score(y_val_meta, val_predictions)
print("Meta-model validation accuracy:", accuracy)


Meta-model validation accuracy: 0.9333333333333333
