In [3]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Load ARFF file
data, meta = arff.loadarff("D:\\Sem 6\\Mini Project\\autistic+spectrum+disorder+screening+data+for+adolescent\\Autism-Adolescent-Data.arff")

# Define data types mapping
dtype_mapping = {
    'A1_Score': 'bool',
    'A2_Score': 'bool',
    'A3_Score': 'bool',
    'A4_Score': 'bool',
    'A5_Score': 'bool',
    'A6_Score': 'bool',
    'A7_Score': 'bool',
    'A8_Score': 'bool',
    'A9_Score': 'bool',
    'A10_Score': 'bool',
    'age': 'float',
    'gender': 'str',
    'ethnicity': 'str',
    'jundice': 'bool',
    'austim': 'bool',
    'contry_of_res': 'str',
    'used_app_before': 'bool',
    'result': 'float',
    'age_desc': 'str',
    'relation': 'str',
    'Class/ASD': 'str'  # Assuming this is your target variable
}

# Replace missing value symbols ('?' or '') with NaN
for attr in meta.names():
    data[attr] = np.char.strip(np.char.mod('%s', data[attr].astype(str)))
    data[attr][data[attr] == ''] = np.nan

# Convert nominal attributes to strings
for attr in meta.names():
    if meta[attr][0] == 'nominal':
        data[attr] = data[attr].astype(str)

# Convert dtype_mapping to list of tuples
dtype_tuples = [(col, dtype_mapping[col]) for col in meta.names()]

# Convert to DataFrame with specified data types
df = pd.DataFrame(data, columns=meta.names())

# Apply the specified data types
df = df.astype(dtype_mapping)

# Separate columns with nominal values into categorical_df
nominal_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_df = df[nominal_columns]

# Fill missing values in categorical columns with mode
for col in categorical_df.columns:
    mode_val = categorical_df[col].mode()[0]
    categorical_df[col].fillna(mode_val, inplace=True)

# Separate remaining columns into non_categorical_df
non_categorical_columns = [col for col in df.columns if col not in nominal_columns]
non_categorical_df = df[non_categorical_columns]

# Check for missing values in columns with bool values
bool_columns_with_missing = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'bool' and non_categorical_df[col].isnull().any()]
if bool_columns_with_missing:
    print("Missing values found in columns with bool values. Cannot proceed with mean value imputation.")
else:
    # Apply mean value imputation to float columns in non_categorical_df
    float_columns = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'float64']
    non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())


import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assuming categorical_df contains the one-hot encoded categorical columns
# and non_categorical_df contains the bool and float columns

# One-hot encode the categorical columns

encoder = OneHotEncoder(drop='first', sparse=False)
encoded_data = encoder.fit_transform(categorical_df)
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_df.columns))

# Join encoded categorical columns with bool and float columns

joined_df = pd.concat([non_categorical_df, encoded_df], axis=1)

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

# Assuming joined_df contains the joined DataFrame

# Separate features and labels
X = joined_df.iloc[:, :-1]  # Features (all columns except the last one)
y = joined_df.iloc[:, -1]   # Labels (last column)

accuracy_list={}
# Split the data into training and testing sets
  
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=210)
# 7
#0.5
        
# Instantiate LabelEncoder
label_encoder = LabelEncoder()
        
# Encode string labels into numerical values
labels_encoded = label_encoder.fit_transform(y)
        
# Now, you can calculate class counts
class_counts = np.bincount(labels_encoded)
        
# Calculate prior probabilities based on class proportions
prior_probabilities = class_counts / len(labels_encoded)
        
 # Fit Gaussian distributions to the prior probabilities
means = np.mean(prior_probabilities, axis=0)  # Calculate mean for each class
variances = np.var(prior_probabilities, axis=0)  # Calculate variance for each class
        
    
# Define the pipeline
pipeline = Pipeline([
    ('transformer', QuantileTransformer(n_quantiles=35,output_distribution='uniform',subsample=60, random_state=91)),
    ('oversampler', RandomOverSampler(random_state=12)),
    ('classifier', LinearDiscriminantAnalysis(solver='svd',priors=prior_probabilities, store_covariance=True, tol=0.99999999))
])
        
    # Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
        
    # Perform 10-fold cross-validation on training data
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
accuracy_scores = cross_val_score(pipeline, X_train, y_train, cv=cv)
        
    # Evaluate the model on the testing data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy on testing dataset:", accuracy)
# accuracy_list[i]=accuracy
    
# Find the key with the maximum value
# max_key = max(accuracy_list, key=accuracy_list.get)

# Find the maximum value
# max_value = accuracy_list[max_key]

# Print the result
# print("Key with maximum value:", max_key)
# print("Maximum value:", max_value)



  data[attr][data[attr] == ''] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df[col].fillna(mode_val, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_categorical_df[float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())



Accuracy on testing dataset: 0.9615384615384616


In [22]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Define the number of features to select based on Information Gain
k = 'all'  # Adjust as needed

# Instantiate the SelectKBest transformer with mutual_info_classif as the scoring function
ig_selector = SelectKBest(score_func=mutual_info_classif, k=k)

# Fit the SelectKBest transformer on the training data
X_train_selected = ig_selector.fit_transform(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = ig_selector.get_support(indices=True)

# Get the feature names
feature_names = X_train.columns[selected_feature_indices]

from sklearn.preprocessing import MinMaxScaler

# Train the LDA model using the selected features
lda = LinearDiscriminantAnalysis(solver='svd', priors=prior_probabilities, store_covariance=True, tol=0.99999999)
lda.fit(X_train_selected, y_train)

# Get the feature importance of each selected feature
feature_importance = lda.coef_[0]

original_feature_importance = {}

# Sum up the feature importances for original features
for feature_name, importance in zip(feature_names, lda.coef_[0]):
    original_feature_name = feature_name.split('_')[0]  # Extract original feature name
    if original_feature_name not in original_feature_importance:
        original_feature_importance[original_feature_name] = importance
    else:
        original_feature_importance[original_feature_name] += importance

# Scale feature importances to be between 0 and 1 using Min-Max scaling
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_importance = scaler.fit_transform(pd.DataFrame(original_feature_importance.values()))

# Print the scaled feature importance of each original feature
print("Scaled Feature Importance for Original Features:(Using Info Gain Attribute Evaluator)")
for original_feature_name, importance in zip(original_feature_importance.keys(), scaled_importance):
    print(f"{original_feature_name}: {importance[0]}")
    


Scaled Feature Importance for Original Features:(Using Info Gain Attribute Evaluator)
A1: 0.06592079950177052
A2: 0.06592079950177057
A3: 0.06592079950177052
A4: 0.06592079950177059
A5: 0.06592079950177059
A6: 0.06592079950177045
A7: 0.06592079950177059
A8: 0.06592079950177061
A9: 0.06592079950177043
A10: 0.06592079950177049
age: -0.1597517895768759
jundice: 0.06592079950177057
austim: 0.06592079950177056
used: 0.06592079950177043
result: 0.6044533969846521
gender: -0.1551461047354774
ethnicity: -1.0
contry: 0.9999999999999999
relation: -0.7385077869458502


In [28]:
# Define the custom gain ratio attribute evaluator function
def gain_ratio_attribute_evaluator(X, y):
    lda = LDA()
    lda.fit(X, y)
    projections = lda.transform(X)
    
    # Calculate mutual information for each feature
    mutual_infos = []
    for i in range(projections.shape[1]):
        mi = mutual_info_score(projections[:, i], y)
        mutual_infos.append(mi)
    
    # Calculate total mutual information
    total_mi = sum(mutual_infos)
    
    # Calculate split information
    split_info = -sum((np.sum(y == c) / len(y)) * np.log2(np.sum(y == c) / len(y)) for c in np.unique(y))
    
    # Calculate gain ratio for each feature
    gain_ratios = []
    for mi in mutual_infos:
        gain_ratio = mi / split_info if split_info != 0 else 0
        gain_ratios.append(gain_ratio)
    
    # Return gain ratios with None as p-values
    return gain_ratios, [None] * len(gain_ratios)

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Define the number of features to select based on Information Gain
k = 'all'  # Adjust as needed

# Instantiate the SelectKBest transformer with mutual_info_classif as the scoring function
ig_selector = SelectKBest(score_func=mutual_info_classif, k=k)

# Fit the SelectKBest transformer on the training data
X_train_selected = ig_selector.fit_transform(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = ig_selector.get_support(indices=True)

# Get the feature names
feature_names = X_train.columns[selected_feature_indices]

from sklearn.preprocessing import MinMaxScaler

# Train the LDA model using the selected features
lda = LinearDiscriminantAnalysis(solver='svd', priors=prior_probabilities, store_covariance=True, tol=0.99999999)
lda.fit(X_train_selected, y_train)

# Get the feature importance of each selected feature
feature_importance = lda.coef_[0]

original_feature_importance = {}

# Sum up the feature importances for original features
for feature_name, importance in zip(feature_names, lda.coef_[0]):
    original_feature_name = feature_name.split('_')[0]  # Extract original feature name
    if original_feature_name not in original_feature_importance:
        original_feature_importance[original_feature_name] = importance
    else:
        original_feature_importance[original_feature_name] += importance

# Scale feature importances to be between 0 and 1 using Min-Max scaling
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_importance = scaler.fit_transform(pd.DataFrame(original_feature_importance.values()))

# Print the scaled feature importance of each original feature
print("Scaled Feature Importance for Original Features:(Using Info Gain Attribute Evaluator)")
for original_feature_name, importance in zip(original_feature_importance.keys(), scaled_importance):
    print(f"{original_feature_name}: {importance[0]}")
    


Scaled Feature Importance for Original Features:(Using Info Gain Attribute Evaluator)
A1: 0.5329603997508853
A2: 0.5329603997508853
A3: 0.5329603997508853
A4: 0.5329603997508853
A5: 0.5329603997508853
A6: 0.5329603997508853
A7: 0.5329603997508853
A8: 0.5329603997508853
A9: 0.5329603997508853
A10: 0.5329603997508853
age: 0.42012410521156207
jundice: 0.5329603997508853
austim: 0.5329603997508853
used: 0.5329603997508853
result: 0.8022266984923261
gender: 0.4224269476322613
ethnicity: 0.0
contry: 1.0
relation: 0.1307461065270749


In [42]:
non_convertible_columns = []

for col in X_train.columns:
    try:
        X_train[col] = pd.to_numeric(X_train[col])
    except ValueError:
        non_convertible_columns.append(col)

if non_convertible_columns:
    print("Non-convertible columns found:", non_convertible_columns)
else:
    print("All columns successfully converted to numeric format.")

print(X_train.dtypes)

All columns successfully converted to numeric format.
A1_Score                                bool
A2_Score                                bool
A3_Score                                bool
A4_Score                                bool
A5_Score                                bool
                                      ...   
relation_Health care professional    float64
relation_Others                      float64
relation_Parent                      float64
relation_Relative                    float64
relation_Self                        float64
Length: 62, dtype: object


In [43]:
from skrebate import ReliefF

# Instantiate the ReliefF feature selector
relieff_selector = ReliefF()

# Fit ReliefF to the training data
relieff_selector.fit(X_train.values, y_train.values)

# Get the indices of selected features
selected_feature_indices = relieff_selector.top_features_

# Select the corresponding columns from X_train and X_test
X_train_selected = X_train.iloc[:, selected_feature_indices]
X_test_selected = X_test.iloc[:, selected_feature_indices]

# Now, retrain your pipeline with the selected features
pipeline.fit(X_train_selected, y_train)

# Evaluate the model on the testing data with selected features
y_pred_selected = pipeline.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)
print("\nAccuracy on testing dataset with selected features:", accuracy_selected)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [46]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, cohen_kappa_score, log_loss, matthews_corrcoef
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from skrebate import ReliefF

# Load ARFF file
data, meta = arff.loadarff("D:\\Sem 6\\Mini Project\\autistic+spectrum+disorder+screening+data+for+adolescent\\Autism-Adolescent-Data.arff")

# Define data types mapping
dtype_mapping = {
    'A1_Score': 'bool',
    'A2_Score': 'bool',
    'A3_Score': 'bool',
    'A4_Score': 'bool',
    'A5_Score': 'bool',
    'A6_Score': 'bool',
    'A7_Score': 'bool',
    'A8_Score': 'bool',
    'A9_Score': 'bool',
    'A10_Score': 'bool',
    'age': 'float',
    'gender': 'str',
    'ethnicity': 'str',
    'jundice': 'bool',
    'austim': 'bool',
    'contry_of_res': 'str',
    'used_app_before': 'bool',
    'result': 'float',
    'age_desc': 'str',
    'relation': 'str',
    'Class/ASD': 'str'  # Assuming this is your target variable
}

# Replace missing value symbols ('?' or '') with NaN
for attr in meta.names():
    data[attr] = np.char.strip(np.char.mod('%s', data[attr].astype(str)))
    data[attr][data[attr] == ''] = np.nan

# Convert nominal attributes to strings
for attr in meta.names():
    if meta[attr][0] == 'nominal':
        data[attr] = data[attr].astype(str)

# Convert to DataFrame with specified data types
df = pd.DataFrame(data, columns=meta.names())

# Apply the specified data types
df = df.astype(dtype_mapping)

# Separate columns with nominal values into categorical_df
nominal_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_df = df[nominal_columns]

# Fill missing values in categorical columns with mode
for col in categorical_df.columns:
    mode_val = categorical_df[col].mode()[0]
    categorical_df.loc[:, col] = categorical_df[col].fillna(mode_val)

# Separate remaining columns into non_categorical_df
non_categorical_columns = [col for col in df.columns if col not in nominal_columns]
non_categorical_df = df[non_categorical_columns]

# Check for missing values in columns with bool values
bool_columns_with_missing = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'bool' and non_categorical_df[col].isnull().any()]
if bool_columns_with_missing:
    print("Missing values found in columns with bool values. Cannot proceed with mean value imputation.")
else:
    # Apply mean value imputation to float columns in non_categorical_df
    float_columns = [col for col in non_categorical_df.columns if non_categorical_df[col].dtype == 'float64']
    non_categorical_df.loc[:, float_columns] = non_categorical_df[float_columns].fillna(non_categorical_df[float_columns].mean())

# One-hot encode the categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_data = encoder.fit_transform(categorical_df)
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_df.columns))

# Join encoded categorical columns with bool and float columns
joined_df = pd.concat([non_categorical_df, encoded_df], axis=1)

# Separate features and labels
X = joined_df.iloc[:, :-1]  # Features (all columns except the last one)
y = joined_df.iloc[:, -1]   # Labels (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=210)

# Instantiate LabelEncoder
label_encoder = LabelEncoder()

# Encode string labels into numerical values
labels_encoded = label_encoder.fit_transform(y)

# Now, you can calculate class counts
class_counts = np.bincount(labels_encoded)

# Calculate prior probabilities based on class proportions
prior_probabilities = class_counts / len(labels_encoded)

# Fit Gaussian distributions to the prior probabilities
means = np.mean(prior_probabilities, axis=0)  # Calculate mean for each class
variances = np.var(prior_probabilities, axis=0)  # Calculate variance for each class

# Define the pipeline
pipeline = Pipeline([
    ('transformer', QuantileTransformer(n_quantiles=35, output_distribution='uniform', subsample=60, random_state=91)),
    ('oversampler', RandomOverSampler(random_state=12)),
    ('classifier', LinearDiscriminantAnalysis(solver='svd', priors=prior_probabilities, store_covariance=True, tol=0.99999999))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Perform 10-fold cross-validation on training data
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
accuracy_scores = cross_val_score(pipeline, X_train, y_train, cv=cv)
print("\nCross-validation Accuracy (mean):", accuracy_scores.mean())

# Evaluate the model on the testing data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on testing dataset(QT_LDA):", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)

# Calculate Cohen's kappa
kappa = cohen_kappa_score(y_test, y_pred)

# Calculate log loss
logloss = log_loss(y_test, y_pred)

# Calculate Matthews correlation coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)

# Print evaluation metrics
print("Other Parameters")
print("Precision :", precision)
print("Recall :", recall)
print("ROC AUC :", roc_auc)
print("F1-score :", f1)
print("Kappa :", kappa)
print("Log Loss :", logloss)
print("MCC :", mcc)

# Instantiate ReliefF
relief = ReliefF(n_neighbors=10)

# Fit ReliefF and transform the data
X_relief = relief.fit_transform(X_train.values, y_train.values)

# Get the indices of the selected features
selected_features_indices = relief.top_features_

# Print the selected features
selected_features = X_train.columns[selected_features_indices]
print("Selected Features:", selected_features)


  data[attr][data[attr] == ''] = np.nan



Cross-validation Accuracy (mean): 0.8866666666666667
Accuracy on testing dataset(QT_LDA): 0.9615384615384616
Other Parameters
Precision : 0.9705882352941176
Recall : 0.9705882352941176
ROC AUC : 0.9575163398692811
F1-score : 0.9705882352941176
Kappa : 0.9150326797385621
Log Loss : 1.3862943611198906
MCC : 0.9150326797385621


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''