In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the dataset
df = pd.read_csv('dataset.csv')

# Preprocess the dataset
data1 = df.drop(['Case_No', 'Sex', 'Ethnicity'], axis=1)
data1 = data1.rename(columns={"Class/ASD Traits": "Class"})

# Encode categorical string values
le = LabelEncoder()
for col in data1.columns:
    if data1[col].dtype == 'object':
        data1[col] = le.fit_transform(data1[col])

le1 = LabelEncoder()
df['Sex'] = le1.fit_transform(df['Sex'])
df['Ethnicity'] = le1.fit_transform(df['Ethnicity'])

ip = data1.drop(['Class'], axis=1)
op = data1['Class']

categorical_features = [0, 1]  # Indices of categorical features
numerical_features = list(range(2, ip.shape[1]))  # Indices of numerical features

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
ip = pipeline.fit_transform(ip)

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(ip, op, test_size=0.2, random_state=42)

# Train the models
# ... (rest of the code remains the same) ...
# Standardize the numerical features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = metrics.accuracy_score(y_test, y_pred_knn)
recall_knn = metrics.recall_score(y_test, y_pred_knn)
print("Accuracy for KNN:", accuracy_knn)
print("Recall for KNN:", recall_knn)

# Train the Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = metrics.accuracy_score(y_test, y_pred_rf)
recall_rf = metrics.recall_score(y_test, y_pred_rf)
print("Accuracy for Random Forest:", accuracy_rf)
print("Recall for Random Forest:", recall_rf)

# Hybrid model (KNN + Random Forest)
ensemble = VotingClassifier(estimators=[('knn', knn), ('rf', rf)], voting='soft')
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
accuracy_ensemble = metrics.accuracy_score(y_test, y_pred_ensemble)
recall_ensemble = metrics.recall_score(y_test, y_pred_ensemble)
print("Accuracy for KNN + Random Forest ensemble:", accuracy_ensemble)
print("Recall for KNN + Random Forest ensemble:", recall_ensemble)

Accuracy for KNN: 0.9620853080568721
Recall for KNN: 0.9577464788732394
Accuracy for Random Forest: 1.0
Recall for Random Forest: 1.0
Accuracy for KNN + Random Forest ensemble: 0.990521327014218
Recall for KNN + Random Forest ensemble: 0.9859154929577465
