In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,r2_score,confusion_matrix
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [35]:
df = pd.read_csv('/kaggle/input/autismprediction/train.csv')
df.head()

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,1,0,1,0,1,0,1,...,f,?,no,no,Austria,no,6.351166,18 and more,Self,0
1,2,0,0,0,0,0,0,0,0,0,...,m,?,no,no,India,no,2.255185,18 and more,Self,0
2,3,1,1,1,1,1,1,1,1,1,...,m,White-European,no,yes,United States,no,14.851484,18 and more,Self,1
3,4,0,0,0,0,0,0,0,0,0,...,f,?,no,no,United States,no,2.276617,18 and more,Self,0
4,5,0,0,0,0,0,0,0,0,0,...,m,?,no,no,South Africa,no,-4.777286,18 and more,Self,0


* We don't need ID as it is just an index count  
* As age_desc has only one value, it doesn't make any difference  
* The result col is not reliable, since it contains invalid values (negative and >10), so we can either drop it or recompute it directly from Q1–Q10  
* The 'relation' column (relation of the person filling the form) does not directly affect the diagnosis and can be ignored  
* The 'used_app_before' column is not relevant to the actual condition and can be removed  
* Features like 'gender', 'ethnicity', 'jaundice', 'austim', and 'contry_of_res' may hold some correlation with the condition and will be retained  
* The most important features are Q1–Q10, which represent the actual screening test questions and directly contribute to the autism score

In [36]:
df = df.drop(['ID','age_desc','relation','used_app_before','result'],axis = 1)
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,Class/ASD
0,1,0,1,0,1,0,1,0,1,1,38.172746,f,?,no,no,Austria,0
1,0,0,0,0,0,0,0,0,0,0,47.750517,m,?,no,no,India,0
2,1,1,1,1,1,1,1,1,1,1,7.380373,m,White-European,no,yes,United States,1
3,0,0,0,0,0,0,0,0,0,0,23.561927,f,?,no,no,United States,0
4,0,0,0,0,0,0,0,0,0,0,43.20579,m,?,no,no,South Africa,0


In [37]:
for col in df.columns:
    if col not in ['age']:
        print(col, df[col].unique())
        print("-" * 40)

A1_Score [1 0]
----------------------------------------
A2_Score [0 1]
----------------------------------------
A3_Score [1 0]
----------------------------------------
A4_Score [0 1]
----------------------------------------
A5_Score [1 0]
----------------------------------------
A6_Score [0 1]
----------------------------------------
A7_Score [1 0]
----------------------------------------
A8_Score [0 1]
----------------------------------------
A9_Score [1 0]
----------------------------------------
A10_Score [1 0]
----------------------------------------
gender ['f' 'm']
----------------------------------------
ethnicity ['?' 'White-European' 'Middle Eastern ' 'Pasifika' 'Black' 'Others'
 'Hispanic' 'Asian' 'Turkish' 'South Asian' 'Latino' 'others']
----------------------------------------
jaundice ['no' 'yes']
----------------------------------------
austim ['no' 'yes']
----------------------------------------
contry_of_res ['Austria' 'India' 'United States' 'South Africa' 'Jordan'
 '

In [38]:
df['ethnicity'] = df['ethnicity'].replace(['?','others','Others'], 'Others')
df['age'] = df['age'].astype(int)
df['contry_of_res'] = df['contry_of_res'].replace(['Viet Nam'], 'Vietnam')
df['contry_of_res'] = df['contry_of_res'].replace(['China','Hong Kong'], 'China')
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,Class/ASD
0,1,0,1,0,1,0,1,0,1,1,38,f,Others,no,no,Austria,0
1,0,0,0,0,0,0,0,0,0,0,47,m,Others,no,no,India,0
2,1,1,1,1,1,1,1,1,1,1,7,m,White-European,no,yes,United States,1
3,0,0,0,0,0,0,0,0,0,0,23,f,Others,no,no,United States,0
4,0,0,0,0,0,0,0,0,0,0,43,m,Others,no,no,South Africa,0


In [44]:
for col in df.columns:
    if col not in ['age','result'] and 'Score' not in col:
        print(col, df[col].unique())
        print("-" * 40)
print(df.info())

gender ['f' 'm']
----------------------------------------
ethnicity ['Others' 'White-European' 'Middle Eastern ' 'Pasifika' 'Black' 'Hispanic'
 'Asian' 'Turkish' 'South Asian' 'Latino']
----------------------------------------
jaundice ['no' 'yes']
----------------------------------------
austim ['no' 'yes']
----------------------------------------
contry_of_res ['Austria' 'India' 'United States' 'South Africa' 'Jordan'
 'United Kingdom' 'Brazil' 'New Zealand' 'Canada' 'Kazakhstan'
 'United Arab Emirates' 'Australia' 'Ukraine' 'Iraq' 'France' 'Malaysia'
 'Vietnam' 'Egypt' 'Netherlands' 'Afghanistan' 'Oman' 'Italy'
 'AmericanSamoa' 'Bahamas' 'Saudi Arabia' 'Ireland' 'Aruba' 'Sri Lanka'
 'Russia' 'Bolivia' 'Azerbaijan' 'Armenia' 'Serbia' 'Ethiopia' 'Sweden'
 'Iceland' 'China' 'Angola' 'Germany' 'Spain' 'Tonga' 'Pakistan' 'Iran'
 'Argentina' 'Japan' 'Mexico' 'Nicaragua' 'Sierra Leone' 'Czech Republic'
 'Niger' 'Romania' 'Cyprus' 'Belgium' 'Burundi' 'Bangladesh']
--------------------------

****No null values found****

In [42]:
df.isnull().sum()

A1_Score         0
A2_Score         0
A3_Score         0
A4_Score         0
A5_Score         0
A6_Score         0
A7_Score         0
A8_Score         0
A9_Score         0
A10_Score        0
age              0
gender           0
ethnicity        0
jaundice         0
austim           0
contry_of_res    0
Class/ASD        0
dtype: int64

In [43]:
df['Class/ASD'].value_counts()

Class/ASD
0    639
1    161
Name: count, dtype: int64

In [45]:
df.nunique()

A1_Score          2
A2_Score          2
A3_Score          2
A4_Score          2
A5_Score          2
A6_Score          2
A7_Score          2
A8_Score          2
A9_Score          2
A10_Score         2
age              79
gender            2
ethnicity        10
jaundice          2
austim            2
contry_of_res    55
Class/ASD         2
dtype: int64

Now using LabelEncoder, I will transform the String data to int

In [46]:
encoders = {}

for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,Class/ASD
0,1,0,1,0,1,0,1,0,1,1,38,0,5,0,0,7,0
1,0,0,0,0,0,0,0,0,0,0,47,1,5,0,0,24,0
2,1,1,1,1,1,1,1,1,1,1,7,1,9,0,1,53,1
3,0,0,0,0,0,0,0,0,0,0,23,0,5,0,0,53,0
4,0,0,0,0,0,0,0,0,0,0,43,1,5,0,0,45,0


In [47]:
print(df['Class/ASD'].value_counts())

Class/ASD
0    639
1    161
Name: count, dtype: int64


The dataset is highly imbalaced

In [48]:
X = df.drop('Class/ASD',axis=1)
y = df['Class/ASD']

smote = SMOTE(sampling_strategy = 0.8, random_state = 42)
X_smote,y_smote = smote.fit_resample(X,y)

print(X_smote.shape)

(1150, 16)


In [49]:
print(y_smote.value_counts())
X_train,X_test,y_train,y_test = tts(X_smote,y_smote,test_size=0.2,random_state=42)

print('-'*30)
print(y_train.value_counts())
print(y_test.value_counts())

Class/ASD
0    639
1    511
Name: count, dtype: int64
------------------------------
Class/ASD
0    503
1    417
Name: count, dtype: int64
Class/ASD
0    136
1     94
Name: count, dtype: int64


In [52]:
print(X_train.shape)
print(y_train.shape)

(920, 16)
(920,)


In [53]:
X_train.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res
409,0,0,0,0,0,0,0,1,0,0,38,0,7,0,0,51
100,0,0,0,0,0,0,0,0,0,1,18,0,5,0,0,24
168,1,1,0,0,0,0,0,0,0,0,32,0,5,1,0,6
575,1,0,1,1,0,0,1,0,0,1,9,1,5,1,0,53
750,0,0,0,0,0,0,0,0,0,0,21,1,5,0,0,51


In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [55]:
results = []
best_model = None
best_score = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    results.append([name, acc, f1, roc])

    if roc > best_score:
        best_score = roc
        best_model = (name, model)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
# Make results DataFrame
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "F1-Score", "ROC-AUC"])
print("\n📊 Model Comparison:\n")
print(results_df.sort_values(by="ROC-AUC", ascending=False))


📊 Model Comparison:

                    Model  Accuracy  F1-Score   ROC-AUC
2           Random Forest  0.900000  0.887805  0.955022
3                 XGBoost  0.891304  0.875622  0.945557
0     Logistic Regression  0.782609  0.752475  0.882431
1  Support Vector Machine  0.782609  0.731183  0.849578


In [57]:
import joblib
# Save best model
best_name, best_clf = best_model
joblib.dump(best_clf, f"/kaggle/working/{best_name.replace(' ', '_')}_best_model.pkl")

print(f"\n✅ Best model is {best_name} (ROC-AUC = {best_score:.4f})")


✅ Best model is Random Forest (ROC-AUC = 0.9550)


In [58]:
joblib.dump(encoders, "encoders.pkl")

['encoders.pkl']