# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training and test CSVs of the autism-prediction dataset.
# NOTE(review): these are absolute Kaggle input paths, so the notebook only
# runs where that dataset is mounted at the same location.
df= pd.read_csv("/kaggle/input/autismprediction/train.csv")
test= pd.read_csv("/kaggle/input/autismprediction/test.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

# Data Preparation and Cleaning

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Frequency of each distinct value in the 'ethnicity' column.
df['ethnicity'].value_counts()

In [None]:
# Frequency of each distinct value in the 'relation' column.
df['relation'].value_counts()

In [None]:
# Normalize raw cell values across the whole frame: yes/no answers become 1/0
# and the '?' / 'others' placeholders are merged into one 'Others' category.
value_map = {'yes':1, 'no':0, '?':'Others', 'others':'Others'}
df = df.replace(value_map)

In [None]:
# Share of each target class. The slices are labelled with the class value so
# the chart can be read on its own.
class_counts = df['Class/ASD'].value_counts()
plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%')
plt.title('Class/ASD distribution')
plt.show()

In [None]:
# Partition the training columns by dtype: integer columns, object (string)
# columns, and everything else (treated as continuous/float below).
ints = []
objects = []
floats = []

for col in df.columns:
    col_dtype = df[col].dtype
    # is_integer_dtype matches every integer width, whereas `dtype == int`
    # only matches the platform's default integer type.
    if pd.api.types.is_integer_dtype(col_dtype):
        ints.append(col)
    elif col_dtype == object:
        objects.append(col)
    else:
        floats.append(col)

In [None]:
# Columns classified as integer-typed.
ints

In [None]:
# Columns that were neither integer- nor object-typed.
floats

In [None]:
# Columns classified as object-typed.
objects

In [None]:
# 'ID' is a row identifier and 'Class/ASD' is the target, so neither should be
# plotted as a feature. Filtering (instead of list.remove) keeps this cell safe
# to re-run: a second run no longer raises ValueError.
ints = [col for col in ints if col not in ('ID', 'Class/ASD')]

# Exploratory Analysis and Visualization

In [None]:
# One count plot per integer-coded feature, split by the target class.
# Plotting straight from `df` keeps the real column name on every x-axis, and
# creating the grid with plt.subplots avoids a stray full-figure axes behind it.
n_cols = 3
n_rows = max(5, -(-len(ints) // n_cols))  # keep the 5x3 layout, grow only if needed
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15))
axes = axes.ravel()

for ax, col in zip(axes, ints):
    sns.countplot(data=df, x=col, hue='Class/ASD', ax=ax)
    ax.set_title(col)

# Hide grid slots that have no feature to show.
for ax in axes[len(ints):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# One count plot per object-typed (categorical) feature, split by the target
# class. Each panel is titled with its column so the panels can be told apart.
n_cols = 3
n_rows = max(5, -(-len(objects) // n_cols))  # keep the 5x3 layout, grow only if needed
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 30))
axes = axes.ravel()

for ax, col in zip(axes, objects):
    sns.countplot(data=df, x=col, hue='Class/ASD', ax=ax)
    ax.set_title(col)
    # Category names can be long; tilt them so they do not overlap.
    ax.tick_params(axis='x', labelrotation=60)

# Hide grid slots that have no feature to show.
for ax in axes[len(objects):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Wide count plot of country of residence, split by the target class; the
# labels are turned vertical because there are many categories.
fig, ax = plt.subplots(figsize=(15,5))
sns.countplot(data=df, x='contry_of_res', hue='Class/ASD', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Distribution (density histogram + KDE) of every continuous feature.
# sns.distplot is deprecated; histplot with stat='density' and a KDE overlay is
# its documented replacement. The grid is sized from `floats` instead of being
# hard-coded to two panels.
fig, axes = plt.subplots(1, max(len(floats), 1), figsize=(15,5), squeeze=False)

for ax, col in zip(axes.ravel(), floats):
    sns.histplot(df[col], kde=True, stat='density', kde_kws=dict(cut=3), ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# Box plot of every continuous feature to spot outliers.
# The series is passed explicitly as `x=` because a bare positional argument is
# interpreted differently across seaborn versions (which flips the orientation).
fig, axes = plt.subplots(1, max(len(floats), 1), figsize=(15,5), squeeze=False)

for ax, col in zip(axes.ravel(), floats):
    sns.boxplot(x=df[col], ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# Keep only rows whose 'result' is greater than -5 (presumably the low outliers
# visible in the box plot above — confirm the threshold against that plot).
# .copy() makes the filtered frame independent, so the column assignments in
# later cells do not operate on a slice of the original frame.
df = df[df['result'] > -5].copy()
df.shape

In [None]:
def convertAge(age):
    """Map a numeric age to a coarse age-group label.

    Returns 'Toddler' for age < 4, 'Kid' for age < 12, 'Teenager' for
    age < 18, 'Young' for age < 40 and 'Senior' for everything else
    (including values for which every comparison is false, such as NaN).
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (4, 'Toddler'),
        (12, 'Kid'),
        (18, 'Teenager'),
        (40, 'Young'),
    )
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return 'Senior'

# Derive the categorical age group from the (still untransformed) age column.
df['ageGroup'] = df['age'].map(convertAge)


In [None]:
# Class balance within each age group.
sns.countplot(data=df, x='ageGroup', hue='Class/ASD')
plt.show()

In [None]:
def add_feature(data):
    """Add two engineered columns to ``data`` in place and return it.

    * ``sum_score`` - row-wise total of the columns from 'A1_Score' through
      'A10_Score' (label-based slice, both ends included).
    * ``ind`` - sum of the 'austim', 'used_app_before' and 'jaundice' columns.

    The input frame itself is modified; the same object is returned.
    """
    score_block = data.loc[:, 'A1_Score':'A10_Score']
    # skipna=False keeps NaN propagation identical to adding the columns one by one.
    data['sum_score'] = score_block.sum(axis=1, skipna=False)

    # Combined indicator built from the three flag columns.
    flag_columns = ['austim', 'used_app_before', 'jaundice']
    data['ind'] = data[flag_columns[0]] + data[flag_columns[1]] + data[flag_columns[2]]

    return data

# Add the engineered 'sum_score' and 'ind' columns to the training frame.
df = add_feature(df)

In [None]:
# Class balance for each value of the total screening score.
sns.countplot(data=df, x='sum_score', hue='Class/ASD')
plt.show()

In [None]:
# Log-transform the age column to reduce its skew (vectorized over the column).
df['age'] = np.log(df['age'])

In [None]:
# Distribution of the log-transformed age. sns.distplot is deprecated; histplot
# with stat='density' and a KDE overlay is its documented replacement.
sns.histplot(df['age'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
def encode_labels(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each object column is replaced by integer codes from its own freshly
    fitted ``LabelEncoder``. The encoders are not kept. The same frame is
    returned.
    """
    object_columns = [name for name in data.columns if data[name].dtype == 'object']

    for name in object_columns:
        data[name] = LabelEncoder().fit_transform(data[name])

    return data

df = encode_labels(df)

# Boolean heatmap of the correlation matrix: a cell is True where the pairwise
# correlation exceeds 0.8.
fig, ax = plt.subplots(figsize=(10,10))
high_corr = df.corr() > 0.8
sns.heatmap(high_corr, annot=True, cbar=False, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

# Modeling

In [None]:
# Columns excluded from the model inputs, in addition to the target itself.
removal = ['ID', 'age_desc', 'used_app_before', 'austim']
target = df['Class/ASD']
features = df.drop(columns=removal + ['Class/ASD'])

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size = 0.2, random_state=10)

# The classes are highly imbalanced, so the minority class is oversampled by
# repeating its rows. Only the training split is resampled; the validation
# split is left untouched.
ros = RandomOverSampler(sampling_strategy='minority',random_state=0)
X, Y = ros.fit_resample(X_train,Y_train)
X.shape, Y.shape

In [None]:
# Standardize the features for stable and fast training. The scaler is fitted
# on the oversampled training features only and its statistics are reused for
# the validation features.
scaler = StandardScaler() 
X = scaler.fit_transform(X) 
X_val = scaler.transform(X_val) 

In [None]:
# Fit each candidate model and report its score on the training and validation
# data. The metric is roc_auc_score computed on the predicted labels, so it is
# printed as ROC AUC rather than as accuracy.
models = [LogisticRegression(), XGBClassifier(), SVC(kernel='rbf')]

for model in models:
    model.fit(X, Y)

    train_auc = metrics.roc_auc_score(Y, model.predict(X))
    val_auc = metrics.roc_auc_score(Y_val, model.predict(X_val))

    print(f'{model} : ')
    print('Training ROC AUC : ', train_auc)
    print('Validation ROC AUC : ', val_auc)
    print()

In [None]:
# Refit each model and show, per model: the label-based ROC AUC on the training
# and validation data, validation accuracy, the classification report, the
# confusion matrix and (when probabilities are available) the ROC curve.
models = [LogisticRegression(), XGBClassifier(), SVC(kernel='rbf', probability=True)]
model_names = ['Logistic Regression', 'XGBoost Classifier', 'SVC']

for model, name in zip(models, model_names):
    model.fit(X, Y)

    # Predict once and reuse the results for every metric and plot below.
    y_pred_train = model.predict(X)
    y_pred_val = model.predict(X_val)

    # roc_auc_score on hard labels, hence labelled ROC AUC and not accuracy.
    train_auc = roc_auc_score(Y, y_pred_train)
    val_auc = roc_auc_score(Y_val, y_pred_val)
    accuracy = accuracy_score(Y_val, y_pred_val)
    class_report = classification_report(Y_val, y_pred_val)

    print(f'{name} : ')
    print(f'Training ROC AUC: {train_auc:.2f}')
    print(f'Validation ROC AUC: {val_auc:.2f}')
    print(f'Validation Accuracy: {accuracy:.2f}')
    print(class_report)
    print()

    cm = confusion_matrix(Y_val, y_pred_val)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    plt.figure(figsize=(12, 8))

    plt.suptitle(f'{name}', fontsize=16)

    # Top row: text panel with the headline scores.
    plt.subplot(3, 1, 1)
    plt.axis('off')
    plt.text(0.5, 0.5, f'Training ROC AUC: {train_auc:.2f}\nValidation ROC AUC: {val_auc:.2f}',
             ha='center', va='center', fontsize=12, bbox=dict(facecolor='white', alpha=0.8))

    # Middle-left: confusion matrix on the validation data.
    plt.subplot(3, 2, 3)
    disp.plot(cmap='Blues', ax=plt.gca())
    plt.title(f'Confusion Matrix for {name}')

    # Middle-right: ROC curve from predicted probabilities of the positive class.
    if hasattr(model, "predict_proba"):
        y_proba_val = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, y_proba_val)
        plt.subplot(3, 2, 4)
        plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc_score(Y_val, y_proba_val):.2f})')
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve for {name}')
        plt.legend(loc='best')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to include suptitle
    plt.show()


**XGBoost Classifier reaches a training score of 1.0 (the metric computed above is ROC AUC on the predicted labels), which suggests overfitting: the model has learned the specific details of the training data excessively, which reduces its performance on the validation set.**

**Logistic Regression and SVC show more balanced performance between the training set and the validation set. On the validation set, SVC (0.804) slightly outperforms Logistic Regression (0.782).**

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column names of the test frame.
test.columns

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Number of missing values per test column.
test.isnull().sum()

In [None]:
# Summary statistics of the test frame, transposed so each column is a row.
test.describe().T

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Apply the same value normalization as for the training frame: yes/no become
# 1/0 and the '?' / 'others' placeholders are merged into 'Others'.
value_map = {'yes':1, 'no':0, '?':'Others', 'others':'Others'}
test = test.replace(value_map)

In [None]:
# Partition the test columns by dtype. This rebinds the `ints`, `objects` and
# `floats` lists that were built for the training frame earlier.
ints = []
objects = []
floats = []

for col in test.columns:
    col_dtype = test[col].dtype
    # is_integer_dtype matches every integer width, whereas `dtype == int`
    # only matches the platform's default integer type.
    if pd.api.types.is_integer_dtype(col_dtype):
        ints.append(col)
    elif col_dtype == object:
        objects.append(col)
    else:
        floats.append(col)

In [None]:
# Test columns classified as integer-typed.
ints

In [None]:
# Test columns classified as object-typed.
objects

In [None]:
# Test columns that were neither integer- nor object-typed.
floats

In [None]:
# Preview the test frame after value normalization.
test.head()

In [None]:
# Encode the test categoricals with the *training* category-to-code mapping.
# Fitting fresh encoders on the test frame would derive the codes from the test
# set's own sorted categories, so the same string could get a different integer
# than the one the models were trained on.
string_columns = ['gender','ethnicity', 'contry_of_res', 'age_desc','relation']

# `df` was label-encoded in place, so its original strings are gone. Rebuild
# them by replaying the training cleaning steps on a fresh read of train.csv.
train_raw = pd.read_csv("/kaggle/input/autismprediction/train.csv")
train_raw = train_raw.replace({'yes':1, 'no':0, '?':'Others', 'others':'Others'})
train_raw = train_raw[train_raw['result'] > -5].copy()
train_raw['ageGroup'] = train_raw['age'].apply(convertAge)

# 'ageGroup' is a training feature derived from the raw age; derive it for the
# test rows too so it is encoded like the other categoricals.
test['ageGroup'] = test['age'].apply(convertAge)

label_encoders = {}

for col in string_columns + ['ageGroup']:
    le = LabelEncoder()
    le.fit(train_raw[col])
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Categories never seen in training fall back to the 'Others' code when the
    # column has one, otherwise to the sentinel -1 (LabelEncoder.transform
    # would raise on them).
    unseen_code = code_of.get('Others', -1)
    test[col] = test[col].map(code_of).fillna(unseen_code).astype(int)
    label_encoders[col] = le

test.head()

In [None]:
# Re-create the numeric engineered training features on the test frame before
# the columns they depend on are dropped: 'sum_score' and 'ind' via
# add_feature, and the same log transform of 'age' used for training.
test = add_feature(test)
test['age'] = np.log(test['age'])

# Drop the columns that were excluded from the training features.
test = test.drop(removal, axis=1)

In [None]:
# Add every training feature column that the test frame lacks, filled with the
# training-split mean of that column.
# NOTE(review): such a column is the same constant for every test row, so any
# feature that was engineered only on the training frame carries no information
# at prediction time — computing it on `test` would be preferable.
missing_cols = set(X_train.columns) - set(test.columns) 
for c in missing_cols: 
    test[c] = X_train[c].mean()

In [None]:
# Keep exactly the training feature columns, in the training column order.
test = test[X_train.columns]

In [None]:
# Fit the final RBF-kernel SVC on the oversampled, standardized training data.
# Unlike the comparison loops above, .score() here reports mean accuracy.
svc_model = SVC(kernel='rbf', probability=True) 
svc_model.fit(X, Y) 
print('Training Accuracy : ', svc_model.score(X, Y)) 
print('Validation Accuracy : ', svc_model.score(X_val, Y_val))  

In [None]:
# The SVC was fitted on standardized features, so the test features must pass
# through the same fitted scaler before prediction.
test_scaled = scaler.transform(test)
pred = svc_model.predict(test_scaled)
print('Test Predictions : ', pred)

# Saving model

In [None]:
# Persist the fitted SVC to disk.
# NOTE(review): the fitted `scaler` is not saved with it, although the model was
# trained on standardized features — whoever loads this file must reproduce the
# same standardization before calling predict.
import joblib
joblib.dump(svc_model, 'svc_model.pkl')

In [None]:
# Reload the persisted model and check that it still predicts. The model was
# trained on standardized features, so the test features go through the fitted
# scaler first.
loaded_model = joblib.load('svc_model.pkl')

pred = loaded_model.predict(scaler.transform(test))
print('Test Predictions : ', pred)