># Test dataset machine learning

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

># Coding starts

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("datasets/autism.csv")

># EDA

In [None]:
# Structural overview: dimensions, row index, column labels and the number
# of columns, followed by a preview of the first rows.
for overview_item in (data.shape, data.index, data.columns, len(data.columns)):
    print(overview_item)
data.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

># Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overview figure: a 2x2 grid with one panel per column of interest.
# Expects the DataFrame to be available under the name 'data'.
fig, axs = plt.subplots(2, 2)

# The three categorical panels follow the same recipe, so they are described
# as (axes, column, colour, title, x-label) and drawn in a single loop.
bar_panels = [
    (axs[0, 0], 'Gender', 'blue', 'Gender Distribution', 'Gender'),
    (axs[0, 1], 'Fluency on regular task?', 'green',
     'Fluency on regular task?', 'Fluency on regular task?'),
    (axs[1, 1], 'Autism?', 'orange', 'Autism Diagnosis', 'Autism'),
]
for panel, column, colour, title, x_label in bar_panels:
    counts = data[column].value_counts()
    panel.bar(counts.index, counts.values, color=colour)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

# Age gets a histogram instead of a bar chart; missing ages are left out.
age_panel = axs[1, 0]
age_panel.hist(data['Age'].dropna(), bins=20, color='red')
age_panel.set_title('Age Distribution')
age_panel.set_xlabel('Age')
age_panel.set_ylabel('Frequency')

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()

plt.show()


In [None]:
# Pie charts with the share of each category in 'Gender' and 'Autism?'.
# Slice labels come from the value_counts() index, so every label is
# guaranteed to belong to its slice. value_counts() orders categories by
# frequency, which means a hard-coded label list can end up attached to the
# wrong slice (and fails outright if a column has more than two categories).
gender_share = data['Gender'].value_counts()
autism_share = data['Autism?'].value_counts()

plt.subplot(1, 2, 1)
plt.pie(gender_share, autopct='%1.1f%%', labels=gender_share.index)
plt.title("Gender")
plt.subplot(1, 2, 2)
plt.pie(autism_share, autopct='%1.1f%%', labels=autism_share.index)
plt.title("Autism?")

plt.tight_layout()
plt.show()

># Preprocessing

>## Missing values

In [None]:
# Count the null entries per column before any imputation takes place.
missing_before = data.isnull().sum()
print("Missing values before handling:")
print(missing_before)

In [None]:
# Values used for imputation: the average age and the most frequent answer
# to the facial-expression / body-language question.
mean = data["Age"].mean()
mod = data["Understand facial expressions and body language?"].mode()[0]
print(mean, mod, sep="\n")

In [None]:
# Fill the gaps with the mean age / most frequent answer computed earlier.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data[...]: that chained in-place form operates
# on an intermediate object, is deprecated in pandas 2.x and leaves 'data'
# untouched when Copy-on-Write is enabled.
data["Age"] = data["Age"].fillna(mean)
data["Understand facial expressions and body language?"] = (
    data["Understand facial expressions and body language?"].fillna(mod)
)

In [None]:
# Re-count the null entries per column to confirm the imputation result.
missing_after = data.isnull().sum()
print("Missing values after handling:")
print(missing_after)

>## Outliers handled

In [None]:
# Box plot of Age per Gender, drawn before out-of-range ages are replaced.
sns.boxplot(x="Age", y="Gender", data=data)

In [None]:
# Summary statistics of Age (count, mean, std, min, quartiles, max).
data["Age"].describe()

In [None]:
# Ages below 7 or above 20 are treated as outliers and replaced by the mean
# age of the remaining rows.
outliers = (data['Age'] < 7) | (data['Age'] > 20)

mean_age = data.loc[~outliers, 'Age'].mean()  # Calculate mean excluding outliers
# Report the value that is actually written back (the earlier overall 'mean'
# still includes the outliers and is not what gets assigned).
print(mean_age)
data.loc[outliers, 'Age'] = mean_age

In [None]:
# Same box plot again, now that out-of-range ages have been replaced.
sns.boxplot(x="Age", y="Gender", data=data)

>## Drop unnecessary columns

In [None]:
# Column labels and their number before dropping anything.
print(data.columns, len(data.columns), sep="\n")

In [None]:
# Columns removed before modelling.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
columns_to_drop = [
    'Name',
    'Parent',
    'Address',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# Column labels and their number after the drop, for comparison.
remaining_columns = data.columns
print(remaining_columns)
print(len(remaining_columns))

>## Label encoding

In [None]:
# Inspect the categorical columns before encoding them. Every column gets a
# header line so each pair of lists can be attributed to its column.
# Note: unique() lists values in order of appearance while value_counts()
# sorts by frequency, so the two lists are not position-aligned.
categorical_columns = [
    "Gender",
    "Understand facial expressions and body language?",
    "Autism?",
    "Physical problem?",
]
for column in categorical_columns:
    print(column)
    print(data[column].unique().tolist())
    print(data[column].value_counts().tolist())

In [None]:
# One encoder instance, re-fitted for every column encoded below; after each
# fit it only holds the classes of the most recently encoded column.
label_encoder = LabelEncoder()

In [None]:
# Replace the string categories of these columns with integer codes.
# The encoder is re-fitted per column, so each column gets its own mapping.
for column_name in ("Gender", "Autism?", "Physical problem?"):
    data[column_name] = label_encoder.fit_transform(data[column_name])

In [None]:
# Preview the first rows after label encoding.
data.head()

In [None]:
# One-hot encode the multi-valued answer column. With an empty prefix and
# separator the new columns are named after the category values themselves.
data = pd.get_dummies(data, columns=['Understand facial expressions and body language?'], prefix='', prefix_sep='')

In [None]:
# Preview the first rows after one-hot encoding.
data.head()

In [None]:
# Turn the indicator columns produced by get_dummies into integer codes.
for indicator_column in ("body", "both", "face"):
    data[indicator_column] = label_encoder.fit_transform(data[indicator_column])

# The "none" indicator is removed; presumably it is redundant once the other
# three indicators are known (verify against the category list).
data.drop(columns="none", inplace=True)

In [None]:
# Preview the fully encoded frame.
data.head()

In [None]:
# Separate the predictors from the label to be predicted.
X = data.drop("Autism?", axis=1)  # feature matrix: every column except the target
y = data["Autism?"]  # target vector

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale all columns
X = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split


# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of the target in the train and the test split.
plt.figure()
target_panels = [
    (y_train, 'blue', 'Distribution of y_train'),
    (y_test, 'red', 'Distribution of y_test'),
]
for position, (values, colour, title) in enumerate(target_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(values, bins=20, color=colour, alpha=0.7)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Same comparison for a single feature column of the split matrices.
feature_index = 0  # Change this index to visualize different features
plt.figure()
feature_panels = [
    (X_train, 'blue', 'Distribution of X_train'),
    (X_test, 'red', 'Distribution of X_test'),
]
for position, (matrix, colour, title) in enumerate(feature_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(matrix[:, feature_index], bins=20, color=colour, alpha=0.7)
    plt.title(title)
    plt.xlabel('Feature Value')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


># Classification

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the SVM classifier.
# probability=True enables predict_proba, which the ROC analysis relies on.
# The probability estimates are produced with an internal, randomized
# cross-validation, so random_state is fixed to keep them reproducible.
model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall, F1 and support
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt

# Function to plot ROC curves for each class
def plot_roc_curves_for_classes(model, X, y):
    """Plot one-vs-rest ROC curves from cross-validated class probabilities.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``predict_proba``; it is evaluated with 5-fold
        ``cross_val_predict`` on ``X`` / ``y``.
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``X``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    classes = np.unique(y)

    # Binarize the target variable (one indicator column per class)
    y_bin = label_binarize(y, classes=classes)
    # For two-class problems label_binarize returns a single column holding
    # the indicator of the second class. Expand it to one column per class so
    # that column i lines up with column i of predict_proba.
    if len(classes) == 2 and y_bin.shape[1] == 1:
        y_bin = np.hstack([1 - y_bin, y_bin])

    # Perform cross-validated predictions for probabilities of each class
    y_scores_cv = cross_val_predict(model, X, y, cv=5, method='predict_proba')

    # For each class, compute ROC curve and AUC
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_bin[:, i], y_scores_cv[:, i])
        roc_auc = auc(fpr, tpr)
        # x = false positive rate, y = true positive rate, matching the axis
        # labels set below.
        plt.plot(fpr, tpr, lw=2, label='Class {} (AUC = {:.2f})'.format(class_label, roc_auc))

    # Plot diagonal line (random classifier)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Set plot parameters
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves for each class')
    plt.legend(loc="lower right")
    plt.show()

# Plot the ROC curves using cross-validated probabilities on the training
# split only; the held-out test split is not involved here.
plot_roc_curves_for_classes(model, X_train, y_train)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``labels`` fixes both the order of the rows/columns of the matrix and the
    tick labels shown on the two axes. The figure is shown immediately.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    heatmap_ax = sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('Actual')
    heatmap_ax.set_title('Confusion Matrix')
    plt.show()

# Plot distribution of actual vs predicted values
def plot_actual_vs_predicted(y_true, y_pred, labels):
    """Overlay histograms of the actual and the predicted class labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    labels : list
        Class values; used as the x-axis tick positions.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    sns.histplot(y_true, label='Actual', color='blue', kde=True, alpha=0.5)
    sns.histplot(y_pred, label='Predicted', color='orange', kde=True, alpha=0.5)
    # The x-axis carries the class labels being compared.
    plt.xlabel('Class label')
    plt.ylabel('Count')
    plt.title('Actual vs Predicted')
    plt.legend()
    # Set tick positions and rotation in one call so the rotation applies to
    # the ticks that are actually displayed.
    plt.xticks(labels, rotation=45)
    plt.show()

# Both evaluation plots use the same class labels, taken from the encoded
# target column in order of first appearance.
class_labels = data["Autism?"].unique().tolist()

# Confusion matrix of the test-set predictions
plot_confusion_matrix(y_test, y_pred, labels=class_labels)

# Distribution of actual vs predicted test-set labels
plot_actual_vs_predicted(y_test, y_pred, labels=class_labels)