## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

# Notebook-wide display configuration.
sns.set_style("darkgrid")
pd.options.display.max_columns = None  # show every column when a frame is rendered
pd.set_option("plotting.backend", "plotly")  # DataFrame/Series .plot() now returns plotly figures

## Reading Data

In [None]:
# Load the screening records from the CSV next to the notebook and preview them.
df = pd.read_csv(filepath_or_buffer="Autism_Data.csv")
df.head(n=5)

## Features Description

|Feature | Description
|------|------------
| index | The participant’s ID number
|AX_Score|Score based on the Autism Spectrum Quotient (AQ) 10 item screening tool [AQ-10](https://docs.autismresearchcentre.com/tests/AQ10.pdf)
|age|Age in years
|gender| Male or Female
|ethnicity|Ethnicities in text form
|jaundice|Whether or not the participant was born with jaundice?
|autism|Whether or not anyone in the immediate family has been diagnosed with autism
|contry_of_res|Country of residence in text format (the column name is spelled `contry_of_res` in the data)
|used_app_before|Whether the participant has used a screening app
|result|Score from the AQ-10 screening tool
|age_desc|Age as categorical
|relation|Relation of person who completed the test
|Class/ASD|Participant classification


## Meta information of dataframe

In [None]:
# (rows, columns) of the raw frame.
frame_shape = df.shape
print(f'Shape of dataframe is: {frame_shape}')

In [None]:
# One row per column with the dtype pandas inferred for it.
df.dtypes.to_frame(name="Data Type")

## Statistical information of data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Checking for NaN values

In [None]:
# Count NaN cells per column and render the counts with an in-cell bar.
missing_per_column = df.isna().sum().to_frame(name="Missing Values")
missing_per_column.style.bar(color="#84A9AC")

**Only `age` feature has 2 missing values**

In [None]:
# Range check on `age`: an implausible extreme points to a data-entry error.
# (Message wording fixed: "in data", not "is data".)
print(f"Maximum age in data: {df['age'].max()}\n")

print(f"Minimum age in data: {df['age'].min()}")

**An age of 383 is an outlier, so that record is dropped**

**Record which has maximum value in Age**

In [None]:
# Show every record whose age equals the column maximum.
oldest_age = df['age'].max()
df.loc[df['age'].eq(oldest_age)]

In [None]:
# Drop records whose age is not humanly plausible (the 383-year entry found above).
# Selecting by value instead of a hard-coded row label keeps the cell correct if
# the CSV is re-ordered, and makes the drop a no-op when the cell is re-run.
MAX_PLAUSIBLE_AGE = 120
# to_numeric(..., errors='coerce') makes the comparison safe even if `age` was
# read as text; NaN / unparsable ages compare False and are kept for imputation.
implausible_age = pd.to_numeric(df['age'], errors='coerce') > MAX_PLAUSIBLE_AGE
df.drop(index=df.index[implausible_age], inplace=True)

# Renumber the rows. The previous labels are kept as a new `index` column,
# which is removed again when the unwanted columns are dropped before modelling.
df.reset_index(inplace = True)

## Imputing NaN values in age with mean of age

In [None]:
# Impute missing ages with the rounded mean age.
# Only the `age` column is touched here: replacing '?' across the whole frame
# would turn the '?' markers in `ethnicity` and `relation` into NaN, so the
# dedicated clean-up cells for those columns would silently do nothing.
age = pd.to_numeric(df['age'].replace('?', np.nan))

mean_age = np.round(age.mean(), 0)  # mean() skips NaN
df['age'] = age.fillna(mean_age).astype(int)

In [None]:
# Re-count NaN cells per column after the age imputation.
df.isna().sum().to_frame(name="Missing Values")

**Missing values are now imputed**

### Checking unique values in Categorical feature

In [None]:
# Print the distinct values of every object-typed (text) column to spot
# placeholder or inconsistently spelled categories.
categorical_columns = df.select_dtypes(include='object').columns
for name in categorical_columns:
    print("-------------------------------")
    print(f'Column name: {name}\n')
    print(f'Unique values:\n{df[name].unique()}\n\n')

**`ethnicity` and `relation` contain some invalid values**

### Replacing invalid value in ethnicity

In [None]:
# Map unknown ethnicity to 'Others'. Both the raw '?' marker and NaN are treated
# as unknown, so the cell still works if '?' was already converted to NaN earlier.
df['ethnicity'] = df['ethnicity'].replace('?', np.nan).fillna('Others')

**In the `ethnicity` feature, "Others" and "others" mean the same thing but are treated as different categories because one is capitalized and the other is lowercase. Fixing this.**

In [None]:
# Merge the lowercase spelling into the capitalized category.
ethnicity_spelling_fix = {'others': 'Others'}
df['ethnicity'] = df['ethnicity'].replace(ethnicity_spelling_fix)

**Checking unique values in `ethnicity` after fixing problems**

In [None]:
# Inspect the distinct ethnicity labels that remain after the clean-up above.
df['ethnicity'].unique()

### Replacing invalid value in relation

**Replacing `?` in `relation` with mode of relation**

In [None]:
# Impute unknown `relation` with the most frequent real category.
# Both the raw '?' marker and NaN count as unknown; converting '?' to NaN first
# also keeps '?' itself from being picked as the mode (mode() skips NaN).
relation = df['relation'].replace('?', np.nan)
df['relation'] = relation.fillna(relation.mode()[0])

**Checking unique values in `relation` after fixing problems**

In [None]:
# Inspect the distinct relation labels that remain after the clean-up above.
df['relation'].unique()

## Visualization

### Counts of Males to Females in dataset

In [None]:
# Bar chart of the number of participants per gender.
fig = px.histogram(
    data_frame=df,
    x="gender",
    color_discrete_sequence=["#84A9AC"],
    template="plotly_dark",
)

fig.update_layout(
    title=dict(
        text="<b>Counts of Male and Female</b>",
        x=0.5,  # centre the title horizontally
        font=dict(size=20),
    ),
    uniformtext_minsize=15,
)

fig.show()

In [None]:
# Donut chart of the gender split, labelled with category and percentage.
fig = px.pie(
    data_frame=df,
    names="gender",
    hole=0.5,
    title="<b>Counts of Male and Female</b>",
    template="plotly_dark",
)

fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    marker_line=dict(color='#000000', width=1.5),  # thin black outline per slice
)

fig.update_layout(
    title_x=0.5,
    title_font_size=20,
    uniformtext_minsize=15,
)

fig.show()

### Autism Spectrum Disorder Counts

In [None]:
# Number of records per target class.
df['Class/ASD'].value_counts()

In [None]:
# Pie chart of the target class balance, labelled with class and percentage.
fig = px.pie(
    data_frame=df,
    names="Class/ASD",
    title="<b>Autism Spectrum Disorder Counts</b>",
    template="plotly_dark",
)

fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    marker_line=dict(color='#000000', width=1.5),  # thin black outline per slice
)

fig.update_layout(
    title_x=0.5,
    title_font_size=20,
    uniformtext_minsize=15,
)

fig.show()

**27% of the people in the dataset are ASD patients**

### Average age of males and females

In [None]:
# Mean age per gender, drawn through the pandas plotting backend
# (a plotly figure is returned, so plotly keyword arguments apply).
mean_age_by_gender = df.groupby('gender')[['age']].mean()

fig = mean_age_by_gender.plot(
    kind='bar',
    template="seaborn",
    color_discrete_sequence=["#84b1b5"],
    labels={"gender": "Gender", "value": "Age"},
)

fig.update_layout(title_text="<b>Average Age</b>\n", title_font_size=20)

fig.show()

### Gender Counts of ASD Patients

In [None]:
# Gender distribution among the records classified as ASD ("YES").
asd_gender_counts = df.loc[df['Class/ASD'] == "YES", 'gender'].value_counts()

# The x-axis label is keyed by the index name: unnamed ("index") on pandas 1.x,
# "gender" on pandas >= 2.0 where value_counts() names the index after the column.
# Both keys are supplied so the axis reads "Gender" on either version.
fig = asd_gender_counts.plot(
    kind='bar',
    template="seaborn",
    color_discrete_sequence=["#84b1b5"],
    labels={"index": "Gender", "gender": "Gender", "value": "Counts"},
)

fig.update_layout(title = "<b>Gender Count of ASD Patients</b>\n",
                  title_font = dict(size = 20), width = 900)

fig.show()

### Counts of ASD Patients Country Wise

In [None]:
# Number of ASD-positive ("YES") records per country of residence, as a
# one-column frame named `ASD_Patient_Counts`, indexed by country.
# Naming the Series directly (instead of renaming a column called
# `contry_of_res`) is required on pandas >= 2.0, where value_counts() names its
# result "count"; the old column rename silently did nothing there and the
# `ASD_Patient_Counts` column was missing downstream.
asd_patients_country_wise = (
    df.loc[df['Class/ASD'] == "YES", 'contry_of_res']
    .value_counts()
    .rename("ASD_Patient_Counts")
    .rename_axis(None)  # unnamed index on every pandas version
    .to_frame()
)

In [None]:
# Render the per-country counts as a table with an in-cell bar.
asd_patients_country_wise.style.bar(color="#84A9AC")

In [None]:
# The index holds the country names that the charts below use as categories.
asd_patients_country_wise.index

In [None]:
# Bar chart: one bar per country, height = number of ASD-positive records.
fig = px.bar(
    asd_patients_country_wise,
    x=asd_patients_country_wise.index,
    y="ASD_Patient_Counts",
    labels={"index": "Country"},
    color_discrete_sequence=px.colors.qualitative.D3_r,
    template='plotly_dark',
)

# Slant the country names so long labels do not overlap.
fig.update_xaxes(tickangle=310)

fig.update_layout(
    title=dict(
        text="<b>Counts of ASD Patients Country Wise</b>",
        x=0.5,
        y=0.93,
        xanchor='center',
        yanchor='top',
    )
)

fig.show()


In [None]:
# Pie chart of the same per-country counts, labelled with country and percentage.
country_slices = go.Pie(
    labels=asd_patients_country_wise.index,
    values=asd_patients_country_wise['ASD_Patient_Counts'],
    rotation=90,
)
fig = go.Figure(data=[country_slices])

fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    marker_line=dict(color='#000000', width=1.5),  # thin black outline per slice
)

fig.update_layout(
    title=dict(
        text='<b>Country Wise Counts of ASD Patients</b>',
        x=0.5,
        font=dict(size=20),
    ),
    uniformtext_minsize=15,
    template='plotly_dark',
)

fig.show()

## Preprocessing data for Classification model

### Dropping Unwanted columns

In [None]:
# Remove columns that are not used as model inputs: the old row labels kept by
# reset_index (`index`), the categorical age description and the country.
# NOTE: in-place, so this cell raises KeyError if it is run a second time.
unused_columns = ['index', 'age_desc', "contry_of_res"]
df.drop(columns=unused_columns, inplace=True)

### Splitting data in X and Y

In [None]:
# Target: the ASD classification label. Predictors: every remaining column.
# NOTE(review): `result` (the AQ-10 total per the feature table) stays among the
# predictors; if the class label is derived from it this is target leakage — confirm.
Y = df['Class/ASD']
X = df.drop(columns="Class/ASD")

### OneHotEncoding of Categorical features

In [51]:
# One-hot encode the categorical predictors and the target.
# pandas >= 2.0 emits boolean dummy columns; mixed with the integer features the
# frame converts to an object array that Keras cannot turn into a tensor, so the
# predictors are cast to a single float dtype explicitly.
X = pd.get_dummies(X).astype("float32")
# One 0/1 integer column per class label (e.g. `YES`), matching the 2-unit
# output layer and the integer predictions produced by argmax later on.
Y = pd.get_dummies(Y).astype(int)

### Splitting data in train and test

In [None]:
# Hold out 25% of the records for evaluation.
# A fixed seed makes the split (and every metric reported below) reproducible
# across kernel restarts; stratifying keeps the class ratio of the imbalanced
# target the same in the train and test parts.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.25,
    random_state = SEED,
    stratify = Y,
)

In [None]:
# Report the (rows, columns) of each part of the split in one call.
print(
    f"Shape of X_train is: {X_train.shape}\n"
    f"Shape of Y_train is: {Y_train.shape}\n\n"
    f"Shape of X_test is: {X_test.shape}\n"
    f"Shape of Y_test is: {Y_test.shape}"
)

## Creating ANN Model

In [None]:
# Width of the network input = number of encoded feature columns.
input_dim = len(X.columns)

In [None]:
# Small fully-connected classifier: input -> 8 -> 5 -> 2.
# NOTE(review): the output layer has two independent sigmoid units (one per
# one-hot class column), not a softmax, so the two outputs need not sum to 1.
model = Sequential([
    Dense(8, input_dim=input_dim, activation='relu', kernel_initializer='normal'),
    Dense(5, activation="relu", kernel_initializer='normal'),
    Dense(2, activation='sigmoid'),
])

### Compiling Model

In [None]:
# Configure training: Adam optimizer, binary cross-entropy on the two one-hot
# output columns, accuracy tracked during training and evaluation.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

### Checking summary of Model

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

### Visualizing Model Layers

In [None]:
# Draw the layer graph, annotated with layer names, tensor shapes and dtypes.
diagram_options = dict(show_shapes=True, show_dtype=True, show_layer_names=True)
plot_model(model, **diagram_options)

### Training Model

In [None]:
# Train for 20 passes over the training split in mini-batches of 10 records.
# `result.history` holds the per-epoch metrics read by the cells below.
result = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=10,
    epochs=20,
)

## Visualize Model Accuracy & Loss

In [None]:
# Per-epoch training metrics recorded by fit(), plus 1-based epoch numbers.
history = result.history
acc, loss = history['accuracy'], history['loss']

epoch = list(range(1, len(acc) + 1))

In [None]:
# Collect the training curves in one frame and show them with in-cell bars
# (bars only on the two metric columns, not on the epoch counter).
acc_loss_df = pd.DataFrame(dict(Accuracy=acc, Loss=loss, Epoch=epoch))

metric_columns = ['Accuracy', 'Loss']
acc_loss_df.style.bar(subset=metric_columns, color='#84A9AC')

In [None]:
# Line chart of training accuracy and training loss against the epoch number.
fig = go.Figure()

# One line-and-marker trace per metric column, in legend order.
for metric in ('Accuracy', 'Loss'):
    fig.add_trace(
        go.Scatter(
            x=acc_loss_df['Epoch'],
            y=acc_loss_df[metric],
            mode='lines+markers',
            name=metric,
        )
    )

fig.update_layout(
    title=dict(
        text="<b>Training Accuracy Vs Training Loss</b>\n",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    xaxis_title="Epoch",
    yaxis_title="Accuracy / Loss",
    template='plotly_dark',
)

fig.show()

## Evaluating Model

In [None]:
# Loss and accuracy on the held-out split.
# NOTE: this rebinds `loss` and `acc`, which until now held the per-epoch
# training lists; re-running the training-history cells afterwards needs those
# lists to be rebuilt first.
loss, acc = model.evaluate(x=X_test, y=Y_test)

In [None]:
# Report the held-out metrics rounded to two decimals.
rounded_accuracy = np.round(acc, 2)
rounded_loss = np.round(loss, 2)
print(f"Accuracy on unseen data is: { rounded_accuracy }")
print(f'Loss on unseen data is: { rounded_loss }')

## Classification Report

In [None]:
# Score the test split, then keep the position of the larger of the two outputs
# per record as the predicted class.
# NOTE(review): position 1 is compared against the `YES` column below, which
# assumes the one-hot target columns are ordered [NO, YES] — confirm.
class_scores = model.predict(X_test)
prediction = class_scores.argmax(axis=1)

In [None]:
# Share of test records whose predicted class matches the `YES` indicator column.
actual_asd = Y_test[['YES']]
test_accuracy_score = accuracy_score(actual_asd, prediction)
print(test_accuracy_score)

In [None]:
# Per-class precision, recall, F1 and support on the test split.
actual_asd = Y_test[['YES']]
report_text = classification_report(actual_asd, prediction)
print(report_text)