In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

First, let's load the data and take a look at it.

In [None]:
# Load the adult autism screening dataset from the Kaggle input directory.
data = pd.read_csv('../input/autism-screening-on-adults/autism_screening.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
data.info()
# Show the first rows instead of print(data.info): that printed the repr of
# the bound method, which dumps the whole frame as plain text.
data.head()

I've noticed that missing values are marked with question marks instead of NaN, so we will deal with that.
Also, I will drop a few columns that I consider unimportant from the get-go and rename others, for simplicity's sake.

In [None]:
# Corrected spellings / simplified names for the columns that are kept.
renamed_columns = {
    'Class/ASD': 'classASD',
    'austim': 'autism',
    'contry_of_res': 'country_of_res',
}
# '?' marks a missing value in this file: convert it to NaN, discard the two
# columns that are not needed, then apply the renames in one chain.
data = (
    data
    .replace('?', np.nan)
    .drop(columns=['used_app_before', 'age_desc'])
    .rename(columns=renamed_columns)
)
data.info()

Now let's take a look at the results of the test.

In [None]:
# Summary statistics of the screening score, followed by its histogram.
result_scores = data['result']
print(result_scores.describe())
sns.displot(result_scores, bins=50, kde=False)

As we can see, all values of "result" are between 0 and 10. Each one is the sum of the scores from A1 to A10.

We know that this dataset covers adults. Let's take a look at their age distribution.

In [None]:
# Summary statistics of the respondents' age, followed by its histogram.
ages = data['age']
print(ages.describe())
sns.displot(ages, bins=50, kde=False)

It's hard to believe that someone would be 383 years old and still alive, so I will treat it as a typo and change it to 38.
Also, two age values are missing, and I would like to change the "result" and "age" data types to integer, so I will fill the blanks with the most frequent age (the mode).

In [None]:
# An age of 383 is treated as a typo for 38.
data.loc[data['age'] == 383, 'age'] = 38
# Fill missing ages with the most frequent age. The result is assigned back to
# the column rather than calling fillna(inplace=True) on data['age']: that is
# chained in-place assignment, which emits a FutureWarning on pandas 2.x and
# leaves `data` unchanged once Copy-on-Write is active.
data['age'] = data['age'].fillna(data['age'].mode()[0])

Now I will change the data types and fill all other blanks with the most frequent value of their column.
There aren't that many missing values, so it won't hurt our models much.

In [None]:
# Cast both numeric columns to integers in a single call.
data = data.astype({'age': int, 'result': int})
# The first row of mode() holds the most frequent value of every column;
# use it to fill whatever gaps remain.
most_frequent_values = data.mode().iloc[0]
data = data.fillna(most_frequent_values)
data.info()
data.head()

As I mentioned earlier, "result" is the sum of the scores from A1 to A10, so we don't really need the individual score columns.

In [None]:
# Names of the ten per-question score columns: A1_Score ... A10_Score.
item_score_columns = [f'A{question}_Score' for question in range(1, 11)]
data = data.drop(columns=item_score_columns)

I will try to predict whether someone is classified as being on the autism spectrum, which is our "classASD" column.
Let's look at some data about gender, ethnicity and nationality.

In [None]:
# Class counts split by gender, drawn on an explicitly created 15x15 axes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.countplot(data=data, x='classASD', hue='gender', ax=ax)
plt.show()

It looks like women are classified with ASD somewhat more often, but not by a whole lot. Let's look at ethnicities next.

In [None]:
# Class counts split by ethnicity, drawn on an explicitly created 15x15 axes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.countplot(data=data, x='classASD', hue='ethnicity', ax=ax)
plt.show()

It looks like White-European adults are visibly more often classified with ASD.
Now let's take a look at nationality.

In [None]:
# Class counts split by country of residence; the many hue levels need a
# larger canvas than the previous plots.
plt.figure(figsize = (30, 20))
sns.countplot(x = 'classASD', hue = 'country_of_res', data = data)
# Render explicitly, as the other plotting cells do, so the cell shows the
# figure rather than the repr of the returned Axes object.
plt.show()

As we can see, most cases are in the USA and the UK. But are those two observations real correlations or pure coincidence?
We will take a look at that now.
First, I will convert all strings to integers (a unique integer for each possible string) using LabelEncoder.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert to integer codes. `age` is deliberately left out: it is
# already an integer column, and label-encoding it would replace the real ages
# with rank codes (0, 1, 2, ...), discarding the gaps between ages.
categorical_columns = ['ethnicity', 'jundice', 'autism', 'country_of_res',
                       'relation', 'classASD', 'gender']

# Keep one fitted encoder per column so the integer codes can later be mapped
# back to the original labels with inverse_transform().
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le


Let's take a look at how our data looks now.

In [None]:
# Display the first rows of the frame in its current state.
data.head()

And now we can create a correlation matrix for all our data.

In [None]:
# Pairwise correlation of every column with every other column.
corrMatrix = data.corr()
# Annotated heatmap: each cell is labelled with its correlation coefficient.
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

"result" and "classASD" have a really strong correlation. Other columns aren't nearly as correlated with "classASD" as "result" is.
I will drop all columns whose correlation with "classASD" is weaker than 0.1.

In [None]:
# Columns removed from the feature set at this step.
columns_to_discard = ['gender', 'country_of_res', 'relation']
data = data.drop(columns=columns_to_discard)

Let's take a look at the classASD histogram. If the classes were not balanced enough, models could learn to always guess one answer, and we want something more sophisticated than that.

In [None]:
# Histogram of the target column, to check the class balance.
data['classASD'].hist()

Looks good enough for me. Now I will separate the data into features (X) and the answer (Y).

In [None]:
# Target: the classASD column; features: every remaining column.
target_column = 'classASD'
Y = data[target_column]
X = data.drop(columns=target_column)

I will start with scaling features.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler and standardize the features in a single step.
# NOTE(review): the scaler is fitted on every row, so the cross-validation
# folds used later share scaling statistics - consider a Pipeline instead.
scaler = StandardScaler()
X = scaler.fit_transform(X)

First model - MLPClassifier from sklearn library.

In [None]:
from sklearn.neural_network import MLPClassifier

# Small network: two hidden layers with 3 and 2 units, Adam solver, L2 penalty
# alpha=1e-3, and a fixed seed so the fit is reproducible.
clf = MLPClassifier(solver='adam', alpha=1e-3, hidden_layer_sizes=(3, 2), random_state=1, max_iter=10000)
# The cell ends here: the extra MLPClassifier(...) line that used to follow was
# a pasted output repr that only built an unused, unfitted estimator.
clf.fit(X, Y)

With alpha = 0.001 and two hidden layers of 3 and 2 neurons the network may seem too small, but it will be more than enough.
I will evaluate the model with 5-fold cross-validation.

In [None]:
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds.
n_folds = 5
# One accuracy value per fold for the MLP classifier.
scores = cross_val_score(clf, X, Y, cv=n_folds)

In [None]:
# Mean and spread of the per-fold accuracies.
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print("%0.4f accuracy with a standard deviation of %0.4f" % (mean_accuracy, accuracy_std))

That's a really good score, maybe a little too good. It is caused by the very strong correlation between "result" and "classASD". Either our model is that good, or it is overfitting.

Now let's try a Keras Sequential model.
Just to be fair, it uses a comparable number of layers and neurons, and the same solver (Adam).

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Same hidden architecture as the scikit-learn MLP: hidden layers of 3 and 2
# ReLU units. The input width is taken from X rather than hard-coded, so the
# model stays valid if the feature set changes.
model = Sequential()
model.add(Dense(3, input_dim=X.shape[1], activation='relu'))
model.add(Dense(2, activation='relu'))
# Binary classification with binary cross-entropy needs a single sigmoid
# output (a probability in [0, 1]); the previous 3-unit ReLU output layer
# matched neither the shape of the 0/1 labels nor the loss.
model.add(Dense(1, activation='sigmoid'))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows so the reported accuracy is measured on data the
# network has not been trained on (previously it was evaluated on its own
# training set). Stratify to keep the class ratio; seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=200, batch_size=10, verbose=0)
loss, accuracy = model.evaluate(X_test, Y_test)
# The value is scaled to a percentage, so print the unit with it.
print('Accuracy: %.4f%%' % (accuracy * 100))

This looks like a more plausible accuracy. Now let's see how a simple logistic regression handles the problem.

In [None]:
from sklearn.utils import check_random_state
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Splitting the data into train and test sets in an 8:2 ratio.

In [None]:
# Unscaled features and target for the logistic-regression experiment.
lx = data.drop(columns=['classASD'])
ly = data['classASD']
# 80/20 split. A fixed random_state makes the reported score reproducible
# across runs, and stratifying keeps the class ratio equal in both parts.
lx_train, lx_test, ly_train, ly_test = train_test_split(
    lx, ly, test_size=0.2, stratify=ly, random_state=1)

In [None]:
# Learn the scaling statistics from the training set only, then apply that
# same transformation to the test set. Re-fitting the scaler on the test set
# would scale both sets differently and leak test statistics into evaluation.
lx_train = scaler.fit_transform(lx_train)
lx_test = scaler.transform(lx_test)
# L1-penalised logistic regression with strong regularisation (C = 0.005).
logReg = LogisticRegression(C = 50. / 10000, penalty='l1', solver='liblinear', tol=0.1)
logReg.fit(lx_train, ly_train)
# Percentage of coefficients the L1 penalty has driven to exactly zero.
sparsity = np.mean(logReg.coef_ == 0) * 100
score = logReg.score(lx_test, ly_test)

# Report sparsity as well: it was computed but never shown, and it reveals
# how many features the model actually uses.
print("Sparsity with L1 penalty: %.2f%%" % sparsity)
print("Test score with L1 penalty: %.4f" % score)

It looks like this is almost the same answer as for the Keras model.