# Import data 

Link from UCI repository. You can find the "Autism Screening Adult Data Set" from the following link: https://archive.ics.uci.edu/ml/datasets/Autism%2BScreening%2BAdult. We will be using this data set.

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

In [42]:
filename = "csv_result-Autism-Adult-Data.csv"

In [43]:
df = pd.read_csv(filename)
df.head(10)

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,1,0,0,1,1,0,...,f,White-European,no,no,United States,no,6,18 and more,Self,NO
1,2,1,1,0,1,0,0,0,1,0,...,m,Latino,no,yes,Brazil,no,5,18 and more,Self,NO
2,3,1,1,0,1,1,0,1,1,1,...,m,Latino,yes,yes,Spain,no,8,18 and more,Parent,YES
3,4,1,1,0,1,0,0,1,1,0,...,f,White-European,no,yes,United States,no,6,18 and more,Self,NO
4,5,1,0,0,0,0,0,0,1,0,...,f,?,no,no,Egypt,no,2,18 and more,?,NO
5,6,1,1,1,1,1,0,1,1,1,...,m,Others,yes,no,United States,no,9,18 and more,Self,YES
6,7,0,1,0,0,0,0,0,1,0,...,f,Black,no,no,United States,no,2,18 and more,Self,NO
7,8,1,1,1,1,0,0,0,0,1,...,m,White-European,no,no,New Zealand,no,5,18 and more,Parent,NO
8,9,1,1,0,0,1,0,0,1,1,...,m,White-European,no,no,United States,no,6,18 and more,Self,NO
9,10,1,1,1,1,0,1,1,1,1,...,m,Asian,yes,yes,Bahamas,no,8,18 and more,Health care professional,YES


## Identify and handle missing values


## Identify missing values

### Convert "?" to NaN

In this dataset, missing data comes with the question mark "?". We replace "?" with NaN (Not a Number), which is Python's default missing value marker, for reasons of computational speed and convenience.

In [44]:
import numpy as np

# replace "?" to NaN
df.replace("?", np.nan, inplace = True)
df.head(10)

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,1,0,0,1,1,0,...,f,White-European,no,no,United States,no,6,18 and more,Self,NO
1,2,1,1,0,1,0,0,0,1,0,...,m,Latino,no,yes,Brazil,no,5,18 and more,Self,NO
2,3,1,1,0,1,1,0,1,1,1,...,m,Latino,yes,yes,Spain,no,8,18 and more,Parent,YES
3,4,1,1,0,1,0,0,1,1,0,...,f,White-European,no,yes,United States,no,6,18 and more,Self,NO
4,5,1,0,0,0,0,0,0,1,0,...,f,,no,no,Egypt,no,2,18 and more,,NO
5,6,1,1,1,1,1,0,1,1,1,...,m,Others,yes,no,United States,no,9,18 and more,Self,YES
6,7,0,1,0,0,0,0,0,1,0,...,f,Black,no,no,United States,no,2,18 and more,Self,NO
7,8,1,1,1,1,0,0,0,0,1,...,m,White-European,no,no,New Zealand,no,5,18 and more,Parent,NO
8,9,1,1,0,0,1,0,0,1,1,...,m,White-European,no,no,United States,no,6,18 and more,Self,NO
9,10,1,1,1,1,0,1,1,1,1,...,m,Asian,yes,yes,Bahamas,no,8,18 and more,Health care professional,YES


In [45]:
missing_data = df.isnull()
missing_data.head()

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False


### Count missing values in each column

In [46]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")    

id
False    704
Name: id, dtype: int64

A1_Score
False    704
Name: A1_Score, dtype: int64

A2_Score
False    704
Name: A2_Score, dtype: int64

A3_Score
False    704
Name: A3_Score, dtype: int64

A4_Score
False    704
Name: A4_Score, dtype: int64

A5_Score
False    704
Name: A5_Score, dtype: int64

A6_Score
False    704
Name: A6_Score, dtype: int64

A7_Score
False    704
Name: A7_Score, dtype: int64

A8_Score
False    704
Name: A8_Score, dtype: int64

A9_Score
False    704
Name: A9_Score, dtype: int64

A10_Score
False    704
Name: A10_Score, dtype: int64

age
False    702
True       2
Name: age, dtype: int64

gender
False    704
Name: gender, dtype: int64

ethnicity
False    609
True      95
Name: ethnicity, dtype: int64

jundice
False    704
Name: jundice, dtype: int64

austim
False    704
Name: austim, dtype: int64

contry_of_res
False    704
Name: contry_of_res, dtype: int64

used_app_before
False    704
Name: used_app_before, dtype: int64

result
False    704
Name: result, dtype: i

### Replace missing values

Based on the summary above, each column has 704 rows of data, 3 columns containing missing data:

Relation: 95 missing data

age: 2 missing data

Ethnicity: 95 missing data

Since they are categorical we replace them with the most occuring value in the column.

In [47]:
df = df.fillna(df.mode().iloc[0])
df.head()

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,1,0,0,1,1,0,...,f,White-European,no,no,United States,no,6,18 and more,Self,NO
1,2,1,1,0,1,0,0,0,1,0,...,m,Latino,no,yes,Brazil,no,5,18 and more,Self,NO
2,3,1,1,0,1,1,0,1,1,1,...,m,Latino,yes,yes,Spain,no,8,18 and more,Parent,YES
3,4,1,1,0,1,0,0,1,1,0,...,f,White-European,no,yes,United States,no,6,18 and more,Self,NO
4,5,1,0,0,0,0,0,0,1,0,...,f,White-European,no,no,Egypt,no,2,18 and more,Self,NO


## Removing unnecessary attributes

The attribute named “Result” is the sum total of the scores of questions (A1-A10). The value of “age_desc” is ’18 and more’ in all the samples. These two attributes are unnecessary. 

In [48]:
df = df.drop(['result','age_desc'], axis = 1)
df.head()

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,relation,Class/ASD
0,1,1,1,1,1,0,0,1,1,0,0,26,f,White-European,no,no,United States,no,Self,NO
1,2,1,1,0,1,0,0,0,1,0,1,24,m,Latino,no,yes,Brazil,no,Self,NO
2,3,1,1,0,1,1,0,1,1,1,1,27,m,Latino,yes,yes,Spain,no,Parent,YES
3,4,1,1,0,1,0,0,1,1,0,1,35,f,White-European,no,yes,United States,no,Self,NO
4,5,1,0,0,0,0,0,0,1,0,0,40,f,White-European,no,no,Egypt,no,Self,NO



## Rename column names

In [81]:
df.rename(columns={'jundice':'jaundice'},inplace = True)
df.rename(columns={'contry_of_res':'country_of_res'},inplace = True)
df.rename(columns={'austim':'autism'},inplace = True)
df

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,relation,Class/ASD
0,1,1,1,1,1,0,0,1,1,0,0,26,f,White-European,no,no,United States,no,Self,NO
1,2,1,1,0,1,0,0,0,1,0,1,24,m,Latino,no,yes,Brazil,no,Self,NO
2,3,1,1,0,1,1,0,1,1,1,1,27,m,Latino,yes,yes,Spain,no,Parent,YES
3,4,1,1,0,1,0,0,1,1,0,1,35,f,White-European,no,yes,United States,no,Self,NO
4,5,1,0,0,0,0,0,0,1,0,0,40,f,White-European,no,no,Egypt,no,Self,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,700,0,1,0,1,1,0,1,1,1,1,25,f,White-European,no,no,Russia,no,Self,YES
700,701,1,0,0,0,0,0,0,1,0,1,34,m,Hispanic,no,no,Mexico,no,Parent,NO
701,702,1,0,1,1,1,0,1,1,0,1,24,f,White-European,no,no,Russia,no,Self,YES
702,703,1,0,0,1,1,0,1,0,1,1,35,m,South Asian,no,no,Pakistan,no,Self,NO


### Removing non numeric data

In [89]:
df.drop_duplicates(inplace = True)
df_num = df.drop(['gender','ethnicity','jaundice','autism','country_of_res','used_app_before','relation','Class/ASD'], axis = 'columns')
df_num

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age
0,1,1,1,1,1,0,0,1,1,0,0,26
1,2,1,1,0,1,0,0,0,1,0,1,24
2,3,1,1,0,1,1,0,1,1,1,1,27
3,4,1,1,0,1,0,0,1,1,0,1,35
4,5,1,0,0,0,0,0,0,1,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...
699,700,0,1,0,1,1,0,1,1,1,1,25
700,701,1,0,0,0,0,0,0,1,0,1,34
701,702,1,0,1,1,1,0,1,1,0,1,24
702,703,1,0,0,1,1,0,1,0,1,1,35


## Encoding categorical data.


Relation, Jaundice, Gender, Country_of_res, Ethnicity, Autism , used_app_before, attributes have categorical data.

One hot encoding is used for this purpose since they are nominal variables.

Nominal variable - comprises a finite set of discrete values with no relationship between values.
Ordinal Variable - Variable comprises a finite set of discrete values with a ranked ordering between values.

One hot endcoding is used because in most of the models numeric data is required.


In [83]:
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

In [97]:
subset =  df[['jaundice','gender','country_of_res','ethnicity','autism','used_app_before','relation','Class/ASD']]
ce_one_hot = ce.OneHotEncoder(cols = ['jaundice','gender','country_of_res','ethnicity','autism','used_app_before','relation','Class/ASD'])
autism_onehot=ce_one_hot.fit_transform(subset)
autism_onehot

Unnamed: 0,jaundice_1,jaundice_2,gender_1,gender_2,country_of_res_1,country_of_res_2,country_of_res_3,country_of_res_4,country_of_res_5,country_of_res_6,...,autism_2,used_app_before_1,used_app_before_2,relation_1,relation_2,relation_3,relation_4,relation_5,Class/ASD_1,Class/ASD_2
0,1,0,1,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
1,1,0,0,1,0,1,0,0,0,0,...,1,1,0,1,0,0,0,0,1,0
2,0,1,0,1,0,0,1,0,0,0,...,1,1,0,0,1,0,0,0,0,1
3,1,0,1,0,1,0,0,0,0,0,...,1,1,0,1,0,0,0,0,1,0
4,1,0,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,1,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
700,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
701,1,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
702,1,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


### Creating target attribute (numerical)

In [98]:
final_df = pd.concat([df_num, autism_onehot], axis='columns')
target = final_df[['Class/ASD_1']]
final_df.drop(['Class/ASD_1','Class/ASD_2'],axis = "columns", inplace = True)
final_df.head()

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,ethnicity_11,autism_1,autism_2,used_app_before_1,used_app_before_2,relation_1,relation_2,relation_3,relation_4,relation_5
0,1,1,1,1,1,0,0,1,1,0,...,0,1,0,1,0,1,0,0,0,0
1,2,1,1,0,1,0,0,0,1,0,...,0,0,1,1,0,1,0,0,0,0
2,3,1,1,0,1,1,0,1,1,1,...,0,0,1,1,0,0,1,0,0,0
3,4,1,1,0,1,0,0,1,1,0,...,0,0,1,1,0,1,0,0,0,0
4,5,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,0,0


###  Remove recurrent values

In [99]:
final_df = final_df.drop(['jaundice_2','gender_2','autism_2','used_app_before_2'],axis = 'columns')
final_df

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,ethnicity_9,ethnicity_10,ethnicity_11,autism_1,used_app_before_1,relation_1,relation_2,relation_3,relation_4,relation_5
0,1,1,1,1,1,0,0,1,1,0,...,0,0,0,1,1,1,0,0,0,0
1,2,1,1,0,1,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
2,3,1,1,0,1,1,0,1,1,1,...,0,0,0,0,1,0,1,0,0,0
3,4,1,1,0,1,0,0,1,1,0,...,0,0,0,0,1,1,0,0,0,0
4,5,1,0,0,0,0,0,0,1,0,...,0,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,700,0,1,0,1,1,0,1,1,1,...,0,0,0,1,1,1,0,0,0,0
700,701,1,0,0,0,0,0,0,1,0,...,1,0,0,1,1,0,1,0,0,0
701,702,1,0,1,1,1,0,1,1,0,...,0,0,0,1,1,1,0,0,0,0
702,703,1,0,0,1,1,0,1,0,1,...,0,0,0,1,1,1,0,0,0,0









# Random forest

In [135]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


x_train, x_test, y_train, y_test = train_test_split(final_df,target,test_size=0.2)

In [136]:
len(y_train)

563

In [137]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100)
model.fit(x_train,y_train)

  model.fit(x_train,y_train)


RandomForestClassifier()

In [138]:
model.score(x_test, y_test)

0.9645390070921985

In [140]:
y_predicted = model.predict(x_test)

## Evaluatig confusion matrix

In [143]:
cm = confusion_matrix(y_test,y_predicted)
cm

array([[ 35,   4],
       [  1, 101]], dtype=int64)

In [144]:
x_predicted = model.predict(x_train)
cm1 = confusion_matrix(y_train,x_predicted)

In [145]:
cm1

array([[150,   0],
       [  0, 413]], dtype=int64)