# **Predict Autism Spectrum Disorder in Child**

![pic1](signs_of_autism.jpg)
![pic2](signs_of_autism1.jpg)

https://www.kaggle.com/fabdelja/autism-screening-for-toddlers

#### **Import the required packages**

In [1]:
import numpy as np
import pandas as pd
import scipy.io.arff as arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm

%matplotlib inline

#### **Import the ARFF(Attribute Relational File Format) Dataset file**

In [2]:
arff.loadarff('Autism-Child-Data.arff')[0]

array([(b'1', b'1', b'0', b'0', b'1', b'1', b'0', b'1', b'0', b'0',  6., b'm', b'Others', b'no', b'no', b'Jordan', b'no',  5., b'4-11 years', b'Parent', b'NO'),
       (b'1', b'1', b'0', b'0', b'1', b'1', b'0', b'1', b'0', b'0',  6., b'm', b'Middle Eastern ', b'no', b'no', b'Jordan', b'no',  5., b'4-11 years', b'Parent', b'NO'),
       (b'1', b'1', b'0', b'0', b'0', b'1', b'1', b'1', b'0', b'0',  6., b'm', b'?', b'no', b'no', b'Jordan', b'yes',  5., b'4-11 years', b'?', b'NO'),
       (b'0', b'1', b'0', b'0', b'1', b'1', b'0', b'0', b'0', b'1',  5., b'f', b'?', b'yes', b'no', b'Jordan', b'no',  4., b'4-11 years', b'?', b'NO'),
       (b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1',  5., b'm', b'Others', b'yes', b'no', b'United States', b'no', 10., b'4-11 years', b'Parent', b'YES'),
       (b'0', b'0', b'1', b'0', b'1', b'1', b'0', b'1', b'0', b'1',  4., b'm', b'?', b'no', b'yes', b'Egypt', b'no',  5., b'4-11 years', b'?', b'NO'),
       (b'1', b'0', b'1', b'1', b'1', b'1', 

In [3]:
arff.loadarff('Autism-Child-Data.arff')[1]

Dataset: child
	A1_Score's type is nominal, range is ('0', '1')
	A2_Score's type is nominal, range is ('0', '1')
	A3_Score's type is nominal, range is ('0', '1')
	A4_Score's type is nominal, range is ('0', '1')
	A5_Score's type is nominal, range is ('0', '1')
	A6_Score's type is nominal, range is ('0', '1')
	A7_Score's type is nominal, range is ('0', '1')
	A8_Score's type is nominal, range is ('0', '1')
	A9_Score's type is nominal, range is ('0', '1')
	A10_Score's type is nominal, range is ('0', '1')
	age's type is numeric
	gender's type is nominal, range is ('m', 'f')
	ethnicity's type is nominal, range is ('Others', 'Middle Eastern ', 'White-European', 'Black', 'South Asian', 'Asian', 'Pasifika', 'Hispanic', 'Turkish', 'Latino')
	jundice's type is nominal, range is ('no', 'yes')
	austim's type is nominal, range is ('no', 'yes')
	contry_of_res's type is nominal, range is ('Jordan', 'United States', 'Egypt', 'United Kingdom', 'Bahrain', 'Austria', 'Kuwait', 'United Arab Emirates', 'Eur

#### **Creating DataFrame from ARFF file**

In [4]:
autism_dataset = pd.DataFrame(arff.loadarff('Autism-Child-Data.arff')[0])

In [5]:
autism_dataset.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Others',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
1,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Middle Eastern ',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
2,b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'1',b'0',b'0',...,b'm',b'?',b'no',b'no',b'Jordan',b'yes',5.0,b'4-11 years',b'?',b'NO'
3,b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',...,b'f',b'?',b'yes',b'no',b'Jordan',b'no',4.0,b'4-11 years',b'?',b'NO'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',...,b'm',b'Others',b'yes',b'no',b'United States',b'no',10.0,b'4-11 years',b'Parent',b'YES'


#### **Changing the character encoding**

In [6]:
def apply_decode(df_name):
    for col in df_name.columns:
        if df_name[col].dtype != 'float64':
            df_name[col] = df_name[col].apply(lambda val : val.decode('utf-8'))
    pd.set_option('display.max_columns',50)
    return df_name

In [7]:
autism_df = apply_decode(autism_dataset)

In [8]:
autism_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,?,no,no,Jordan,yes,5.0,4-11 years,?,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,?,yes,no,Jordan,no,4.0,4-11 years,?,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


#### **Dataset Features**

In [9]:
autism_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [10]:
len(autism_df.columns)

21

In [11]:
autism_df = autism_df[['age','gender','ethnicity','contry_of_res','jundice','austim','relation','used_app_before','age_desc','A1_Score','A2_Score','A3_Score','A4_Score','A5_Score',
                       'A6_Score','A7_Score','A8_Score','A9_Score','A10_Score','result','Class/ASD']]

In [12]:
autism_df.head()

Unnamed: 0,age,gender,ethnicity,contry_of_res,jundice,austim,relation,used_app_before,age_desc,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result,Class/ASD
0,6.0,m,Others,Jordan,no,no,Parent,no,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
1,6.0,m,Middle Eastern,Jordan,no,no,Parent,no,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
2,6.0,m,?,Jordan,no,no,?,yes,4-11 years,1,1,0,0,0,1,1,1,0,0,5.0,NO
3,5.0,f,?,Jordan,yes,no,?,no,4-11 years,0,1,0,0,1,1,0,0,0,1,4.0,NO
4,5.0,m,Others,United States,yes,no,Parent,no,4-11 years,1,1,1,1,1,1,1,1,1,1,10.0,YES


In [13]:
autism_df.columns = ['age','gender','ethnicity','country','born_with_jaundice','family_member_with_PDD','whos_completing_test','used_screening_app_before','age_type','Q1_Score',
                     'Q2_Score','Q3_Score','Q4_Score','Q5_Score','Q6_Score','Q7_Score','Q8_Score','Q9_Score','Q10_Score','screening_score','ASD_Label']

In [14]:
autism_df.head()

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
0,6.0,m,Others,Jordan,no,no,Parent,no,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
1,6.0,m,Middle Eastern,Jordan,no,no,Parent,no,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
2,6.0,m,?,Jordan,no,no,?,yes,4-11 years,1,1,0,0,0,1,1,1,0,0,5.0,NO
3,5.0,f,?,Jordan,yes,no,?,no,4-11 years,0,1,0,0,1,1,0,0,0,1,4.0,NO
4,5.0,m,Others,United States,yes,no,Parent,no,4-11 years,1,1,1,1,1,1,1,1,1,1,10.0,YES


## **Datatype Handling**

In [15]:
autism_df.dtypes

age                          float64
gender                        object
ethnicity                     object
country                       object
born_with_jaundice            object
family_member_with_PDD        object
whos_completing_test          object
used_screening_app_before     object
age_type                      object
Q1_Score                      object
Q2_Score                      object
Q3_Score                      object
Q4_Score                      object
Q5_Score                      object
Q6_Score                      object
Q7_Score                      object
Q8_Score                      object
Q9_Score                      object
Q10_Score                     object
screening_score              float64
ASD_Label                     object
dtype: object

#### **NULL Records**

In [16]:
autism_df.isnull().sum()

age                          4
gender                       0
ethnicity                    0
country                      0
born_with_jaundice           0
family_member_with_PDD       0
whos_completing_test         0
used_screening_app_before    0
age_type                     0
Q1_Score                     0
Q2_Score                     0
Q3_Score                     0
Q4_Score                     0
Q5_Score                     0
Q6_Score                     0
Q7_Score                     0
Q8_Score                     0
Q9_Score                     0
Q10_Score                    0
screening_score              0
ASD_Label                    0
dtype: int64

## ***AGE variable***

In [17]:
autism_df[autism_df['age'].isnull()]

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
32,,m,?,Egypt,no,no,?,no,4-11 years,1,0,0,1,0,1,1,1,1,1,7.0,YES
65,,m,Middle Eastern,Jordan,no,no,Parent,no,4-11 years,0,0,1,0,0,0,0,0,0,0,1.0,NO
126,,m,?,Jordan,yes,no,?,no,4-11 years,1,1,1,1,0,1,0,1,0,1,7.0,YES
138,,m,?,Qatar,yes,no,?,yes,4-11 years,0,1,1,0,1,1,1,1,0,0,6.0,NO


#### **So, before converting the datatype of AGE variable from FLOAT to INT, we need to fill its NULL values. Hence, replacing the NULLs with 0 and later on handle these 4 records.** 

In [18]:
autism_df['age'] = autism_df['age'].fillna(value=0)

In [19]:
autism_df.isnull().sum()

age                          0
gender                       0
ethnicity                    0
country                      0
born_with_jaundice           0
family_member_with_PDD       0
whos_completing_test         0
used_screening_app_before    0
age_type                     0
Q1_Score                     0
Q2_Score                     0
Q3_Score                     0
Q4_Score                     0
Q5_Score                     0
Q6_Score                     0
Q7_Score                     0
Q8_Score                     0
Q9_Score                     0
Q10_Score                    0
screening_score              0
ASD_Label                    0
dtype: int64

In [20]:
autism_df[autism_df['age'] == 0]

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
32,0.0,m,?,Egypt,no,no,?,no,4-11 years,1,0,0,1,0,1,1,1,1,1,7.0,YES
65,0.0,m,Middle Eastern,Jordan,no,no,Parent,no,4-11 years,0,0,1,0,0,0,0,0,0,0,1.0,NO
126,0.0,m,?,Jordan,yes,no,?,no,4-11 years,1,1,1,1,0,1,0,1,0,1,7.0,YES
138,0.0,m,?,Qatar,yes,no,?,yes,4-11 years,0,1,1,0,1,1,1,1,0,0,6.0,NO


In [21]:
autism_df['age'] = autism_df['age'].astype('int')

In [22]:
autism_df['age'].head()

0    6
1    6
2    6
3    5
4    5
Name: age, dtype: int32

In [23]:
autism_df['age'].dtype

dtype('int32')

## ***GENDER variable***

In [24]:
autism_df['gender'].value_counts()

m    208
f     84
Name: gender, dtype: int64

#### **So, assigning 1 to m(i.e. male) and 0 to f(i.e. female)**

In [25]:
autism_df['gender'] = autism_df['gender'].apply(lambda val : 1 if val == 'm' else 0 if val == 'f' else val)

In [26]:
autism_df['gender'].dtype

dtype('int64')

In [27]:
autism_df['gender'].value_counts()

1    208
0     84
Name: gender, dtype: int64

## ***BORN_WITH_JAUNDICE variable***

In [28]:
autism_df['born_with_jaundice'].value_counts()

no     212
yes     80
Name: born_with_jaundice, dtype: int64

In [29]:
autism_df['born_with_jaundice'].dtype

dtype('O')

#### **So, assigning 1 to yes and 0 to no**

In [30]:
autism_df['born_with_jaundice'] = autism_df['born_with_jaundice'].apply(lambda val: 1 if val =='yes' else 0)

In [31]:
autism_df['born_with_jaundice'].value_counts()

0    212
1     80
Name: born_with_jaundice, dtype: int64

In [32]:
autism_df['born_with_jaundice'].dtype

dtype('int64')

## ***FAMILY_MEMBER_WITH_PDD variable***
#### **PDD stands for Pervasive Development Disorder**

In [33]:
autism_df['family_member_with_PDD'].value_counts()

no     243
yes     49
Name: family_member_with_PDD, dtype: int64

In [34]:
autism_df['family_member_with_PDD'].dtype

dtype('O')

In [35]:
autism_df['family_member_with_PDD'] = autism_df['family_member_with_PDD'].apply(lambda val: 1 if val=='yes' else 0)

In [36]:
autism_df['family_member_with_PDD'].dtype

dtype('int64')

In [37]:
autism_df['family_member_with_PDD'].value_counts()

0    243
1     49
Name: family_member_with_PDD, dtype: int64

## ***USED_SCREENING_APP_BEFORE variable***

In [38]:
autism_df['used_screening_app_before'].dtype

dtype('O')

In [39]:
autism_df['used_screening_app_before'].value_counts()

no     281
yes     11
Name: used_screening_app_before, dtype: int64

In [40]:
autism_df['used_screening_app_before'] = autism_df['used_screening_app_before'].apply(lambda val: 1 if val =='yes' else 0)

In [41]:
autism_df['used_screening_app_before'].dtype

dtype('int64')

In [42]:
autism_df['used_screening_app_before'].value_counts()

0    281
1     11
Name: used_screening_app_before, dtype: int64

## ***Screening Questions variables***

In [43]:
def qscore_vars(df_name,cols_lst):
    """
    Description: This function will convert the datatype to INT of dataframe columns.
    
    Input parameters: It accepts two input parameters:
    1. df_name: DataFrame whose columns to be changed
    2. cols_lst: List containing the names of the columns
    
    Return: It returns the modified DataFrame
    """
    for col in cols_lst:
        df_name[col] = df_name[col].astype('int')
    return df_name

In [44]:
qscore_vars(autism_df,['Q1_Score','Q2_Score','Q3_Score','Q4_Score','Q5_Score','Q6_Score','Q7_Score','Q8_Score','Q9_Score','Q10_Score'])

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
0,6,1,Others,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
1,6,1,Middle Eastern,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5.0,NO
2,6,1,?,Jordan,0,0,?,1,4-11 years,1,1,0,0,0,1,1,1,0,0,5.0,NO
3,5,0,?,Jordan,1,0,?,0,4-11 years,0,1,0,0,1,1,0,0,0,1,4.0,NO
4,5,1,Others,United States,1,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,1,1,10.0,YES
5,4,1,?,Egypt,0,1,?,0,4-11 years,0,0,1,0,1,1,0,1,0,1,5.0,NO
6,5,1,White-European,United Kingdom,0,0,Parent,0,4-11 years,1,0,1,1,1,1,0,1,0,1,7.0,YES
7,5,0,Middle Eastern,Bahrain,0,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,0,0,8.0,YES
8,11,0,Middle Eastern,Bahrain,0,0,Parent,0,4-11 years,1,1,1,1,1,1,1,0,0,0,7.0,YES
9,11,0,?,Austria,0,1,?,0,4-11 years,0,0,1,1,1,0,1,1,0,0,5.0,NO


## ***SCREENING_SCORE variable***

In [45]:
autism_df['screening_score'].dtype

dtype('float64')

In [46]:
autism_df['screening_score'].value_counts()

8.0     44
7.0     44
5.0     41
6.0     40
4.0     33
9.0     32
3.0     21
10.0    21
2.0      9
1.0      6
0.0      1
Name: screening_score, dtype: int64

In [47]:
autism_df['screening_score'] = autism_df['screening_score'].astype('int')

In [48]:
autism_df['screening_score'].dtype

dtype('int32')

In [49]:
autism_df['screening_score'].value_counts()

8     44
7     44
5     41
6     40
4     33
9     32
10    21
3     21
2      9
1      6
0      1
Name: screening_score, dtype: int64

## ***ASD_Label variable***

In [50]:
autism_df['ASD_Label'].dtype

dtype('O')

In [51]:
autism_df['ASD_Label'].value_counts()

NO     151
YES    141
Name: ASD_Label, dtype: int64

#### **Converting the 'ASD_Label' into numeric values where 1 will represents YES and 0 represents NO.**

In [52]:
autism_df['ASD_Label'] = autism_df['ASD_Label'].apply(lambda val: 1 if val=='YES' else 0)

In [53]:
autism_df['ASD_Label'].dtype

dtype('int64')

In [54]:
autism_df['ASD_Label'].value_counts()

0    151
1    141
Name: ASD_Label, dtype: int64

## ***WHOS_COMPLETING_TEST***

In [55]:
autism_df['whos_completing_test'].value_counts()

Parent                      214
?                            43
Relative                     17
Health care professional     13
Self                          4
self                          1
Name: whos_completing_test, dtype: int64

In [56]:
autism_df['whos_completing_test'] = autism_df['whos_completing_test'].str.capitalize()

In [57]:
autism_df['whos_completing_test'].value_counts()

Parent                      214
?                            43
Relative                     17
Health care professional     13
Self                          5
Name: whos_completing_test, dtype: int64

## ***First-hand cleaned DataFrame***

In [58]:
autism_df.head()

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
0,6,1,Others,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0
1,6,1,Middle Eastern,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0
2,6,1,?,Jordan,0,0,?,1,4-11 years,1,1,0,0,0,1,1,1,0,0,5,0
3,5,0,?,Jordan,1,0,?,0,4-11 years,0,1,0,0,1,1,0,0,0,1,4,0
4,5,1,Others,United States,1,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,1,1,10,1


In [59]:
autism_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 21 columns):
age                          292 non-null int32
gender                       292 non-null int64
ethnicity                    292 non-null object
country                      292 non-null object
born_with_jaundice           292 non-null int64
family_member_with_PDD       292 non-null int64
whos_completing_test         292 non-null object
used_screening_app_before    292 non-null int64
age_type                     292 non-null object
Q1_Score                     292 non-null int32
Q2_Score                     292 non-null int32
Q3_Score                     292 non-null int32
Q4_Score                     292 non-null int32
Q5_Score                     292 non-null int32
Q6_Score                     292 non-null int32
Q7_Score                     292 non-null int32
Q8_Score                     292 non-null int32
Q9_Score                     292 non-null int32
Q10_Score                  

In [60]:
autism_df.describe()

Unnamed: 0,age,gender,born_with_jaundice,family_member_with_PDD,used_screening_app_before,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,6.267123,0.712329,0.273973,0.167808,0.037671,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.239726,0.482877
std,2.462896,0.453454,0.446761,0.374337,0.190727,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.284882,0.500565
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
50%,6.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0,0.0
75%,8.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0
max,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,1.0


#### **Thus, we got the datatypes of this dataset fixed.**

# ***NULLs Handling***

In [61]:
autism_df.isin(['?']).sum()

age                           0
gender                        0
ethnicity                    43
country                       0
born_with_jaundice            0
family_member_with_PDD        0
whos_completing_test         43
used_screening_app_before     0
age_type                      0
Q1_Score                      0
Q2_Score                      0
Q3_Score                      0
Q4_Score                      0
Q5_Score                      0
Q6_Score                      0
Q7_Score                      0
Q8_Score                      0
Q9_Score                      0
Q10_Score                     0
screening_score               0
ASD_Label                     0
dtype: int64

In [62]:
autism_df[autism_df['ethnicity'] == '?']['country'].unique()

array(['Jordan', 'Egypt', 'Austria', 'Kuwait', 'United Arab Emirates',
       'Syria', 'Saudi Arabia', 'Pakistan', 'Qatar', 'Lebanon', 'Latvia',
       'Russia', 'China', 'Iraq', 'Malaysia', 'Libya'], dtype=object)

In [63]:
autism_df[autism_df['ethnicity'] == '?']['country'].value_counts()

Jordan                  10
Egypt                    7
Russia                   3
Qatar                    3
Pakistan                 3
Saudi Arabia             3
Lebanon                  2
Libya                    2
United Arab Emirates     2
Syria                    2
Latvia                   1
Austria                  1
Kuwait                   1
China                    1
Malaysia                 1
Iraq                     1
Name: country, dtype: int64

### ***Fixing ETHNICITY where COUNTRY is JORDAN***

In [64]:
pd.DataFrame(autism_df[autism_df['country'] == 'Jordan']['ethnicity'].value_counts())

Unnamed: 0,ethnicity
?,10
Middle Eastern,9
Others,1


### ***Fixing ETHNICITY where COUNTRY is EGYPT***

In [65]:
pd.DataFrame(autism_df[autism_df['country'] == 'Egypt']['ethnicity'].value_counts())

Unnamed: 0,ethnicity
?,7
Middle Eastern,2


### ***Fixing ETHNICITY where COUNTRIES are 'Qatar','Saudi Arabia','Russia' and 'Pakistan'***

In [66]:
pd.DataFrame(autism_df[autism_df['country'].isin(['Qatar','Saudi Arabia','Russia','Pakistan'])][['age','ethnicity','country']].groupby(['ethnicity','country'])['age'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,age
ethnicity,country,Unnamed: 2_level_1
?,Pakistan,3
?,Qatar,3
?,Russia,3
?,Saudi Arabia,3
Asian,Pakistan,1
South Asian,Saudi Arabia,1


### ***Fixing ETHNICITY where COUNTRIES are 'Syria','United Arab Emirates','Lebanon' and 'Libya'***

In [67]:
pd.DataFrame(autism_df[autism_df['country'].isin(['Syria','United Arab Emirates','Lebanon','Libya'])][['age','ethnicity','country']].groupby(['ethnicity','country'])['age'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,age
ethnicity,country,Unnamed: 2_level_1
?,Lebanon,2
?,Libya,2
?,Syria,2
?,United Arab Emirates,2
Black,United Arab Emirates,1
Middle Eastern,Syria,1
Middle Eastern,United Arab Emirates,4
Others,Libya,1


### ***Fixing ETHNICITY where COUNTRIES are 'China','Kuwait','Iraq','Latvia','Austria' and 'Malaysia'***

In [68]:
pd.DataFrame(autism_df[autism_df['country'].isin(['China','Kuwait','Iraq','Latvia','Austria','Malaysia'])][['age','ethnicity','country']].groupby(['ethnicity','country'])['age'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,age
ethnicity,country,Unnamed: 2_level_1
?,Austria,1
?,China,1
?,Iraq,1
?,Kuwait,1
?,Latvia,1
?,Malaysia,1
Asian,Malaysia,1
Middle Eastern,Iraq,2
White-European,Austria,1


In [69]:
autism_df[autism_df['ethnicity'] == '?']['country'].unique()

array(['Jordan', 'Egypt', 'Austria', 'Kuwait', 'United Arab Emirates',
       'Syria', 'Saudi Arabia', 'Pakistan', 'Qatar', 'Lebanon', 'Latvia',
       'Russia', 'China', 'Iraq', 'Malaysia', 'Libya'], dtype=object)

In [70]:
country_ethnicity = {'Jordan':'Middle Eastern','Egypt':'Middle Eastern','Pakistan':'Asian','Qatar':'Others','Russia':'Others','Saudi Arabia':'South Asian','Lebanon':'Others',
                    'Libya':'Others','Syria':'Middle Eastern','United Arab Emirates':'Black','Austria':'White-European','China':'Asian','Iraq':'Middle Eastern',
                    'Kuwait':'Others','Latvia':'Others','Malaysia':'Asian'}

In [71]:
def fix_nulls_in_ethnicity(df_name,country_dict):
    """
    Description: This function will update the ETHNICITY variable based upon the COUNTRY.
    
    Input parameters: It accepts two input parameters:
    1. df_name: DataFrame on which action to be performed
    2. country_dict: Dictionary having values of ETHNICITY for COUNTRIES
    
    Return: The modified DataFrame with the new fixed ETHNICITY column.
    """
    df_name['fix_ethnicity'] = df_name[['ethnicity','country']].apply(lambda val: country_dict[val['country']] if val['ethnicity'] == '?' else val['ethnicity'].strip(),axis=1)
    return df_name

In [72]:
fix_nulls_in_ethnicity(autism_df,country_ethnicity)

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label,fix_ethnicity
0,6,1,Others,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Others
1,6,1,Middle Eastern,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Middle Eastern
2,6,1,?,Jordan,0,0,?,1,4-11 years,1,1,0,0,0,1,1,1,0,0,5,0,Middle Eastern
3,5,0,?,Jordan,1,0,?,0,4-11 years,0,1,0,0,1,1,0,0,0,1,4,0,Middle Eastern
4,5,1,Others,United States,1,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,1,1,10,1,Others
5,4,1,?,Egypt,0,1,?,0,4-11 years,0,0,1,0,1,1,0,1,0,1,5,0,Middle Eastern
6,5,1,White-European,United Kingdom,0,0,Parent,0,4-11 years,1,0,1,1,1,1,0,1,0,1,7,1,White-European
7,5,0,Middle Eastern,Bahrain,0,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,0,0,8,1,Middle Eastern
8,11,0,Middle Eastern,Bahrain,0,0,Parent,0,4-11 years,1,1,1,1,1,1,1,0,0,0,7,1,Middle Eastern
9,11,0,?,Austria,0,1,?,0,4-11 years,0,0,1,1,1,0,1,1,0,0,5,0,White-European


In [73]:
autism_df['ethnicity'].value_counts()

White-European     108
Asian               46
?                   43
Middle Eastern      27
South Asian         21
Others              14
Black               14
Latino               8
Hispanic             7
Turkish              2
Pasifika             2
Name: ethnicity, dtype: int64

In [74]:
autism_df['fix_ethnicity'].value_counts()

White-European    109
Asian              51
Middle Eastern     47
Others             26
South Asian        24
Black              16
Latino              8
Hispanic            7
Turkish             2
Pasifika            2
Name: fix_ethnicity, dtype: int64

## ***Fixing WHOS_COMPLETING_TEST***

In [75]:
autism_df.head()

Unnamed: 0,age,gender,ethnicity,country,born_with_jaundice,family_member_with_PDD,whos_completing_test,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label,fix_ethnicity
0,6,1,Others,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Others
1,6,1,Middle Eastern,Jordan,0,0,Parent,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Middle Eastern
2,6,1,?,Jordan,0,0,?,1,4-11 years,1,1,0,0,0,1,1,1,0,0,5,0,Middle Eastern
3,5,0,?,Jordan,1,0,?,0,4-11 years,0,1,0,0,1,1,0,0,0,1,4,0,Middle Eastern
4,5,1,Others,United States,1,0,Parent,0,4-11 years,1,1,1,1,1,1,1,1,1,1,10,1,Others


In [76]:
autism_df['whos_completing_test'].value_counts()

Parent                      214
?                            43
Relative                     17
Health care professional     13
Self                          5
Name: whos_completing_test, dtype: int64

In [79]:
pd.set_option('display.max_rows',150)

In [80]:
pd.DataFrame(autism_df.groupby(['country','ethnicity','whos_completing_test'],axis=0)['age'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age
country,ethnicity,whos_completing_test,Unnamed: 3_level_1
Afghanistan,Middle Eastern,Parent,1
Afghanistan,Middle Eastern,Self,1
Argentina,Asian,Parent,1
Armenia,Middle Eastern,Health care professional,1
Armenia,South Asian,Health care professional,1
Armenia,White-European,Parent,1
Australia,Asian,Parent,1
Australia,Black,Parent,1
Australia,Others,Parent,1
Australia,Others,Self,1


#### **As the variable 'WHOS_COMPLETING_TEST' have a paramount difference between 'Parent' and rest of the categories. So, assigning 1 to 'Parent' and 0 to all the others.**

In [81]:
autism_df['fix_whos_completing_test'] = autism_df['whos_completing_test'].apply(lambda val : 1 if val.strip() == 'Parent' else 0)

In [82]:
autism_df['fix_whos_completing_test'].value_counts()

1    214
0     78
Name: fix_whos_completing_test, dtype: int64

## ***Drop the Columns which have missing values and we already fixed them by creating their new columns***

In [89]:
autism_df.drop(['ethnicity','whos_completing_test'],axis=1,inplace=True)

In [90]:
autism_df.head()

Unnamed: 0,age,gender,country,born_with_jaundice,family_member_with_PDD,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label,fix_ethnicity,fix_whos_completing_test
0,6,1,Jordan,0,0,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Others,1
1,6,1,Jordan,0,0,0,4-11 years,1,1,0,0,1,1,0,1,0,0,5,0,Middle Eastern,1
2,6,1,Jordan,0,0,1,4-11 years,1,1,0,0,0,1,1,1,0,0,5,0,Middle Eastern,0
3,5,0,Jordan,1,0,0,4-11 years,0,1,0,0,1,1,0,0,0,1,4,0,Middle Eastern,0
4,5,1,United States,1,0,0,4-11 years,1,1,1,1,1,1,1,1,1,1,10,1,Others,1


## ***Create an ML Algo to predict the missing values in AGE Variable***

In [91]:
autism_df[autism_df['age'] == 0]

Unnamed: 0,age,gender,country,born_with_jaundice,family_member_with_PDD,used_screening_app_before,age_type,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Q5_Score,Q6_Score,Q7_Score,Q8_Score,Q9_Score,Q10_Score,screening_score,ASD_Label,fix_ethnicity,fix_whos_completing_test
32,0,1,Egypt,0,0,0,4-11 years,1,0,0,1,0,1,1,1,1,1,7,1,Middle Eastern,0
65,0,1,Jordan,0,0,0,4-11 years,0,0,1,0,0,0,0,0,0,0,1,0,Middle Eastern,1
126,0,1,Jordan,1,0,0,4-11 years,1,1,1,1,0,1,0,1,0,1,7,1,Middle Eastern,0
138,0,1,Qatar,1,0,1,4-11 years,0,1,1,0,1,1,1,1,0,0,6,0,Others,0


https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779

https://analyticsindiamag.com/5-ways-handle-missing-values-machine-learning-datasets/

https://towardsdatascience.com/the-tale-of-missing-values-in-python-c96beb0e8a9d

https://towardsdatascience.com/working-with-missing-data-in-machine-learning-9c0a430df4ce