# Importing data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the toddler screening CSV and use the case number as the row index.
autism_data = pd.read_csv(
    '../input/Toddler Autism dataset July 2018.csv',
    index_col='Case_No',
)
# Preview the first rows to confirm the file was parsed as expected.
autism_data.head()

### **Corresponding Q-chat-10-Toddler Features:**
1. Does your child look at you when you call his/her name?
2. How easy is it for you to get eye contact with your child? 
3. Does your child point to indicate that s/he wants something? (e.g. a toy that is out of reach) 
4. Does your child point to share interest with you? (e.g. pointing at an interesting sight) 
5. Does your child pretend? (e.g. care for dolls, talk on a toy phone) 
6. Does your child follow where you’re looking? 
7. If you or someone else in the family is visibly upset, does your child show signs of wanting to comfort them? (e.g. stroking hair, hugging them)
8. Would you describe your child’s first words as: 
9. Does your child use simple gestures? (e.g. wave goodbye) 
10. Does your child stare at nothing with no apparent purpose?

A1-A10: 
Each Q-Chat-10 item has the possible answers “Always, Usually, Sometimes, Rarely & Never”; in the dataset these answers are mapped to “1” or “0”. 
For questions 1-9 (A1-A9) in Q-chat-10, if the response was Sometimes / Rarely / Never, “1” is assigned to the question (A1-A9). However, for question 10 (A10), if the response was Always / Usually / Sometimes, then “1” is assigned to that question. 
Add the points together for all ten questions to obtain the Q-chat-10 score. 
If your child scores more than 3 (Q-chat-10 score), there are potential ASD traits; otherwise no ASD traits are observed.

http://docs.autismresearchcentre.com/papers/2008_Allison_etal_QCHAT.pdf - **we have to check the compatibility of questions here**

# Exploratory Data Analysis

In [None]:
# Summarize column dtypes and non-null counts to check for missing values.
autism_data.info()

There are no missing values; some of the columns are categorical.

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
autism_data.describe()

In [None]:
# Among rows labelled 'Yes' for ASD traits, show the share of each sex in percent.
# A single .loc call selects rows and column together instead of chaining two indexers.
autism_data.loc[autism_data['Class/ASD Traits '] == 'Yes', 'Sex'].value_counts(normalize=True).mul(100)

Percentage of males and females among the toddlers who show ASD traits (i.e. are at risk of autism)

In [None]:
# For row percentages instead of raw counts, uncomment the line below.
# pd.crosstab(autism_data['Ethnicity'], autism_data['Class/ASD Traits ']).apply(lambda r: r/r.sum()*100, axis=1)
# Contingency table: one row per ethnicity, one column per ASD-traits class.
pd.crosstab(
    index=autism_data['Ethnicity'],
    columns=autism_data['Class/ASD Traits '],
)

Number of toddlers with and without ASD traits, broken down by ethnicity

In [None]:
# Distinct respondent labels, in order of first appearance.
pd.unique(autism_data['Who completed the test'])

Unique values of the 'Who completed the test' column

In [None]:
# How many questionnaires each kind of respondent filled in.
autism_data.loc[:, 'Who completed the test'].value_counts()

Distribution of the 'Who completed the test' column

In [None]:
# Number of rows per value of the 'Jaundice' column.
autism_data.loc[:, 'Jaundice'].value_counts()

Distribution of toddlers with and without jaundice

# Data Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Count of toddlers per 'Jaundice' value, split by ASD-traits class.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=autism_data, x='Jaundice', hue='Class/ASD Traits ', ax=ax)

In [None]:
# Scatter of age (months) against Q-chat-10 score with one regression line per class.
# lmplot is a figure-level function that creates its own figure, so the size is set
# through height/aspect (6 x 2 = 12 wide) instead of a preceding plt.figure call,
# which would only leave an empty figure behind.
# x and y are passed as keywords: recent seaborn releases no longer accept them
# positionally.
sns.lmplot(
    x='Age_Mons',
    y='Qchat-10-Score',
    data=autism_data,
    hue='Class/ASD Traits ',
    fit_reg=True,
    height=6,
    aspect=2,
)

In [None]:
# Count of toddlers per ethnicity, split by ASD-traits class.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=autism_data, x='Ethnicity', hue='Class/ASD Traits ', ax=ax)

In [None]:
# Count of toddlers with/without a family member with ASD, split by ASD-traits class.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=autism_data, x='Family_mem_with_ASD', hue='Class/ASD Traits ', ax=ax)

# Machine Learning

### Mapping categorical features

In [None]:
# Encode the two-valued text columns as 0/1, one lookup table per column.
# Values missing from a table become NaN, so this cell must run exactly once
# on the freshly loaded frame.
for column, codes in {
    'Sex': {'m': 0, 'f': 1},
    'Jaundice': {'no': 0, 'yes': 1},
    'Family_mem_with_ASD': {'no': 0, 'yes': 1},
    'Class/ASD Traits ': {'No': 0, 'Yes': 1},
}.items():
    autism_data[column] = autism_data[column].map(codes)

* Yes - 1
* No - 0

In [None]:
# Merge the lower-case spelling into the capitalized one so both count as one category.
# The result is assigned back to the column rather than using replace(inplace=True)
# on the selected Series: that form is chained assignment, which raises a warning in
# pandas 2.x and leaves the DataFrame unchanged once Copy-on-Write is enabled.
autism_data['Who completed the test'] = autism_data['Who completed the test'].replace(
    'Health care professional', 'Health Care Professional'
)

We merged two spellings of the same value in the 'Who completed the test' column; they differed only in letter case.

In [None]:
# Preview the frame after the binary mapping and the respondent-label clean-up.
autism_data.head()

### Label Encoding of categorical features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns that still need an integer code.
object_cols = ['Ethnicity', 'Who completed the test']
label_encoder = LabelEncoder()

# Build the encoded frame as a new object so autism_data keeps its text labels.
# The single encoder is refit for every column, so afterwards it only remembers
# the classes of the last column processed.
label_autism_data = autism_data.assign(**{
    col: label_encoder.fit_transform(autism_data[col]) for col in object_cols
})

In [None]:
# Preview the label-encoded copy of the data.
label_autism_data.head()

### Making a train/test split for Machine Learning

In [None]:
# Target: 1 if the toddler shows ASD traits, 0 otherwise.
y = label_autism_data['Class/ASD Traits ']

# Features: everything except the target and the total Q-chat-10 score.
# The class label is defined as "score > 3", so leaving 'Qchat-10-Score' in X hands
# the model the answer directly (target leakage) and inflates the reported accuracy.
# NOTE(review): the score is the sum of A1-A10, so those items still determine the
# label; confirm whether they should be kept as predictors.
X = label_autism_data.drop(['Class/ASD Traits ', 'Qchat-10-Score'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The previous train_size=0.2 did the opposite: it fitted on 20% and tested on 80%.
# stratify=y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

### Finding best parameters for Random Forest Classifier using Grid Search

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Search grid: tree depth 1..10 and forest size 100..1000 in steps of 100
# (10 x 10 = 100 candidate settings).
params = dict(
    max_depth=np.arange(1, 11),
    n_estimators=np.arange(100, 1100, 100),
)

In [None]:
# Shuffled, stratified 5-fold cross-validation with a fixed seed for repeatable folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Base estimator; the grid search overrides max_depth and n_estimators per candidate.
forest = RandomForestClassifier(random_state=0)

# Exhaustive search over `params`, using all CPU cores and printing progress.
best_forest = GridSearchCV(
    estimator=forest,
    param_grid=params,
    cv=skf,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split (fit returns the fitted search object)
# and show the winning hyper-parameter combination.
best_forest.fit(X_train, y_train).best_params_

In [None]:
# The estimator refitted on the training data with the best parameters found.
best_forest.best_estimator_

In [None]:
# Mean cross-validated score of the best parameter combination.
best_forest.best_score_

### Accuracy of our model (Random Forest Classifier)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(
    y_true=y_test,
    y_pred=best_forest.predict(X_test),
)