# Can I use a model that predicts the presence of ASD upon the inspection of genes?

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the gene table from the CSV file that sits next to the notebook.
genes = pd.read_csv(filepath_or_buffer='./sfari_genes.csv')

## The status column is useless and so we can drop it!

In [3]:
# Remove the 'status' column; the result is rebound to `genes`.
genes = genes.drop(columns=['status'])

## Because the gene symbol is unique across all 1031 rows, it carries no predictive information, so we drop gene-symbol. We also drop number-of-reports because it will not help us in this case.

In [4]:
# Remove the 'gene-symbol' identifier column.
genes = genes.drop(columns=['gene-symbol'])

In [5]:
# Remove the 'number-of-reports' column.
genes = genes.drop(columns=['number-of-reports'])

## Same with gene name and ensembl id

In [6]:
# Remove the 'gene-name' column.
genes = genes.drop(columns=['gene-name'])

In [7]:
# Remove the 'ensembl-id' column.
genes = genes.drop(columns=['ensembl-id'])

## Dropping 2 rows where the genetic category is null because it is crucial to analysis

In [8]:
# Drop every row that still has a missing value in any remaining column.
# The result is rebound (no inplace=True) so this cell follows the same
# "new frame assigned to `genes`" pattern as the column drops before it.
genes = genes.dropna()

## Dummying the columns

In [9]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
categorical_columns = ['chromosome', 'genetic-category', 'gene-score']
genes = pd.get_dummies(genes, columns=categorical_columns)


## Getting the features and labels. 
## The features are all of those except the syndromic column.
## The labels are those in the syndromic column.

In [10]:
# Target: the raw values of the 'syndromic' column.
labels = genes['syndromic'].values
# Predictors: every remaining column except the target.
features = genes.drop(columns=['syndromic'])

## Getting the count of our labels: there are 790 0's and 239 1's.

In [11]:
# Class balance of the target column (displayed as the cell output).
genes.loc[:, 'syndromic'].value_counts()

0    790
1    239
Name: syndromic, dtype: int64

## Creating our X and y variables.

In [12]:
# Rescale every feature column to the range [-1, 1].
# NOTE(review): the scaler is fit on the full feature matrix, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(features)

# The target vector is used unchanged.
y = labels

## Model Baseline

In [13]:
# Mean of the label vector. With 0/1 labels this is the share of 1's, so a
# model that always predicts 0 would reach an accuracy of 1 minus this value.
np.mean(y)

0.23226433430515064

## Train test split

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [15]:
# Fit a gradient-boosted tree classifier with default hyperparameters on the
# training split.
# The former trailing `use_label_encoder = False` line was removed: it was a
# plain assignment executed after fitting, so it only created an unused global
# and never reached XGBClassifier.
model = XGBClassifier()
model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before





## Calculating accuracy for the model

In [16]:
# Predict labels for the held-out rows and report accuracy as a percentage.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy * 100)

94.1747572815534


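## With about 94% accuracy on the test split (compared with about 77% for always predicting the majority class), the model predicts whether a gene in this table is labelled syndromic from its remaining annotations. Note that this is a per-gene label: the result does not by itself show that the presence of ASD can be predicted by inspecting someone's genes.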