In [1]:
import pandas as pd

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
from sklearn import preprocessing

# Read the dataset as comma-separated text, ignoring spaces after delimiters.
df = pd.read_csv("Autism_Data.arff", sep = r',', skipinitialspace = True)

# Fill missing cells with 0 and discard the two columns left out of the analysis.
df = df.fillna(0).drop(columns = ["A7_Score", "used_app_before"])

# Correct the misspelled column headers.
df = df.rename(columns = {"austim": "autism", "contry_of_res": "country"})

# Turn the textual flags into numbers in a single pass over the frame.
# NOTE(review): "?" presumably marks a missing entry; mapping it to 0 applies to
# every column that contains it — confirm 0 is an acceptable placeholder.
df = df.replace({"yes": 1, "no": 0, "f": 1, "m": 0, "YES": 1, "NO": 0, "?": 0})

# Label-encode the free-text columns. Each column gets its own fitted encoder,
# kept in `encoders`, so the integer codes can be mapped back to the original
# labels later (e.g. encoders["country"].inverse_transform(...)).
encoders = {}
for column in ["ethnicity", "country", "age_desc", "relation"]:
    number = preprocessing.LabelEncoder()
    # Cast to str first so every value in the column has a single type.
    df[column] = number.fit_transform(df[column].astype(str))
    encoders[column] = number
# `number` is left bound to the encoder fitted on "relation", as it was before.

df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,autism,country,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,0,0,26,1,10,0,0,10,6,0,5,0
1,1,1,0,1,0,0,1,0,1,24,0,6,0,1,25,5,0,5,0
2,1,1,0,1,1,0,1,1,1,27,0,6,1,1,61,8,0,3,1
3,1,1,0,1,0,0,1,0,1,35,1,10,0,1,10,6,0,5,0
4,1,0,0,0,0,0,1,0,0,40,1,2,0,0,32,2,0,1,0


In [4]:
# Sanity check: dimensions of the preprocessed frame as (rows, columns).
df.shape

(704, 19)

In [5]:
from sklearn.model_selection import train_test_split

# Target: the "autism" column; every remaining column is used as a feature.
# NOTE(review): "autism" is the renamed "austim" column, while "Class/ASD"
# stays among the features — confirm that this is the intended label.
y = df["autism"].values
X = df.drop(columns = ["autism"]).values

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 0
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the liblinear solver.
model = LogisticRegression(solver = "liblinear").fit(X_train, y_train)

# Accuracy on the held-out rows.
print(model.score(X_test, y_test))

0.8977272727272727


In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the newton-cg solver.
# The `multi_class` argument is intentionally not passed: scikit-learn has
# deprecated it (since 1.5) and schedules it for removal, and the target here
# appears to be two-class (0/1), where a multinomial formulation adds nothing.
model = LogisticRegression(solver = "newton-cg")
model.fit(X_train, y_train)

# Accuracy on the held-out rows.
print(model.score(X_test, y_test))

0.8920454545454546


In [9]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree limited to depth 5.
# random_state is pinned (same seed as the train/test split) because the
# estimator permutes features at each split; without it the score can differ
# between runs.
model = DecisionTreeClassifier(criterion = "entropy", max_depth = 5, random_state = 0)
model.fit(X_train, y_train)

# Accuracy on the held-out rows.
print(model.score(X_test, y_test))

0.8636363636363636


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest using the entropy split criterion.
# random_state is pinned (same seed as the train/test split) so that the
# bootstrap samples and feature subsets, and therefore the score, are
# reproducible across runs.
model = RandomForestClassifier(criterion = "entropy", random_state = 0)
model.fit(X_train, y_train)

# Accuracy on the held-out rows, shown as the cell's output.
model.score(X_test, y_test)

0.8863636363636364

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 15 closest training rows.
model = KNeighborsClassifier(n_neighbors = 15).fit(X_train, y_train)

# Accuracy on the held-out rows.
print(model.score(X_test, y_test))

0.9090909090909091


In [12]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier with regularisation strength C = 0.1.
# No `gamma` is passed: that coefficient only applies to the rbf, poly and
# sigmoid kernels, so it has no effect on a linear kernel.
model = SVC(kernel = "linear", C = 0.1)
model.fit(X_train, y_train)

# Accuracy on the held-out rows.
print(model.score(X_test, y_test))

0.9090909090909091
