In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
  
# Read the CSV file.
data = pd.read_csv("CTG.csv", skiprows=1)

# Select the relevant numerical columns.
selected_cols = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
                 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
                 'Median', 'Variance', 'Tendency', 'NSP']
data = data[selected_cols].dropna()

# Shuffle the dataset.
data_shuffled = data.sample(frac=1.0, random_state=0)

# Split into input part X and output part Y.
X = data_shuffled.drop('NSP', axis=1)

# Map the diagnosis code to a human-readable label.
def to_label(y):
    return [None, 'normal', 'suspect', 'pathologic'][(int(y))]

Y = data_shuffled['NSP'].apply(to_label)

# Partition the data into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)


In [35]:
X.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
658,130.0,1.0,0.0,3.0,0.0,0.0,0.0,24.0,1.2,12.0,...,35.0,120.0,155.0,1.0,0.0,134.0,133.0,135.0,1.0,0.0
1734,134.0,9.0,1.0,8.0,5.0,0.0,0.0,59.0,1.2,0.0,...,109.0,80.0,189.0,6.0,0.0,150.0,146.0,150.0,33.0,0.0
1226,125.0,1.0,0.0,4.0,0.0,0.0,0.0,43.0,0.7,31.0,...,21.0,120.0,141.0,0.0,0.0,131.0,130.0,132.0,1.0,0.0
1808,143.0,0.0,0.0,1.0,0.0,0.0,0.0,69.0,0.3,6.0,...,27.0,132.0,159.0,1.0,0.0,145.0,144.0,146.0,1.0,0.0
825,152.0,0.0,0.0,4.0,0.0,0.0,0.0,62.0,0.4,59.0,...,25.0,136.0,161.0,0.0,0.0,159.0,156.0,158.0,1.0,1.0


In [36]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

clf = DummyClassifier(strategy='most_frequent')
scores = cross_val_score(clf, Xtrain, Ytrain)
np.mean(scores)


0.7805882352941176

In [37]:
# Tree based
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("DecisionTreeClassifier")
print(np.mean(scores))

clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("RandomForestClassifier")
print(np.mean(scores))

clf = GradientBoostingClassifier(random_state=0, n_estimators=200, max_depth=4)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("GradientBoostingClassifier")
print(np.mean(scores))



DecisionTreeClassifier
0.9241176470588235
RandomForestClassifier
0.9429411764705883
GradientBoostingClassifier
0.9523529411764706


In [38]:
# Linear

from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


clf = Perceptron(random_state=0, tol=1e-3)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("Perceptron")
print(np.mean(scores))

clf = LogisticRegression(random_state=0)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("LogisticRegression")
print(np.mean(scores))

clf = LinearSVC(random_state=0)
scores = cross_val_score(clf, Xtrain, Ytrain)
print("LinearSVC")
print(np.mean(scores))




Perceptron
0.825294117647059


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression
0.8741176470588234




LinearSVC
0.8452941176470589




In [39]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(random_state=0, max_iter=300, hidden_layer_sizes=(100, 100))
scores = cross_val_score(clf, Xtrain, Ytrain)
print("MLPClassifier")
print(np.mean(scores))



MLPClassifier
0.8829411764705883


In [40]:
from sklearn.metrics import accuracy_score

clf = GradientBoostingClassifier(random_state=0, n_estimators=200, max_depth=4)

clf.fit(Xtrain, Ytrain)
Yguess = clf.predict(Xtest)
print(accuracy_score(Ytest, Yguess))

0.9342723004694836


**Answer**
The chosen classifier is the GradientBoostingClassifier which leverages the strengths of multiple decision trees and sequentially corrects errors, therefore it is capable of being finely tuned to specific datasets and problems.

The accuracy score found was approximately 0.95.
