In [22]:
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix

In [18]:
# Hyperparameter search space handed to GridSearchCV for the decision tree.
# Every combination of the listed values is a candidate (4 * 3 * 3 * 3 = 108).
param_grid = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2', None],
)

In [19]:
# 5-fold cross-validated grid search over `param_grid` for a decision tree.
# The base estimator is seeded with random_state=42 (the seed used throughout
# this notebook): tree construction involves random feature selection and
# tie-breaking, so an unseeded estimator makes the search results vary from
# run to run.
# verbose=2 prints per-fit progress; n_jobs=-1 uses all available CPU cores.
# NOTE(review): grid_search is only constructed here; no fit() call appears in
# the visible cells.
grid_search = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [2]:
# Disabled CSV load; `leukemia` is read from the pickle file in the next cell instead.
#leukemia = pd.read_csv("leukemia_frame.csv")

In [3]:
# Load the leukemia frame from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
leukemia_pickle_path = "leukemia.pkl"
with open(leukemia_pickle_path, mode="rb") as pickle_file:
    leukemia = pickle.load(pickle_file)

In [4]:
# Preview the first five rows of the freshly loaded frame.
leukemia.head(n=5)

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel50168,pixel50169,pixel50170,pixel50171,pixel50172,pixel50173,pixel50174,pixel50175,pixel50176,diagnosis
0,0.003922,0.003922,0.003922,0.003922,0.003922,0.007843,0.0,0.003922,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,benign
1,0.031373,0.388235,0.035294,0.019608,0.003922,0.0,0.007843,0.003922,0.015686,0.007843,...,0.019608,0.556863,0.745098,0.764706,0.760784,0.772549,0.745098,0.733333,0.709804,benign
2,0.003922,0.007843,0.003922,0.003922,0.015686,0.011765,0.023529,0.498039,0.513725,0.505882,...,0.015686,0.003922,0.007843,0.003922,0.011765,0.003922,0.003922,0.003922,0.007843,benign
3,0.007843,0.003922,0.011765,0.058824,0.05098,0.031373,0.007843,0.015686,0.003922,0.015686,...,0.282353,0.105882,0.258824,0.019608,0.007843,0.003922,0.011765,0.007843,0.003922,benign
4,0.011765,0.035294,0.541176,0.541176,0.470588,0.494118,0.478431,0.494118,0.345098,0.035294,...,0.027451,0.011765,0.0,0.129412,0.027451,0.003922,0.015686,0.007843,0.011765,benign


In [5]:
# Randomise the row order with a fixed seed so the ordering is reproducible.
# The result overwrites `leukemia`, so re-running this cell shuffles the
# already-shuffled frame again.
leukemia = shuffle(leukemia, random_state=42)

In [6]:
# Preview the first five rows again to confirm the rows were reordered.
leukemia.head(n=5)

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel50168,pixel50169,pixel50170,pixel50171,pixel50172,pixel50173,pixel50174,pixel50175,pixel50176,diagnosis
2351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.627451,0.666667,0.627451,0.596078,0.643137,0.639216,0.505882,0.023529,0.003922,pre
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,benign
2298,0.015686,0.184314,0.647059,0.647059,0.639216,0.72549,0.717647,0.709804,0.643137,0.627451,...,0.003922,0.007843,0.015686,0.015686,0.019608,0.192157,0.109804,0.062745,0.509804,pre
1610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pre
990,0.07451,0.015686,0.015686,0.023529,0.019608,0.003922,0.015686,0.0,0.027451,0.129412,...,0.003922,0.003922,0.0,0.003922,0.007843,0.007843,0.011765,0.388235,0.011765,early


In [7]:
# Separate the target column from the feature columns.
# y_label holds the "diagnosis" column; X holds every remaining column.
y_label = leukemia["diagnosis"]
X = leukemia.drop("diagnosis", axis=1)

In [8]:
# One-hot encode the diagnosis labels into one float column per class.
# NOTE(review): a 2-D one-hot target presumably makes the downstream tree a
# multi-output model rather than a single multiclass one — confirm that this
# is intended instead of fitting on `y_label` directly.
one_hot_y = pd.get_dummies(data=y_label, dtype=float)
one_hot_y

Unnamed: 0,benign,early,pre,pro
2351,0.0,0.0,1.0,0.0
134,1.0,0.0,0.0,0.0
2298,0.0,0.0,1.0,0.0
1610,0.0,0.0,1.0,0.0
990,0.0,1.0,0.0,0.0
...,...,...,...,...
1095,0.0,1.0,0.0,0.0
1130,0.0,1.0,0.0,0.0
1294,0.0,1.0,0.0,0.0
860,0.0,1.0,0.0,0.0


In [9]:
# Convert the feature frame and the one-hot target frame to NumPy arrays
# for use with scikit-learn.
X_array = np.array(X)
one_hot_y_array = np.array(one_hot_y)

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_array,
    one_hot_y_array,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Convert the dense training features to SciPy CSR format for fitting.
# NOTE(review): X_test stays dense and is passed to predict() as-is later —
# confirm this asymmetry is intended.
X_train_sparse = csr_matrix(X_train)

In [25]:
# Print the stored (row, column) -> value entries of the CSR matrix as a
# quick visual check of the conversion.
print(X_train_sparse)

  (0, 80)	0.0117647058823529
  (0, 81)	0.0078431372549019
  (0, 83)	0.0078431372549019
  (0, 84)	0.0078431372549019
  (0, 85)	0.0078431372549019
  (0, 86)	0.0039215686274509
  (0, 87)	0.0039215686274509
  (0, 88)	0.0078431372549019
  (0, 89)	0.0039215686274509
  (0, 90)	0.0039215686274509
  (0, 91)	0.0039215686274509
  (0, 92)	0.0039215686274509
  (0, 93)	0.0235294117647058
  (0, 94)	0.7294117647058823
  (0, 95)	0.6823529411764706
  (0, 96)	0.6666666666666666
  (0, 97)	0.6784313725490196
  (0, 98)	0.6588235294117647
  (0, 99)	0.6392156862745098
  (0, 100)	0.6274509803921569
  (0, 101)	0.6745098039215687
  (0, 102)	0.6980392156862745
  (0, 103)	0.6862745098039216
  (0, 104)	0.6745098039215687
  (0, 105)	0.6627450980392157
  :	:
  (2603, 50151)	0.0078431372549019
  (2603, 50152)	0.0352941176470588
  (2603, 50153)	0.3294117647058823
  (2603, 50154)	0.1019607843137254
  (2603, 50155)	0.0274509803921568
  (2603, 50156)	0.3333333333333333
  (2603, 50157)	0.0352941176470588
  (2603, 50158)	0.

In [None]:
# Baseline decision tree: only random_state is set (seed 42, for
# reproducibility); every other hyperparameter is left at its default.
tree_classifier = tree.DecisionTreeClassifier(random_state=42)

In [29]:
# Train the baseline tree on the sparse training features and the one-hot
# training targets; the value returned by fit() is kept as `tree_model`.
tree_model = tree_classifier.fit(X=X_train_sparse, y=y_train)

In [30]:
# Predict targets for the held-out test features (the dense X_test array
# produced by train_test_split).
y_hat = tree_model.predict(X_test)

In [31]:
# Score the predictions against the held-out targets.
# NOTE(review): y_test is one-hot encoded, so accuracy_score presumably runs in
# its multilabel mode (a row counts only if every column matches) — confirm
# this is the intended metric.
acc = accuracy_score(y_true=y_test, y_pred=y_hat)

In [33]:
# Display the test-set accuracy stored in `acc`.
acc

0.5230061349693251