In [394]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [377]:
df = pd.read_csv("breast-cancer.data", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [378]:
cols = ['class','age', 'menopause','tumor_size','inv-nodes','node-capes','deg-malig','breast','breast-quad', 'irradiant']
df.columns = cols
df.head()

Unnamed: 0,class,age,menopause,tumor_size,inv-nodes,node-capes,deg-malig,breast,breast-quad,irradiant
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [379]:
df.dtypes

class          object
age            object
menopause      object
tumor_size     object
inv-nodes      object
node-capes     object
deg-malig       int64
breast         object
breast-quad    object
irradiant      object
dtype: object

In [380]:
df.isnull().sum()

class          0
age            0
menopause      0
tumor_size     0
inv-nodes      0
node-capes     0
deg-malig      0
breast         0
breast-quad    0
irradiant      0
dtype: int64

In [381]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0], test_size= 0.2, random_state=40)
print(x_train.shape)
print(x_test.shape)

(228, 9)
(58, 9)


In [382]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers= [('trf1', OrdinalEncoder(categories= [['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44','45-49', '50-54', '55-59'],
                                 ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26']]), [ 'tumor_size', 'inv-nodes']),
                        ('trf2', OneHotEncoder(sparse=False, drop = 'first'), ['age','menopause','node-capes','breast','breast-quad','irradiant'])],
                                remainder= 'passthrough')


In [383]:

x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

print(x_train.shape, x_test.shape)



(228, 19) (58, 19)




In [384]:
le = LabelEncoder()
le.fit_transform(y_train)
le.fit_transform(y_test)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0])

In [385]:
model = Perceptron()
model.fit(x_train, y_train)

In [386]:
print(x_train.shape)
print(x_test.shape)

y_pred = model.predict(x_test)

(228, 19)
(58, 19)


In [387]:
accuracy_score(y_test, y_pred)

0.7241379310344828

In [391]:
model_2 = LogisticRegression()
model_2.fit(x_train, y_train)

In [392]:
print(x_train.shape)
print(x_test.shape)

y_pred = model_2.predict(x_test)

(228, 19)
(58, 19)


In [393]:
accuracy_score(y_test, y_pred)

0.7068965517241379

In [395]:
model_3 = tree.DecisionTreeClassifier()
model_3.fit(x_train, y_train)
y_pred = model_3.predict(x_test)
accuracy_score(y_test, y_pred)

0.6896551724137931