In [249]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_curve,auc

In [39]:
df = pd.read_csv('breast_cancer.csv')
df.tail(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
564,926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [40]:
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

In [41]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [42]:
# df.describe()
# df.info()

In [90]:
corrmat = df.corr()  # correlation matrix
corrmat.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [91]:
corrmat_sorted = corrmat.sort_values(by='diagnosis', ascending=False)
corrmat_sorted

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,0.330499,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
concave points_worst,0.793566,0.744214,0.295316,0.771241,0.722017,0.503053,0.815573,0.861323,0.910155,0.430297,...,0.787424,0.359755,0.816322,0.747419,0.547691,0.80108,0.855434,1.0,0.502528,0.511114
perimeter_worst,0.782914,0.965137,0.35804,0.970387,0.95912,0.238853,0.59021,0.729565,0.855923,0.219169,...,0.993708,0.365098,1.0,0.977578,0.236775,0.529408,0.618344,0.816322,0.269493,0.138957
concave points_mean,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
radius_worst,0.776454,0.969539,0.352573,0.969476,0.962746,0.21312,0.535315,0.688236,0.830318,0.185728,...,1.0,0.359921,0.993708,0.984015,0.216574,0.47582,0.573975,0.787424,0.243529,0.093492
perimeter_mean,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_worst,0.733825,0.941082,0.343546,0.94155,0.959213,0.206718,0.509604,0.675987,0.80963,0.177193,...,0.984015,0.345842,0.977578,1.0,0.209145,0.438296,0.543331,0.747419,0.209146,0.079647
radius_mean,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
area_mean,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
concavity_mean,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493


In [92]:
cols = corrmat.nlargest(10, 'diagnosis')

In [93]:
y = df.values[:,0]
X = df.drop('diagnosis', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

clf = SGDClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [96]:
clf.coef_, clf.intercept_

(array([[-1.18371617e+03, -6.22290687e+02, -6.26584932e+03,
         -1.71283827e+03, -6.60769597e+00,  2.30231483e+01,
          4.69815380e+01,  1.87799533e+01, -1.27561936e+01,
         -6.75440859e+00,  2.96610977e+00,  2.65687333e+00,
          1.46996538e+02,  3.23185936e+03,  1.90603481e-01,
          6.54343921e+00,  9.78339976e+00,  2.23823498e+00,
          6.21672596e-01,  3.27871998e-01, -1.25632004e+03,
         -3.67732817e+02, -5.76832042e+03,  2.70048278e+03,
         -6.02745839e+00,  7.78315335e+01,  1.18718563e+02,
          3.05814452e+01, -5.95137213e+00, -1.42514293e+00]]),
 array([-1309.5133509]))

In [97]:
df.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [139]:
# complete code

df = pd.read_csv('breast_cancer.csv')
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

y = df.values[:,1]
X = df.drop('diagnosis', axis=1).values
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

clf = SGDClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.6228070175438597
0.6228070175438597


In [243]:
# remove 'id' column
df = pd.read_csv('breast_cancer.csv')
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

df = df.drop('id', axis=1)
X, y = df.values[:,1:], df.values[:,0]
# np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.predict(X_test))
clf.predict_proba(X_test)

# clf = SGDClassifier()
# clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))
# print(clf.predict(X_test))
# clf.predict_proba(X_test)

1.0
[0. 0. 0. 0. 0. 0.]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[0.987542  , 0.012458  ],
       [0.95748056, 0.04251944],
       [0.96775652, 0.03224348],
       [0.98531423, 0.01468577],
       [0.99744964, 0.00255036],
       [0.97513723, 0.02486277]])

In [244]:
predict = clf.predict(X_test)
prob_score = clf.predict_proba(X_test)

In [245]:
predict

array([0., 0., 0., 0., 0., 0.])

In [246]:
prob_score

array([[0.987542  , 0.012458  ],
       [0.95748056, 0.04251944],
       [0.96775652, 0.03224348],
       [0.98531423, 0.01468577],
       [0.99744964, 0.00255036],
       [0.97513723, 0.02486277]])

In [247]:
prob_score[:,1]

array([0.012458  , 0.04251944, 0.03224348, 0.01468577, 0.00255036,
       0.02486277])

In [248]:
df = pd.DataFrame()
df['predict'] = predict
df['prob'] = prob_score[:,1]
df

Unnamed: 0,predict,prob
0,0.0,0.012458
1,0.0,0.042519
2,0.0,0.032243
3,0.0,0.014686
4,0.0,0.00255
5,0.0,0.024863


In [255]:
# fpr = dict()
# tpr = dict()
# roc_curve = dict()
fpr, tpr, _ = roc_curve(y_test, df['prob'])  ###########
roc_auc = auc(fpr, tpr)

TypeError: 'dict' object is not callable