In [1]:
### Import the libraries used to analyse the data and build the models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Read the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
"""
Now that the data is imported, it is time to understand it better: its shape,
size, features, and so on.
"""
# First look at the data: a sample of rows, the per-column summary, and the shape.
print(data.head(5))
print("----------------------______________________________________---------------------------------")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()
print("----------------------______________________________________---------------------------------")
print(data.shape)

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

      ...       texture_worst  perimeter_worst  area_worst  sm

In [4]:
"""
From the output above, the data has 569 entries and a total of 33 columns.
There are no missing values apart from the column "Unnamed: 32", which contains only NaN.
The columns "Unnamed: 32" and "id" are not needed for the prediction, so we
drop them ("id" in this cell, "Unnamed: 32" in the next one).
"""
# Remove the "id" column. drop() returns a new frame that is rebound to `data`;
# errors="ignore" keeps this cell safe to re-run once "id" is already gone
# (an in-place drop raises KeyError on a second execution).
data = data.drop(columns=["id"], errors="ignore")
print(data.columns)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float

In [5]:
# Remove the "Unnamed: 32" column. The result is rebound to `data` instead of
# mutating in place, and errors="ignore" lets the cell be re-run without a
# KeyError once the column has already been removed.
data = data.drop(columns=["Unnamed: 32"], errors="ignore")
print(data.columns)
print(data.shape)

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')
(569, 31)


In [6]:
# "diagnosis" is an object (string) column, so encode it numerically: 'M' -> 1, 'B' -> 0.
diagnosis_codes = {'M': 1, 'B': 0}

# Skip the mapping when the column already holds only 0/1. Series.map() turns
# every value missing from the dict into NaN, so re-running this cell on the
# already-encoded column would otherwise wipe out all the labels.
already_encoded = data['diagnosis'].isin(list(diagnosis_codes.values())).all()
if not already_encoded:
    encoded = data['diagnosis'].map(diagnosis_codes)
    # Fail loudly on unexpected labels instead of letting NaN flow into training.
    if encoded.isna().any():
        raise ValueError("diagnosis column contains labels other than 'M' and 'B'")
    data['diagnosis'] = encoded.astype('int64')

In [7]:
# Show the encoded target column.
print(data['diagnosis'])

0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     0
20     0
21     0
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
539    0
540    0
541    0
542    0
543    0
544    0
545    0
546    0
547    0
548    0
549    0
550    0
551    0
552    0
553    0
554    0
555    0
556    0
557    0
558    0
559    0
560    0
561    0
562    1
563    1
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64


In [8]:
# Preview the first five rows of the frame.
print(data.head(n=5))

   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean           ...             radius_worst  texture_worst  \
0         0.2419           ...            

In [9]:
# Take the target out of the frame: pop() returns the "diagnosis" column and
# removes it from `data` in the same step, leaving only the feature columns.
y = data.pop('diagnosis')

In [10]:
# Show the first 20 target values.
print(y.iloc[:20])

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    0
Name: diagnosis, dtype: int64


In [11]:
# Print the shape of the target first, then of the feature frame.
for table in (y, data):
    print(table.shape)

(569,)
(569, 30)


In [12]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=0,
)

In [13]:
# Print the shapes of the train features/labels, then the test features/labels.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(398, 30)
(398,)
(171, 30)
(171,)


In [14]:
# Standardise the features. The scaler learns its statistics from the training
# set only and the same fitted transform is then applied to the test set.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
###LOGISTIC REGRESSION

In [24]:
# Logistic regression with scikit-learn's default settings.
reg = LogisticRegression()
reg.fit(X_train, y_train)
model = reg  # fit() returns the estimator itself, so `model` is the fitted `reg`

In [25]:
# Predict labels for the held-out test features with the current `model`.
prediction = model.predict(X=X_test)

In [26]:
# Share of test samples that the logistic regression labels correctly.
reg_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print(reg_accuracy)
# observed accuracy: 0.9766081871345029

0.9766081871345029


In [27]:
# Confusion matrix for the logistic regression: rows are the true classes,
# columns the predicted ones (0 = 'B', 1 = 'M').
reg_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(reg_matrix)
# observed matrix:
# [[107   1]
#  [  3  60]]

[[107   1]
 [  3  60]]


###SVM

In [20]:
# Support-vector classifier with scikit-learn's default settings.
mac = SVC()
mac.fit(X_train, y_train)
model = mac  # fit() returns the estimator itself, so `model` is the fitted `mac`

In [21]:
# Predict labels for the held-out test features with the current `model`.
prediction = model.predict(X=X_test)

In [22]:
# Share of test samples that the SVM labels correctly.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print(svm_accuracy)
# observed accuracy: 0.9766081871345029


0.9766081871345029


In [23]:
# Confusion matrix for the SVM: rows are the true classes, columns the
# predicted ones (0 = 'B', 1 = 'M').
svm_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(svm_matrix)
# observed matrix:
# [[107   1]
#  [  3  60]]

[[107   1]
 [  3  60]]


###RANDOM FOREST CLASSIFIER

In [15]:
# Random forest of 100 trees. random_state is pinned so that the randomised
# training -- and therefore the reported accuracy and confusion matrix -- is
# the same on every run; the train/test split uses the same seed (0).
forest = RandomForestClassifier(n_estimators=100, random_state=0)

In [16]:
# Train the forest on the scaled training data.
forest.fit(X_train, y_train)
model = forest  # fit() returns the estimator itself, so `model` is the fitted `forest`

In [17]:
# Predict labels for the held-out test features with the current `model`.
prediction = model.predict(X=X_test)

In [18]:
# Share of test samples that the random forest labels correctly.
forest_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print(forest_accuracy)
# No single score is hard-coded here: the forest is randomised, so unless its
# random_state is fixed the accuracy shifts between runs (both
# 0.9649122807017544 and 0.9532163742690059 have been observed).

0.9532163742690059


In [19]:
# Confusion matrix for the random forest: rows are the true classes, columns
# the predicted ones (0 = 'B', 1 = 'M').
forest_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(forest_matrix)
# Read the matrix from the printed output rather than from a copied value:
# unless the forest's random_state is fixed it changes between runs (second
# rows of both [  2  61] and [  4  59] have been observed under [104   4]).

[[104   4]
 [  4  59]]
