1. Import Packages

In [127]:
import pandas as pd                                                       
import numpy as np                                                      
from xgboost import XGBClassifier                                                     
from sklearn.tree import  DecisionTreeClassifier                                                     
from sklearn.svm import SVC                                                   
from sklearn.ensemble import  RandomForestClassifier, GradientBoostingClassifier                                                     
from sklearn.linear_model import LogisticRegression  
from sklearn.model_selection import train_test_split                                                     
from sklearn.metrics import  mean_squared_error, mean_absolute_error, accuracy_score, confusion_matrix                                                      
print("All Packages Imported")

All Packages Imported


2. Read CSV Files.

In [128]:
df = pd.read_csv("wisconsin_breast_cancer.csv")
df

Unnamed: 0,id,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis,class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,0
695,841769,2,1,1,1,2,1.0,1,1,1,0
696,888820,5,10,10,3,7,3.0,8,10,2,1
697,897471,4,8,6,4,3,4.0,10,6,1,1


3. Check Information.

In [129]:
df.info


<bound method DataFrame.info of           id  thickness  size  shape  adhesion  single  nuclei  chromatin  \
0    1000025          5     1      1         1       2     1.0          3   
1    1002945          5     4      4         5       7    10.0          3   
2    1015425          3     1      1         1       2     2.0          3   
3    1016277          6     8      8         1       3     4.0          3   
4    1017023          4     1      1         3       2     1.0          3   
..       ...        ...   ...    ...       ...     ...     ...        ...   
694   776715          3     1      1         1       3     2.0          1   
695   841769          2     1      1         1       2     1.0          1   
696   888820          5    10     10         3       7     3.0          8   
697   897471          4     8      6         4       3     4.0         10   
698   897471          4     8      8         5       4     5.0         10   

     nucleoli  mitosis  class  
0          

4. Check duplicated values

In [130]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
694    False
695    False
696    False
697    False
698    False
Length: 699, dtype: bool

5. View Statical Data.

In [131]:

df.describe()

Unnamed: 0,id,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis,class
count,699.0,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,0.344778
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078,0.475636
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


6. Check Datatypes.

In [132]:
df.dtypes

id             int64
thickness      int64
size           int64
shape          int64
adhesion       int64
single         int64
nuclei       float64
chromatin      int64
nucleoli       int64
mitosis        int64
class          int64
dtype: object

7. Check Datasets Missing Values.

In [133]:
missing_percent = df.isnull().mean() * 100

print(missing_percent)

id           0.000000
thickness    0.000000
size         0.000000
shape        0.000000
adhesion     0.000000
single       0.000000
nuclei       2.288984
chromatin    0.000000
nucleoli     0.000000
mitosis      0.000000
class        0.000000
dtype: float64


In [134]:
df.isnull().sum()

id            0
thickness     0
size          0
shape         0
adhesion      0
single        0
nuclei       16
chromatin     0
nucleoli      0
mitosis       0
class         0
dtype: int64

8. Change Columns Lower To Upper Case.

In [135]:
df.columns = df.columns.str.replace("_", " ").str.title()

9. Drop Missing Columns.

In [136]:
data = df.dropna()

In [137]:
df.columns


Index(['Id', 'Thickness', 'Size', 'Shape', 'Adhesion', 'Single', 'Nuclei',
       'Chromatin', 'Nucleoli', 'Mitosis', 'Class'],
      dtype='object')

10. Change The Class type to Malignant, Begin.

In [138]:
df['Class'] = df['Class'].replace({1 : 'Malignant', 0 : 'Benign'})

In [139]:
df['Class'].value_counts()

Class
Benign       458
Malignant    241
Name: count, dtype: int64

In [150]:
df.head(2)

Unnamed: 0,Id,Thickness,Size,Shape,Adhesion,Single,Nuclei,Chromatin,Nucleoli,Mitosis,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,Benign
1,1002945,5,4,4,5,7,10.0,3,2,1,Benign


11. Split it X and Y

In [140]:
X = df.drop(['Id','Class'], axis=1)
y = df['Class']

In [141]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=42)

12. Model Selection, Train, Predictions.

In [142]:
model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(x_train, y_train)
predict = model.predict(x_test)

13. Check Model Accuracy.

In [143]:
print("Accuracy Score: ",accuracy_score(y_test, predict))
print("Confusion Matrix: ",confusion_matrix(y_test, predict))


Accuracy Score:  0.9591836734693877
Confusion Matrix:  [[313   7]
 [ 13 157]]


14. Actual VS Predict.

In [144]:
result = pd.DataFrame({
    'Actual' : y_test,
    'Predict' : predict
})

result.iloc[0:20]

Unnamed: 0,Actual,Predict
158,Benign,Benign
499,Benign,Benign
396,Benign,Benign
155,Malignant,Malignant
321,Benign,Benign
212,Benign,Benign
234,Benign,Benign
289,Malignant,Malignant
300,Malignant,Malignant
356,Malignant,Benign


# Decision Tree Model.

1. Model Selection, Train, Predictions.

In [146]:
model_decission = DecisionTreeClassifier()
model_decission.fit(x_train, y_train)
predict_decission = model_decission.predict(x_test)

2. Check Model Accuracy.

In [145]:
print(accuracy_score(y_test, predict_decission))
print(confusion_matrix(y_test, predict_decission))

0.9510204081632653
[[315   5]
 [ 19 151]]


3. Actual VS Predict.

In [147]:
result_decission = pd.DataFrame({
    'Actual' : y_test,
    'Predict' : predict
})

result_decission.iloc[0:30]

Unnamed: 0,Actual,Predict
158,Benign,Benign
499,Benign,Benign
396,Benign,Benign
155,Malignant,Malignant
321,Benign,Benign
212,Benign,Benign
234,Benign,Benign
289,Malignant,Malignant
300,Malignant,Malignant
356,Malignant,Benign
