In [1]:
!pip install numpy
!pip install pandas



In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix


In [6]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ['ID', 'Diagnosis'] + [f'Feature_{i}' for i in range(1, 31)]
data = pd.read_csv(url, header=None, names=columns)


In [7]:
data.head()

Unnamed: 0,ID,Diagnosis,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_21,Feature_22,Feature_23,Feature_24,Feature_25,Feature_26,Feature_27,Feature_28,Feature_29,Feature_30
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
data.tail()

Unnamed: 0,ID,Diagnosis,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_21,Feature_22,Feature_23,Feature_24,Feature_25,Feature_26,Feature_27,Feature_28,Feature_29,Feature_30
564,926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [9]:
data.shape

(569, 32)

In [10]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          569 non-null    int64  
 1   Diagnosis   569 non-null    object 
 2   Feature_1   569 non-null    float64
 3   Feature_2   569 non-null    float64
 4   Feature_3   569 non-null    float64
 5   Feature_4   569 non-null    float64
 6   Feature_5   569 non-null    float64
 7   Feature_6   569 non-null    float64
 8   Feature_7   569 non-null    float64
 9   Feature_8   569 non-null    float64
 10  Feature_9   569 non-null    float64
 11  Feature_10  569 non-null    float64
 12  Feature_11  569 non-null    float64
 13  Feature_12  569 non-null    float64
 14  Feature_13  569 non-null    float64
 15  Feature_14  569 non-null    float64
 16  Feature_15  569 non-null    float64
 17  Feature_16  569 non-null    float64
 18  Feature_17  569 non-null    float64
 19  Feature_18  569 non-null    f

In [12]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,569.0,30371830.0,125020600.0,8670.0,869218.0,906024.0,8813129.0,911320500.0
Feature_1,569.0,14.12729,3.524049,6.981,11.7,13.37,15.78,28.11
Feature_2,569.0,19.28965,4.301036,9.71,16.17,18.84,21.8,39.28
Feature_3,569.0,91.96903,24.29898,43.79,75.17,86.24,104.1,188.5
Feature_4,569.0,654.8891,351.9141,143.5,420.3,551.1,782.7,2501.0
Feature_5,569.0,0.09636028,0.01406413,0.05263,0.08637,0.09587,0.1053,0.1634
Feature_6,569.0,0.104341,0.05281276,0.01938,0.06492,0.09263,0.1304,0.3454
Feature_7,569.0,0.08879932,0.07971981,0.0,0.02956,0.06154,0.1307,0.4268
Feature_8,569.0,0.04891915,0.03880284,0.0,0.02031,0.0335,0.074,0.2012
Feature_9,569.0,0.1811619,0.02741428,0.106,0.1619,0.1792,0.1957,0.304


In [13]:
data.duplicated().sum()

0

In [14]:
data.isnull().sum()

ID            0
Diagnosis     0
Feature_1     0
Feature_2     0
Feature_3     0
Feature_4     0
Feature_5     0
Feature_6     0
Feature_7     0
Feature_8     0
Feature_9     0
Feature_10    0
Feature_11    0
Feature_12    0
Feature_13    0
Feature_14    0
Feature_15    0
Feature_16    0
Feature_17    0
Feature_18    0
Feature_19    0
Feature_20    0
Feature_21    0
Feature_22    0
Feature_23    0
Feature_24    0
Feature_25    0
Feature_26    0
Feature_27    0
Feature_28    0
Feature_29    0
Feature_30    0
dtype: int64

In [16]:
numerical_cols = data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = data.select_dtypes(include='object').columns.tolist()
print(data[numerical_cols])
print(data[categorical_cols])


           ID  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0      842302      17.99      10.38     122.80     1001.0    0.11840   
1      842517      20.57      17.77     132.90     1326.0    0.08474   
2    84300903      19.69      21.25     130.00     1203.0    0.10960   
3    84348301      11.42      20.38      77.58      386.1    0.14250   
4    84358402      20.29      14.34     135.10     1297.0    0.10030   
..        ...        ...        ...        ...        ...        ...   
564    926424      21.56      22.39     142.00     1479.0    0.11100   
565    926682      20.13      28.25     131.20     1261.0    0.09780   
566    926954      16.60      28.08     108.30      858.1    0.08455   
567    927241      20.60      29.33     140.10     1265.0    0.11780   
568     92751       7.76      24.54      47.92      181.0    0.05263   

     Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_21  Feature_22  \
0      0.27760    0.30010    0.14710     0.2419  ...   