In [1]:
import pandas as pd
import numpy as np

## Leitura do arquivo

In [2]:
data = pd.read_csv('breast_cancer_data.csv',sep=',')
data.head()

Unnamed: 0,1000025,5.0,1.0,1,1.1,2,1.2,3.0,1.0.1,1.3,benign,Dr. Doe
0,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
1,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
2,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
3,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
4,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith


### não temos nome de coluna, então precisamos fornecer esses nomes e passar como parâmetro na função read_csv

In [3]:
colunas = ['id','clump_thickness','cell_size_uniformity','cell_shape_uniformity','marginal_adhesion','single_ep_cell_size',
          'bare_nuclei','bland_chromatin','normal_nucleoli','mitoses','class','doctor_name']

data = pd.read_csv('breast_cancer_data.csv',names=colunas)
data.head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong


In [4]:
len(colunas)

12

### vamos entender nossos dados

In [5]:
data.dtypes

id                         int64
clump_thickness          float64
cell_size_uniformity     float64
cell_shape_uniformity      int64
marginal_adhesion          int64
single_ep_cell_size        int64
bare_nuclei               object
bland_chromatin          float64
normal_nucleoli          float64
mitoses                    int64
class                     object
doctor_name               object
dtype: object

In [6]:
data.describe()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bland_chromatin,normal_nucleoli,mitoses
count,699.0,698.0,698.0,699.0,699.0,699.0,695.0,698.0,699.0
mean,1071704.0,4.416905,3.137536,3.207439,2.793991,3.216023,3.447482,2.868195,1.589413
std,617095.7,2.817673,3.052575,2.971913,2.843163,2.2143,2.441191,3.055647,1.715078
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0
75%,1238298.0,6.0,5.0,5.0,3.5,4.0,5.0,4.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 12 columns):
id                       699 non-null int64
clump_thickness          698 non-null float64
cell_size_uniformity     698 non-null float64
cell_shape_uniformity    699 non-null int64
marginal_adhesion        699 non-null int64
single_ep_cell_size      699 non-null int64
bare_nuclei              697 non-null object
bland_chromatin          695 non-null float64
normal_nucleoli          698 non-null float64
mitoses                  699 non-null int64
class                    699 non-null object
doctor_name              699 non-null object
dtypes: float64(4), int64(5), object(3)
memory usage: 65.6+ KB


### com valores caategóricos, podemos agrupar

In [8]:
data.groupby(by =['class','doctor_name']).size()

class      doctor_name
benign     Dr. Doe        127
           Dr. Lee        121
           Dr. Smith      102
           Dr. Wong       108
malignant  Dr. Doe         58
           Dr. Lee         60
           Dr. Smith       74
           Dr. Wong        49
dtype: int64

### Lidando com valores faltantes

In [9]:
data.isna().sum()

id                       0
clump_thickness          1
cell_size_uniformity     1
cell_shape_uniformity    0
marginal_adhesion        0
single_ep_cell_size      0
bare_nuclei              2
bland_chromatin          4
normal_nucleoli          1
mitoses                  0
class                    0
doctor_name              0
dtype: int64

In [10]:
data = data.dropna(axis = 0, how = 'any')

### Lidando com valores duplicados

In [11]:
data[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
258,1198641,3.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Lee


In [12]:
data = data.drop_duplicates()

In [13]:
data[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name


In [14]:
repeat_patients = data.groupby(by = 'id').size().sort_values(ascending =False)
repeat_patients

id
1182404    6
1276091    5
1100524    2
1293439    2
654546     2
1114570    2
1299596    2
1299924    2
1115293    2
1116116    2
1116192    2
560680     2
1320077    2
493452     2
1321942    2
466906     2
1143978    2
1105524    2
1168736    2
411453     2
695091     2
704097     2
733639     2
734111     2
1070935    2
1212422    2
769612     2
1061990    2
798429     2
1218860    2
          ..
1205579    1
1205138    1
1204898    1
1204558    1
1204242    1
1203096    1
1202812    1
1202253    1
1202125    1
1201936    1
1208301    1
1211202    1
1217952    1
1211265    1
1217717    1
1217264    1
1217051    1
1216947    1
1216694    1
1214966    1
1214556    1
1214092    1
1213784    1
1213383    1
1213375    1
1213273    1
1212251    1
1212232    1
1211594    1
61634      1
Length: 637, dtype: int64

### Filtrando dados

In [16]:
filtered_patients = repeat_patients[repeat_patients > 2].to_frame().reset_index()
filtered_data = data[~data.id.isin(filtered_patients.id)]

In [17]:
filtered_data

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
5,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith
7,1018561,2.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
8,1033078,2.0,1.0,1,1,2,1,1.0,1.0,5,benign,Dr. Smith
9,1033078,4.0,2.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe
10,1035283,1.0,1.0,1,1,1,1,3.0,1.0,1,benign,Dr. Doe


In [18]:
repeat_patients = filtered_data.groupby(by = 'id').size().sort_values(ascending =False)
repeat_patients

id
1168736    2
1116192    2
411453     2
1143978    2
1218860    2
466906     2
798429     2
1321942    2
493452     2
1061990    2
1320077    2
769612     2
1212422    2
1070935    2
560680     2
1116116    2
385103     2
1115293    2
734111     2
1114570    2
1299924    2
1299596    2
733639     2
1198641    2
704097     2
695091     2
654546     2
1277792    2
1105524    2
1293439    2
          ..
1205579    1
1205138    1
1204898    1
1204558    1
1204242    1
1203096    1
1202812    1
1202253    1
1202125    1
1201936    1
1208301    1
1211202    1
1217952    1
1211265    1
1217717    1
1217264    1
1217051    1
1216947    1
1216694    1
1214966    1
1214556    1
1214092    1
1213784    1
1213383    1
1213375    1
1213273    1
1212251    1
1212232    1
1211594    1
61634      1
Length: 635, dtype: int64