## Data Wrangling 

[ Datawrangline ] (https://towardsdatascience.com/data-wrangling-with-pandas-5b0be151df4e)

### Data exploration

In [9]:
import pandas as pd
file = './pandas_data_wrangling/data/breast_cancer_data.csv'
df = pd.read_csv(file)
df.dtypes



patient_id                 int64
clump_thickness          float64
cell_size_uniformity     float64
cell_shape_uniformity      int64
marginal_adhesion          int64
single_ep_cell_size        int64
bare_nuclei               object
bland_chromatin          float64
normal_nucleoli          float64
mitoses                    int64
class                     object
doctor_name               object
dtype: object

In [10]:
df.describe()

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bland_chromatin,normal_nucleoli,mitoses
count,699.0,698.0,698.0,699.0,699.0,699.0,695.0,698.0,699.0
mean,1071704.0,4.416905,3.137536,3.207439,2.793991,3.216023,3.447482,2.868195,1.589413
std,617095.7,2.817673,3.052575,2.971913,2.843163,2.2143,2.441191,3.055647,1.715078
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0
75%,1238298.0,6.0,5.0,5.0,3.5,4.0,5.0,4.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [17]:
df.groupby(['class','doctor_name']).size() # dataframe.size() returns the count of rows

class      doctor_name
benign     Dr. Doe        127
           Dr. Lee        121
           Dr. Smith      102
           Dr. Wong       108
malignant  Dr. Doe         58
           Dr. Lee         60
           Dr. Smith       74
           Dr. Wong        49
dtype: int64

In [35]:
df.isna().sum() # finding missing values 
df.shape

(690, 12)

In [20]:
df = df.dropna(axis =0, how = 'anfy' )  # droppibng row by axis = 0 (row axis)

In [22]:
df.nunique()

patient_id               637
clump_thickness           10
cell_size_uniformity      10
cell_shape_uniformity     10
marginal_adhesion         10
single_ep_cell_size       10
bare_nuclei               11
bland_chromatin           10
normal_nucleoli           10
mitoses                    9
class                      2
doctor_name                4
dtype: int64

In [24]:
df[df.duplicated(subset = 'patient_id', keep =False)].sort_values('patient_id')


Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
267,320675,3.0,3.0,5,2,3,10,7.0,1.0,1,malignant,Dr. Wong
272,320675,3.0,3.0,5,2,3,10,7.0,1.0,1,malignant,Dr. Smith
575,385103,5.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
269,385103,1.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
271,411453,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Wong
607,411453,1.0,1.0,1,1,2,1,1.0,1.0,1,benign,Dr. Smith
684,466906,1.0,1.0,1,1,2,1,1.0,1.0,1,benign,Dr. Wong
683,466906,1.0,1.0,1,1,2,1,1.0,1.0,1,benign,Dr. Lee
371,493452,1.0,1.0,3,1,2,1,1.0,1.0,1,benign,Dr. Smith
372,493452,4.0,1.0,2,1,2,1,2.0,1.0,1,benign,Dr. Wong


In [26]:
repeat_patients = df.groupby(by = 'patient_id').size().sort_values(ascending =False)
repeat_patients

patient_id
1182404    6
1276091    5
1198641    3
1033078    2
1238777    2
1143978    2
466906     2
1321942    2
493452     2
1320077    2
560680     2
1116192    2
1116116    2
1115293    2
1299924    2
1299596    2
1114570    2
654546     2
1293439    2
1105524    2
1100524    2
411453     2
695091     2
704097     2
1017023    2
733639     2
734111     2
1070935    2
1212422    2
769612     2
          ..
1205579    1
1205138    1
1204898    1
1204558    1
1204242    1
1203096    1
1202812    1
1202253    1
1202125    1
1201936    1
1208301    1
1211202    1
1217952    1
1211265    1
1217717    1
1217264    1
1217051    1
1216947    1
1216694    1
1214966    1
1214556    1
1214092    1
1213784    1
1213383    1
1213375    1
1213273    1
1212251    1
1212232    1
1211594    1
61634      1
Length: 637, dtype: int64

In [36]:
filtered_patients = repeat_patients[repeat_patients > 2].to_frame().reset_index()
filtered_patients

Unnamed: 0,patient_id,0
0,1182404,6
1,1276091,5
2,1198641,3


In [34]:
filtered_df = df[~df.patient_id.isin(filtered_patients.patient_id)]
filtered_df.shape

(676, 12)

## Reshaping

In [44]:
categorical_df = df[['patient_id','doctor_name']]
categorical_df['doctor_count'] = 1 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
doctors_one_hot_encoded = pd.pivot_table( categorical_df,
                                  index = categorical_df.index, 
                                  columns = ['doctor_name'], 
                                   values = ['doctor_count'] )
doctors_one_hot_encoded = doctors_one_hot_encoded.fillna(0)


In [47]:
doctors_one_hot_encoded.columns = doctors_one_hot_encoded.columns.droplevel()
doctors_one_hot_encoded

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.


[duplicated](https://www.geeksforgeeks.org/python-pandas-dataframe-duplicated/)