In [1]:
!pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Data Processing and Exploratory Data Analysis

In [4]:
import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Read the kidney dataset from the CSV file (relative path) into a DataFrame.
kidney_df = pd.read_csv("kidney_data.csv")

# Display the last ten rows as this cell's output.
kidney_df.tail(n=10)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,Itching,QualityOfLifeScore,HeavyMetalsExposure,OccupationalExposureChemicals,WaterQuality,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy,Diagnosis,DoctorInCharge
1649,1650,89,0,0,1,2,22.859626,1,12.654568,2.130131,...,8.344253,70.945313,0,0,1,2.002629,9.742499,4.646004,0,Confidential
1650,1651,32,0,0,1,3,35.253136,0,4.841773,1.037494,...,1.771158,65.378786,0,0,0,0.191474,2.812123,4.533731,1,Confidential
1651,1652,42,1,1,1,3,21.65396,0,8.735558,3.434101,...,2.524631,93.200328,0,0,0,3.738646,3.088596,1.684372,0,Confidential
1652,1653,20,0,0,1,3,20.378015,1,14.809062,7.459221,...,5.813276,63.415192,0,1,0,0.447521,0.448998,9.833712,1,Confidential
1653,1654,73,1,0,1,3,35.634449,0,8.929558,6.260773,...,4.135998,72.032441,0,0,0,2.816752,4.974069,3.44831,1,Confidential
1654,1655,90,0,0,1,2,39.677059,1,1.370151,4.157954,...,2.138976,81.102765,0,0,0,0.951836,9.547583,2.046212,0,Confidential
1655,1656,34,0,0,2,1,28.922015,0,3.372073,9.647525,...,7.911566,10.600428,0,1,0,3.604147,1.609847,0.324417,0,Confidential
1656,1657,84,0,0,2,3,21.951219,0,15.825955,7.349964,...,0.015531,69.633427,0,0,0,0.801955,5.768617,4.935108,0,Confidential
1657,1658,90,0,0,2,2,24.964149,0,12.967462,0.618614,...,3.432765,31.858023,0,0,0,0.560298,2.744519,0.322592,1,Confidential
1658,1659,34,1,1,0,0,19.253258,1,11.39651,7.446314,...,9.293499,82.314878,0,0,0,1.754852,0.1864,4.553608,1,Confidential


In [9]:
# Print the (rows, columns) tuple first, then the per-column summary
# (non-null counts and dtypes) that DataFrame.info() writes to stdout.
print(f"{kidney_df.shape}")
kidney_df.info()

(1659, 54)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 54 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PatientID                      1659 non-null   int64  
 1   Age                            1659 non-null   int64  
 2   Gender                         1659 non-null   int64  
 3   Ethnicity                      1659 non-null   int64  
 4   SocioeconomicStatus            1659 non-null   int64  
 5   EducationLevel                 1659 non-null   int64  
 6   BMI                            1659 non-null   float64
 7   Smoking                        1659 non-null   int64  
 8   AlcoholConsumption             1659 non-null   float64
 9   PhysicalActivity               1659 non-null   float64
 10  DietQuality                    1659 non-null   float64
 11  SleepQuality                   1659 non-null   float64
 12  FamilyHistoryKidneyDisease     1659 n

## Querying & Filtering the DataFrame

In [25]:
# ---------------------------------------------------------- #
# Patients whose DietQuality is above the dataset average
# ---------------------------------------------------------- #

diet_quality = kidney_df["DietQuality"]
diet_quality_mean = diet_quality.mean()

# Summary of the column, printed as: min mean max
print(diet_quality.min(), diet_quality_mean, diet_quality.max())

# Keep only the rows strictly above the mean, then order them from highest to
# lowest DietQuality, breaking ties on BMI (also descending).
above_mean = diet_quality > diet_quality_mean
high_avg_diet = kidney_df.loc[above_mean, :].sort_values(
    by=["DietQuality", "BMI"],
    ascending=False,
)

# Number of patients that passed the filter
print(high_avg_diet.shape[0])

high_avg_diet.head(5)


0.0024066974290226 5.02854440376087 9.998926667050492
824


Unnamed: 0,PatientID,Age,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,Itching,QualityOfLifeScore,HeavyMetalsExposure,OccupationalExposureChemicals,WaterQuality,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy,Diagnosis,DoctorInCharge
530,531,76,0,3,0,0,28.918308,0,2.328057,4.307053,...,7.728682,49.728583,1,0,0,1.173581,6.611295,2.62363,1,Confidential
1241,1242,22,0,2,0,2,18.804365,0,19.082612,1.943321,...,8.253317,54.125824,0,1,0,0.888851,1.193062,2.450995,1,Confidential
1304,1305,49,0,0,0,2,24.170147,1,2.295919,6.555659,...,3.350466,95.634399,0,0,0,2.128776,3.365662,3.624931,1,Confidential
1065,1066,64,0,0,2,1,28.195184,0,2.316943,9.388921,...,6.424534,68.528323,0,0,0,2.644891,1.54289,2.469135,1,Confidential
884,885,66,1,2,2,2,18.352991,1,19.302814,1.021723,...,2.181207,91.165301,0,0,0,3.506112,9.553413,1.339884,1,Confidential
