<a href="https://colab.research.google.com/github/rshaikh95/datasci_2_manipulation/blob/main/Week2HWRahilShaikh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



> Healthcare Data Cleaning
### Load Data



In [167]:
import pandas as pd

dfs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA_507_2023/main/WK2/data/healthcare_data_cleaning.csv')



### Data Interpretation
#### This is a dataset regarding patient demographics, vitals and medical records.
#### There are a large number of rows and small number of columns. The hospital has a large number of patients which are the rows and the columns display specific info about said patient

In [168]:
# Check the size and shape of the DataFrame
print("Number of rows:", dfs.shape[0])
print("Number of columns:", dfs.shape[1])

dfs.columns


Number of rows: 105000
Number of columns: 20


Index(['Patient Age', 'Gender', 'City of Residence', 'State of Residence',
       'Has Insurance', 'Visited Last Month', 'Payment Method',
       'Preferred Doctor', 'Disease Diagnosed', 'Medication Prescribed',
       'Type of Appointment', 'Average Heart Rate', 'Average BP',
       'Height (in cm)', 'Weight (in kg)', 'Payment Due ($)',
       'Last Visit (days ago)', 'Visit Duration (mins)', 'Number of Tests',
       'Prescription Cost ($)'],
      dtype='object')

### Identifying Missing Values
### Missing Values replaced with 0

In [None]:
dfs.replace("missing",0, inplace=True)
dfs

### Data Cleaning: Cleaning Special Characters and White Space

In [170]:
import pandas as pd
import re

# Function to remove white space and special characters from a value
def clean_column_names(df):
    # Define a helper function to clean column names
    def clean_name(name):
        cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', name)
        return cleaned_name.lower()

    # Rename columns using the helper function
    # This is using a list comprehend - e.g., we have a list to the right of the equals sign,
    # and inside the list, we are applying our function, for every col (or X) that exists in df.columns
    df.columns = [clean_name(col) for col in df.columns]
    return df

# Apply the clean_value function to all columns
dfs = clean_column_names(dfs)

dfs.columns




Index(['patientage', 'gender', 'cityofresidence', 'stateofresidence',
       'hasinsurance', 'visitedlastmonth', 'paymentmethod', 'preferreddoctor',
       'diseasediagnosed', 'medicationprescribed', 'typeofappointment',
       'averageheartrate', 'averagebp', 'heightincm', 'weightinkg',
       'paymentdue', 'lastvisitdaysago', 'visitdurationmins', 'numberoftests',
       'prescriptioncost'],
      dtype='object')

### Previewing Dataset to remove any duplicate row or columns

In [None]:
# Identify and remove duplicate rows
dfs.drop_duplicates(inplace=True)

print("Number of rows:", dfs.shape[0])
print("Number of columns:", dfs.shape[1])



# Data Transformation
### Creating New Columns: BP and HR columns for High levels

In [None]:

threshold_high = 140

dfs['BloodPressureStatus'] = dfs['averagebp'].apply(lambda x: 'High' if int(x) >= threshold_high else 'Normal')


print(dfs[['patientage', 'averagebp', 'BloodPressureStatus']].head(10))


In [None]:

HR_high = 160

dfs['HRStatus'] = dfs['averageheartrate'].apply(lambda x: 'Tachycardia' if int(x) >= HR_high else 'Normal')


print(dfs[['patientage', 'averageheartrate', 'HRStatus']].head(10))


### Aggregating Data using groupby and summary stats
### Pivot Table for multidimensional analysis

In [221]:
grouped_data = dfs.groupby('averageheartrate').mean({
    'HRStatus': 'mean'
})
print(grouped_data)


Empty DataFrame
Columns: []
Index: [0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]

[71 rows x 0 columns]


In [229]:
diagnosis_pivot = dfs.pivot_table(values='patientage', index='diseasediagnosed', columns='gender', aggfunc='count')
print(diagnosis_pivot)



gender              0  Female  Male  Other
diseasediagnosed                          
0                 277    1608  1551   1564
Allergy           952    5993  6117   6001
Cold              979    6023  6013   5947
Covid-19          932    5975  5892   6032
Flu               959    5961  6178   6025
None              901    6114  5977   6029


# Alternative Libraries