<a href="https://colab.research.google.com/github/rshaikh95/datasci_2_manipulation/blob/main/Week2HWRahilShaikh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



> Healthcare Data Cleaning
### Load Data



In [52]:
import pandas as pd

dfs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA_507_2023/main/WK2/data/healthcare_data_cleaning.csv')



### Data Interpretation
#### This is a dataset regarding patient demographics, vitals and medical records.
#### There are a large number of rows and small number of columns. The hospital has a large number of patients which are the rows and the columns display specific info about said patient

In [None]:
# Check the size and shape of the DataFrame
print("Number of rows:", dfs.shape[0])
print("Number of columns:", dfs.shape[1])

dfs.columns


### Identifying Missing Values
### Missing Values replaced with 0

In [None]:
dfs.replace("missing",0, inplace=True)
dfs

### Data Cleaning: Cleaning Special Characters and White Space

In [55]:
import pandas as pd
import re

# Function to remove white space and special characters from a value
def clean_column_names(df):
    # Define a helper function to clean column names
    def clean_name(name):
        cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', name)
        return cleaned_name.lower()

    # Rename columns using the helper function
    # This is using a list comprehend - e.g., we have a list to the right of the equals sign,
    # and inside the list, we are applying our function, for every col (or X) that exists in df.columns
    df.columns = [clean_name(col) for col in df.columns]
    return df

# Apply the clean_value function to all columns
dfs = clean_column_names(dfs)

dfs.columns




Index(['patientage', 'gender', 'cityofresidence', 'stateofresidence',
       'hasinsurance', 'visitedlastmonth', 'paymentmethod', 'preferreddoctor',
       'diseasediagnosed', 'medicationprescribed', 'typeofappointment',
       'averageheartrate', 'averagebp', 'heightincm', 'weightinkg',
       'paymentdue', 'lastvisitdaysago', 'visitdurationmins', 'numberoftests',
       'prescriptioncost'],
      dtype='object')

### Previewing Dataset to remove any duplicate row or columns

In [56]:
# Identify and remove duplicate rows
dfs.drop_duplicates(inplace=True)

print("Number of rows:", dfs.shape[0])
print("Number of columns:", dfs.shape[1])



Number of rows: 100000
Number of columns: 20


# Data Transformation
### Creating New Columns: BP and HR columns for High levels

In [None]:

threshold_high = 140

dfs['BloodPressureStatus'] = dfs['averagebp'].apply(lambda x: 'High' if int(x) >= threshold_high else 'Normal')


print(dfs[['patientage', 'averagebp', 'BloodPressureStatus']].head(10))


In [None]:

HR_high = 160

dfs['HRStatus'] = dfs['averageheartrate'].apply(lambda x: 'Tachycardia' if int(x) >= HR_high else 'Normal')


print(dfs[['patientage', 'averageheartrate', 'HRStatus']].head(10))


### Aggregating Data using groupby and summary stats
### Pivot Table for multidimensional analysis

In [None]:
grouped_data = dfs.groupby('averageheartrate').mean({
    'HRStatus': 'mean'
})
print(grouped_data)


In [None]:
diagnosis_pivot = dfs.pivot_table(values='patientage', index='diseasediagnosed', columns='gender', aggfunc='count')
print(diagnosis_pivot)



# Alternative Libraries
### Loading Data with Polar and Dask
### Comparing Speed Times

In [None]:

import polars as pl

# Load a dataset of patient records
df1 = pl.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA_507_2023/main/WK2/data/healthcare_data_cleaning.csv")
df1


In [None]:

import dask.dataframe as dd
dff = dd.read_csv('/content/healthcare_data_cleaning.csv')
dff



In [75]:
import polars as pl
import pandas as pd
import time

# Define the dataset path
dataset_path = '/content/Abdul218_Harris789_b0a06ead-cc42-aa48-dad6-841d4aa679fa.json'

# Using Polar
start_time = time.time()
pl_df1 = pl.read_csv('/content/healthcare_data_cleaning.csv')
pl_filtered = pl_df1[pl_df1['Average Heart Rate'] < 100]
polars_average_age = pl_filtered['Patient Age'].mean()
pl_time = time.time() - start_time

# Using Pandas
start_time = time.time()
pd_df2 = pd.read_csv('/content/healthcare_data_cleaning.csv')
pd_filtered = pd_df2[pandas_df2['Average Heart Rate'] < 100]
pd_average_age = pd_filtered['Patient Age'].mean()
pd_time = time.time() - start_time

# Print results and execution times
print(f"Polars Average Age of Patients with Flu: {pl_average_age:.2f} years")
print(f"Pandas Average Age of Patients with Flu: {pd_average_age:.2f} years")
print(f"Polars Execution Time: {pl_time:.4f} seconds")
print(f"Pandas Execution Time: {pd:.4f} seconds")

ValueError: ignored