# Data Preprocessing Pipeline

This notebook walks through the preprocessing steps for the patient readmission risk prediction model: loading the raw data, cleaning it (missing-value imputation), feature engineering, data visualization, and saving the cleaned dataset.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
# Applies seaborn's 'whitegrid' style globally, so it affects every plot drawn
# later in the notebook.
sns.set(style='whitegrid')

In [2]:
# Read the sample dataset from disk and preview its first rows
raw_data_path = '../data/sample_data.csv'
data = pd.read_csv(raw_data_path)
data.head()

In [3]:
# Count missing entries per column, then display only the columns that have any
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [4]:
# Handle missing values
# Each column is filled and assigned back to `data` instead of calling
# fillna(..., inplace=True) on `data[column]`: the in-place form acts on an
# intermediate object (chained assignment), which pandas warns about and which
# does not update `data` when Copy-on-Write is enabled.

# Numerical columns: fill gaps with the column mean
for column in data.select_dtypes(include=[np.number]).columns:
    data[column] = data[column].fillna(data[column].mean())

# Categorical (object-dtype) columns: fill gaps with the most frequent value
for column in data.select_dtypes(include=[object]).columns:
    modes = data[column].mode()
    # mode() is empty when every value in the column is missing; leave such a
    # column untouched rather than failing on the first-element lookup
    if not modes.empty:
        data[column] = data[column].fillna(modes.iloc[0])

In [5]:
# Feature engineering
# Example: derive a ratio feature from two existing columns
# NOTE(review): rows where feature2 is 0 yield inf (or NaN for 0/0) here —
# confirm that is acceptable for downstream steps.
feature_ratio = data['feature1'].div(data['feature2'])
data['new_feature'] = feature_ratio

In [6]:
# Data visualization
# Bar chart of how many rows fall under each value of the 'readmission' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='readmission', data=data, ax=ax)
ax.set_title('Patient Readmission Count')
ax.set_xlabel('Readmission Status')
ax.set_ylabel('Count')
plt.show()

In [7]:
# Write the preprocessed frame to CSV (row index omitted) and confirm
cleaned_data_path = '../data/cleaned_data.csv'
data.to_csv(cleaned_data_path, index=False)
print('Cleaned data saved successfully!')