In [None]:
import pandas as pd

# Load the raw healthcare records from the local CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
raw_csv_path = r'C:\Users\advocate\Desktop\RGI\Group Assignment\healthcare_dataset.csv'
df = pd.read_csv(raw_csv_path)

# Plain-text preview of the first five rows.
print(df.head())


In [None]:
# Display basic info: column names, dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare -- wrapping it in print() would append a stray "None" line.
df.info()

# Summary statistics for numerical columns
print(df.describe())

# Number of missing values in each column
print(df.isnull().sum())


In [None]:
# Label missing medication entries explicitly. fillna() with a dict only touches
# the listed columns and returns a new frame, so re-running this cell is safe.
# 'Discharge Date' is intentionally NOT filled with a placeholder string: the
# datetime conversion below coerces anything unparseable to NaT, so a text
# sentinel such as 'Unknown' would be turned straight back into a missing value.
df = df.fillna({'Medication': 'None'})

# Convert 'Date of Admission' and 'Discharge Date' to datetime.
# Admission dates must all parse (a bad value raises); missing or malformed
# discharge dates become NaT instead.
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce')

# Convert 'Billing Amount' to numeric; non-numeric entries become NaN.
df['Billing Amount'] = pd.to_numeric(df['Billing Amount'], errors='coerce')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Age distribution: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Age'], bins=20, kde=True, ax=ax)
ax.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
plt.show()

# Gender and Blood Type distributions: one count plot per categorical column.
for category in ('Gender', 'Blood Type'):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x=category, data=df, ax=ax)
    ax.set(title=f'{category} Distribution', xlabel=category, ylabel='Count')
    plt.show()

# Billing Amount by Hospital: one box per hospital, tick labels tilted 45 degrees.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='Hospital', y='Billing Amount', data=df, ax=ax)
ax.set(title='Billing Amount by Hospital', xlabel='Hospital', ylabel='Billing Amount')
plt.xticks(rotation=45)
plt.show()

# Average billing amount by medical condition, sorted ascending by the mean.
average_billing = (
    df.groupby('Medical Condition')['Billing Amount']
    .mean()
    .sort_values()
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=average_billing.index, y=average_billing.values, ax=ax)
ax.set(
    title='Average Billing Amount by Medical Condition',
    xlabel='Medical Condition',
    ylabel='Average Billing Amount',
)
plt.xticks(rotation=90)
plt.show()


In [None]:
# Length of stay in whole days: discharge date minus admission date.
stay_duration = df['Discharge Date'] - df['Date of Admission']
df['Length of Stay'] = stay_duration.dt.days

# Flag rows whose billing amount is strictly above the dataset-wide median.
median_billing = df['Billing Amount'].median()
df['High Billing'] = df['Billing Amount'].gt(median_billing)


In [None]:
# Pairwise correlations between the three numeric columns of interest.
numeric_columns = ['Age', 'Billing Amount', 'Length of Stay']
corr = df[numeric_columns].corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Prepare features and target variable
feature_columns = ['Age', 'Length of Stay']
target_column = 'Billing Amount'

# Drop every row that is missing a feature OR the target. 'Length of Stay' is
# NaN wherever 'Discharge Date' could not be parsed, and neither StandardScaler
# nor LinearRegression accepts NaN, so filtering on the target alone is not
# enough. Selecting X and y from the same filtered frame keeps them aligned.
model_data = df[feature_columns + [target_column]].dropna()
X = model_data[feature_columns]
y = model_data[target_column]

# Split the data: 80% train / 20% test, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features. The scaler is fitted on the training split only, so no
# information from the test rows leaks into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict and evaluate on the held-out split
y_pred = model.predict(X_test_scaled)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))


In [None]:
# Scatter of held-out actual billing amounts against the model's predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_title('Actual vs Predicted Billing Amount')
ax.set_xlabel('Actual Billing Amount')
ax.set_ylabel('Predicted Billing Amount')
plt.show()


In [None]:
# Write the current state of the DataFrame to a new CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
clean_csv_path = r'C:\Users\advocate\Desktop\RGI\Group Assignment\clean_healthcare_dataset.csv'
df.to_csv(clean_csv_path, index=False)