In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv(R"C:\Users\Admin\Downloads\NSE\hospital_outpatient_DataSet.csv")
df

Unnamed: 0,Visit_ID,Date,Department,Doctor_Name,Patient_Age,Gender,Symptoms,Diagnosis,Medicine_Cost,Consultation_Fee,Followup_Required,Visit_Duration,Total_Bill
0,V1170,01-01-2024,Cardiology,Dr. Ahmed,61.0,F,Allergy,Hypertension,1674.0,263.0,No,77,1937
1,V1221,01-01-2024,Cardiology,Dr. Ahmed,67.0,M,Fatigue,Migraine,764.0,359.0,Yes,62,1123
2,V1016,01-01-2024,Cardiology,Dr. Meena,48.0,F,Allergy,Fracture,1085.0,727.0,Yes,87,1812
3,V1205,03-01-2024,Orthopedics,Dr. Meena,34.0,F,,Hypertension,1893.0,323.0,No,120,2216
4,V1212,03-01-2024,Orthopedics,Dr. Meena,29.0,F,Fever,Infection,1056.0,357.0,Yes,18,1413
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,V1259,29-12-2024,Dermatology,Dr. Ahmed,44.0,M,Pain,Infection,722.0,103.0,Yes,52,825
296,V1141,30-12-2024,ENT,Dr. Banerjee,49.0,M,Rash,Flu,,346.0,Yes,61,652
297,V1120,30-12-2024,Dermatology,Dr. Rao,20.0,F,,Hypertension,1206.0,343.0,Yes,94,1549
298,V1047,31-12-2024,Orthopedics,Dr. Ahmed,38.0,F,Rash,Infection,1500.0,768.0,Yes,30,2268


In [3]:
df.head()

Unnamed: 0,Visit_ID,Date,Department,Doctor_Name,Patient_Age,Gender,Symptoms,Diagnosis,Medicine_Cost,Consultation_Fee,Followup_Required,Visit_Duration,Total_Bill
0,V1170,01-01-2024,Cardiology,Dr. Ahmed,61.0,F,Allergy,Hypertension,1674.0,263.0,No,77,1937
1,V1221,01-01-2024,Cardiology,Dr. Ahmed,67.0,M,Fatigue,Migraine,764.0,359.0,Yes,62,1123
2,V1016,01-01-2024,Cardiology,Dr. Meena,48.0,F,Allergy,Fracture,1085.0,727.0,Yes,87,1812
3,V1205,03-01-2024,Orthopedics,Dr. Meena,34.0,F,,Hypertension,1893.0,323.0,No,120,2216
4,V1212,03-01-2024,Orthopedics,Dr. Meena,29.0,F,Fever,Infection,1056.0,357.0,Yes,18,1413


In [4]:
df.shape

(300, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Visit_ID           300 non-null    object 
 1   Date               300 non-null    object 
 2   Department         300 non-null    object 
 3   Doctor_Name        300 non-null    object 
 4   Patient_Age        291 non-null    float64
 5   Gender             295 non-null    object 
 6   Symptoms           285 non-null    object 
 7   Diagnosis          285 non-null    object 
 8   Medicine_Cost      285 non-null    float64
 9   Consultation_Fee   293 non-null    float64
 10  Followup_Required  285 non-null    object 
 11  Visit_Duration     300 non-null    int64  
 12  Total_Bill         300 non-null    int64  
dtypes: float64(3), int64(2), object(8)
memory usage: 30.6+ KB


In [6]:
df.isna().sum().sort_values(ascending=False)

Symptoms             15
Followup_Required    15
Medicine_Cost        15
Diagnosis            15
Patient_Age           9
Consultation_Fee      7
Gender                5
Date                  0
Visit_ID              0
Doctor_Name           0
Department            0
Visit_Duration        0
Total_Bill            0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

In [11]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')

# Fill missing numeric values with median
df['Patient_Age'].fillna(df['Patient_Age'].median(), inplace=True)
df['Medicine_Cost'].fillna(df['Medicine_Cost'].median(), inplace=True)
df['Consultation_Fee'].fillna(df['Consultation_Fee'].median(), inplace=True)

# Fill missing categorical values
df['Symptoms'].fillna('Unknown', inplace=True)
df['Diagnosis'].fillna('Unknown', inplace=True)
df['Followup_Required'].fillna(df['Followup_Required'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

# Strip whitespaces
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

# Remove duplicates
df.drop_duplicates(inplace=True)

# Standardize department names
df['Department'] = df['Department'].str.title()

print("Missing values after cleaning:", df.isna().sum().sum())

Missing values after cleaning: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Patient_Age'].fillna(df['Patient_Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Medicine_Cost'].fillna(df['Medicine_Cost'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [9]:
# High Bill Flag
df['High_Bill_Flag'] = np.where(df['Total_Bill'] > 2000, 'High', 'Normal')

# Bill Check column
df['Bill_Check'] = df['Medicine_Cost'] + df['Consultation_Fee']

# Age Group column
df['Age_Group'] = pd.cut(
    df['Patient_Age'],
    bins=[0, 18, 40, 60, 100],
    labels=['Minor', 'Young Adult', 'Middle Aged', 'Senior']
)

display(df.head())

Unnamed: 0,Visit_ID,Date,Department,Doctor_Name,Patient_Age,Gender,Symptoms,Diagnosis,Medicine_Cost,Consultation_Fee,Followup_Required,Visit_Duration,Total_Bill,High_Bill_Flag,Bill_Check,Age_Group
0,V1170,2024-01-01,Cardiology,Dr. Ahmed,61.0,F,Allergy,Hypertension,1674.0,263.0,No,77,1937,Normal,1937.0,Senior
1,V1221,2024-01-01,Cardiology,Dr. Ahmed,67.0,M,Fatigue,Migraine,764.0,359.0,Yes,62,1123,Normal,1123.0,Senior
2,V1016,2024-01-01,Cardiology,Dr. Meena,48.0,F,Allergy,Fracture,1085.0,727.0,Yes,87,1812,Normal,1812.0,Middle Aged
3,V1205,2024-01-03,Orthopedics,Dr. Meena,34.0,F,Unknown,Hypertension,1893.0,323.0,No,120,2216,High,2216.0,Young Adult
4,V1212,2024-01-03,Orthopedics,Dr. Meena,29.0,F,Fever,Infection,1056.0,357.0,Yes,18,1413,Normal,1413.0,Young Adult


In [10]:
print("--- TASK 4: INSIGHTS ---")

# Average bill per department
dept_bill = df.groupby('Department')['Total_Bill'].agg(['mean', 'count'])
display(dept_bill)

# Top 3 doctors
top_doctors = df['Doctor_Name'].value_counts().head(3)
display(top_doctors)

# Follow-up rate
followup_rate = df['Followup_Required'].value_counts(normalize=True)
display(followup_rate)

--- TASK 4: INSIGHTS ---


Unnamed: 0_level_0,mean,count
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Cardiology,1611.461538,65
Dermatology,1535.44898,49
Ent,1606.116883,77
General,1634.482143,56
Orthopedics,1627.641509,53


Doctor_Name
Dr. Ahmed    67
Dr. Meena    65
Dr. Rao      61
Name: count, dtype: int64

Followup_Required
No     0.556667
Yes    0.443333
Name: proportion, dtype: float64

In [12]:
X_reg = df[['Department', 'Patient_Age', 'Medicine_Cost',
            'Consultation_Fee', 'Visit_Duration']]
y_reg = df['Total_Bill']

X_train, X_test, y_train, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

reg_preprocessor = ColumnTransformer([
    ('num', StandardScaler(),
     ['Patient_Age', 'Medicine_Cost', 'Consultation_Fee', 'Visit_Duration']),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Department'])
])

reg_model = Pipeline([
    ('preprocessor', reg_preprocessor),
    ('regressor', LinearRegression())
])

reg_model.fit(X_train, y_train)

predictions = reg_model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

print("Mean Absolute Error (MAE):", round(mae, 2))

Mean Absolute Error (MAE): 36.12


In [13]:
X_clf = df[['Department', 'Gender', 'Symptoms',
            'Diagnosis', 'Total_Bill', 'Visit_Duration']]
y_clf = df['Followup_Required'].map({'Yes': 1, 'No': 0})

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

clf_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['Total_Bill', 'Visit_Duration']),
    ('cat', OneHotEncoder(handle_unknown='ignore'),
     ['Department', 'Gender', 'Symptoms', 'Diagnosis'])
])

clf_model = Pipeline([
    ('preprocessor', clf_preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

clf_model.fit(X_train_c, y_train_c)
y_pred = clf_model.predict(X_test_c)

print("Confusion Matrix:\n", confusion_matrix(y_test_c, y_pred))
print("Precision:", round(precision_score(y_test_c, y_pred), 2))
print("Recall:", round(recall_score(y_test_c, y_pred), 2))

Confusion Matrix:
 [[22 14]
 [16  8]]
Precision: 0.36
Recall: 0.33



- General Department generates the highest average bill.
- Dr. Ahmed handles the maximum patient volume.
- Linear Regression achieved low MAE, indicating accurate bill prediction.
- Logistic Regression recall is moderate, showing scope for improvement.
- Dataset size (300 records) is a limitation for generalization.

In [18]:
df.to_excel(r"C:\Users\Admin\Downloads\NSE\cleaneddataset.xlsx", index=False)
print("Cleaned dataset exported successfully.")

Cleaned dataset exported successfully.
