<a href="https://colab.research.google.com/github/senuji-03/FDM_MiniAssignment/blob/main/FDM_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

# Reproducibility
np.random.seed(42)

# Create dataset
data = {
    "PatientID": range(1, 101),
    "Age": np.random.randint(0, 90, size=100),
    "Weight_kg": np.random.randint(40, 120, size=100).astype(float),
    "BloodPressure": np.random.randint(80, 180, size=100),
    "CholesterolLevel": np.random.randint(150, 300, size=100),
    "Gender": np.random.choice(["Male", "Female"], size=100),
    "Diabetic": np.random.choice([0, 1], size=100),  # 0 = No, 1 = Yes
    "Smoker": np.random.choice([0, 1], size=100)
}

df = pd.DataFrame(data)

# Introduce some missing values
df.loc[np.random.choice(df.index, 8, replace=False), "Weight_kg"] = np.nan
df.loc[np.random.choice(df.index, 5, replace=False), "CholesterolLevel"] = np.nan

# Introduce noisy/outlier values
df.loc[np.random.choice(df.index, 3, replace=False), "Weight_kg"] = -50  # Invalid weight

print(df.head(10))




   PatientID  Age  Weight_kg  BloodPressure  CholesterolLevel  Gender  \
0          1   51      110.0            169             294.0  Female   
1          2   14       48.0            111             277.0  Female   
2          3   71       40.0            149             182.0  Female   
3          4   60        NaN            111             264.0    Male   
4          5   20      102.0            147             268.0  Female   
5          6   82       50.0            134             171.0  Female   
6          7   86       47.0            154             187.0  Female   
7          8   74       74.0            135             258.0  Female   
8          9   74       74.0             96             200.0    Male   
9         10   87       72.0            117             157.0    Male   

   Diabetic  Smoker  
0         1       0  
1         1       1  
2         1       0  
3         0       0  
4         1       0  
5         0       0  
6         1       1  
7         1       1 

In [7]:
# Fill missing Age with mean
df["Age"].fillna(df["Age"].mean(), inplace=True)

# Fill missing Weight with median
df["Weight_kg"].fillna(df["Weight_kg"].median(), inplace=True)

# Fill missing Cholesterol with median
df["CholesterolLevel"].fillna(df["CholesterolLevel"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Weight_kg"].fillna(df["Weight_kg"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [8]:
# Handle noisy/outlier data (negative weight)
df.loc[df["Weight_kg"] < 0, "Weight_kg"] = df["Weight_kg"].median()

In [9]:
print(df.head(10))

   PatientID  Age  Weight_kg  BloodPressure  CholesterolLevel  Gender  \
0          1   51      110.0            169             294.0  Female   
1          2   14       48.0            111             277.0  Female   
2          3   71       40.0            149             182.0  Female   
3          4   60       76.0            111             264.0    Male   
4          5   20      102.0            147             268.0  Female   
5          6   82       50.0            134             171.0  Female   
6          7   86       47.0            154             187.0  Female   
7          8   74       74.0            135             258.0  Female   
8          9   74       74.0             96             200.0    Male   
9         10   87       72.0            117             157.0    Male   

   Diabetic  Smoker  
0         1       0  
1         1       1  
2         1       0  
3         0       0  
4         1       0  
5         0       0  
6         1       1  
7         1       1 

In [10]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalize Weight and Cholesterol
scaler = MinMaxScaler()
df[["Weight_norm", "Cholesterol_norm"]] = scaler.fit_transform(df[["Weight_kg", "CholesterolLevel"]])

# Z-score normalization of Age
scaler = StandardScaler()
df["Age_zscore"] = scaler.fit_transform(df[["Age"]])

# Discretize Age into groups
df["AgeGroup"] = pd.cut(df["Age"], bins=[0, 18, 40, 60, 90], labels=["Child", "Young Adult", "Adult", "Senior"])

# One-Hot Encoding for Gender
df = pd.get_dummies(df, columns=["Gender"], prefix="Gender")


In [12]:
# Drop irrelevant column
df_reduced = df.drop(columns=["PatientID"])

# PCA Example (combine numerical features)
from sklearn.decomposition import PCA
features = ["Age", "Weight_kg", "BloodPressure", "CholesterolLevel"]
pca = PCA(n_components=1)
df["PCA_Feature"] = pca.fit_transform(df[features])
print(df.head(10))


   PatientID  Age  Weight_kg  BloodPressure  CholesterolLevel  Diabetic  \
0          1   51      110.0            169             294.0         1   
1          2   14       48.0            111             277.0         1   
2          3   71       40.0            149             182.0         1   
3          4   60       76.0            111             264.0         0   
4          5   20      102.0            147             268.0         1   
5          6   82       50.0            134             171.0         0   
6          7   86       47.0            154             187.0         1   
7          8   74       74.0            135             258.0         1   
8          9   74       74.0             96             200.0         1   
9         10   87       72.0            117             157.0         0   

   Smoker  Weight_norm  Cholesterol_norm  Age_zscore     AgeGroup  \
0       0     0.897436          0.965753    0.159147        Adult   
1       1     0.102564          0.84

In [14]:
# Hospital ward information
ward_data = pd.DataFrame({
    "PatientID": range(1, 101),
    "Ward": np.random.choice(["General", "ICU", "Maternity", "Surgery"], size=100)
})

# Merge with main dataset
df = df.merge(ward_data, on="PatientID")

print(df.head(10))


   PatientID  Age  Weight_kg  BloodPressure  CholesterolLevel  Diabetic  \
0          1   51      110.0            169             294.0         1   
1          2   14       48.0            111             277.0         1   
2          3   71       40.0            149             182.0         1   
3          4   60       76.0            111             264.0         0   
4          5   20      102.0            147             268.0         1   
5          6   82       50.0            134             171.0         0   
6          7   86       47.0            154             187.0         1   
7          8   74       74.0            135             258.0         1   
8          9   74       74.0             96             200.0         1   
9         10   87       72.0            117             157.0         0   

   Smoker  Weight_norm  Cholesterol_norm  Age_zscore     AgeGroup  \
0       0     0.897436          0.965753    0.159147        Adult   
1       1     0.102564          0.84