In [2]:
# File Path: preprocessing.ipynb

# Step 1: Import Libraries
import pandas as pd
import numpy as np

# Step 2: Load Datasets
# Relative paths: they are resolved against the current working directory.
data_file = "../dataset/data.csv"
schemes_file = "../dataset/schemes.csv"

# Read both CSV files, in the same order as before (data first, then schemes)
data_df, schemes_df = (pd.read_csv(path) for path in (data_file, schemes_file))

# Print the first rows of each frame as a quick sanity check
for heading, frame in (("Data.csv Head:", data_df), ("\nSchemes.csv Head:", schemes_df)):
    print(heading)
    print(frame.head())

# Step 3: Handle Missing Data
# Every missing value in data_df becomes 0; every missing value in schemes_df becomes 'Unknown'.
# NOTE(review): both fills apply to all columns of their frame, not only to the
# patient-count columns -- confirm that this is intended.
data_df = data_df.fillna(0)
schemes_df = schemes_df.fillna("Unknown")

# Step 4: Reshape Monthly Data
# Any column whose header contains a hyphen is treated as a monthly date column
date_columns = [name for name in data_df.columns if '-' in name]
id_columns = ['Category', 'Category Name', 'Data Code', 'Data Name']

# Wide -> long: each date column becomes rows holding a 'Date' and a 'Patient Count'
melted_data = pd.melt(
    data_df,
    id_vars=id_columns,
    value_vars=date_columns,
    var_name='Date',
    value_name='Patient Count',
)

# The former column headers are day-month-year strings; parse them into datetimes
parsed_dates = pd.to_datetime(melted_data['Date'], format='%d-%m-%Y')
melted_data['Date'] = parsed_dates

# Step 5: Normalize Data
def _min_max_scale(values):
    """Min-max scale one group of values; a constant group maps to 0.0.

    Args:
        values: pandas Series holding the 'Patient Count' values of one group.

    Returns:
        A Series on the same index computed as (x - min) / (max - min), or
        all 0.0 when every value in the group is equal.
    """
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # All values are identical, so the plain formula would compute
        # 0 / 0 and fill the whole group with NaN.
        return pd.Series(0.0, index=values.index)
    return (values - lowest) / spread


# Normalizing Patient Count within each 'Category' group using Min-Max Scaling
melted_data['Normalized Patient Count'] = (
    melted_data.groupby('Category')['Patient Count'].transform(_min_max_scale)
)

# Step 6: Merge Datasets
# Left join on both key columns: rows of melted_data without a matching scheme
# are kept, with the scheme columns left empty (NaN).
merge_keys = ['Category', 'Category Name']
merged_df = melted_data.merge(schemes_df, how='left', on=merge_keys)

# Step 7: Tokenize Text Data
# Normalise the free-text columns: lowercase them, then collapse every run of
# non-word characters into a single space. The text is not split into tokens here.
for text_column in ('Description', 'Benefits', 'Flaws'):
    lowered = merged_df[text_column].str.lower()
    merged_df[text_column] = lowered.str.replace(r'\W+', ' ', regex=True)

# Step 8: Save Cleaned Data
# Write the long-format trend data first, then the merged frame; the index is
# omitted from both CSV files.
outputs = (
    (melted_data, "../dataset/Cleaned_Trend_Data.csv"),
    (merged_df, "../dataset/Merged_Schemes_Data.csv"),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

print("Preprocessing Complete: Cleaned files saved.")


Data.csv Head:
  Category          Category Name Data Code  \
0       M1  Ante Natal Care (ANC)       1.1   
1       M1  Ante Natal Care (ANC)     1.1.a   
2       M1  Ante Natal Care (ANC)     1.1.b   
3       M1  Ante Natal Care (ANC)     1.1.c   
4       M1  Ante Natal Care (ANC)     1.1.d   

                                           Data Name  01-04-2023  01-05-2023  \
0  Total number of NEW Pregnant Women registered ...         0.0         0.0   
1  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
2  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
3  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
4  Out of total number of NEW Pregnant Women regi...         0.0         0.0   

   01-06-2023  01-07-2023  01-08-2023  01-09-2023  ...  01-01-2024  \
0         0.0         0.0         0.0         0.0  ...         0.0   
1         0.0         0.0         0.0         0.0  ...         0.0   
2         0