In [12]:
import pandas as pd

# Load the datasets

schemes_df = pd.read_csv('../dataset/schemes.csv')
diseases_df = pd.read_csv('../dataset/  diseases.csv')
# Display the first few rows of each dataframe to understand their structure
print("Diseases DataFrame:")
print(diseases_df.head())
print("\nSchemes DataFrame:")
print(schemes_df.head())

# Preprocess the diseases dataset
# Drop duplicates
diseases_df.drop_duplicates(inplace=True)

# Handle missing values (example: fill with 'Unknown' or drop rows/columns)
diseases_df.fillna('Unknown', inplace=True)

# Convert columns to appropriate data types if necessary
# Example: diseases_df['column_name'] = diseases_df['column_name'].astype('desired_type')

# Preprocess the schemes dataset
# Drop duplicates
schemes_df.drop_duplicates(inplace=True)

# Handle missing values (example: fill with 'Unknown' or drop rows/columns)
schemes_df.fillna('Unknown', inplace=True)

# Convert columns to appropriate data types if necessary
# Example: schemes_df['column_name'] = schemes_df['column_name'].astype('desired_type')

# Display the cleaned dataframes
print("\nCleaned Diseases DataFrame:")
print(diseases_df.head())

print("\nCleaned Schemes DataFrame:")
print(schemes_df.head())

Diseases DataFrame:
                     Category Data Code  \
0  M1 [Ante Natal Care (ANC)]       1.1   
1  M1 [Ante Natal Care (ANC)]     1.1.a   
2  M1 [Ante Natal Care (ANC)]     1.1.b   
3  M1 [Ante Natal Care (ANC)]     1.1.c   
4  M1 [Ante Natal Care (ANC)]     1.1.d   

                                           Data Name  01-04-2023  01-05-2023  \
0  Total number of NEW Pregnant Women registered ...         0.0         0.0   
1  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
2  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
3  Out of total number of NEW Pregnant Women regi...         0.0         0.0   
4  Out of total number of NEW Pregnant Women regi...         0.0         0.0   

   01-06-2023  01-07-2023  01-08-2023  01-09-2023  01-10-2023  ...  \
0         0.0         0.0         0.0         0.0         0.0  ...   
1         0.0         0.0         0.0         0.0         0.0  ...   
2         0.0         0.0     

  diseases_df.fillna('Unknown', inplace=True)


In [None]:
# Path: scripts/data_processing.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


In [15]:
# Load both datasets
data_df = pd.read_csv('../dataset/data.csv')
schemes_df = pd.read_csv('../dataset/schemes.csv')


**Cleaning Data.csv**

In [16]:
# Check for missing values
print(data_df.isnull().sum())

# Fill or remove missing values
data_df.fillna(0, inplace=True)


Category       0
Data Code      0
Data Name      0
01-04-2023    71
01-05-2023    71
01-06-2023    71
01-07-2023    71
01-08-2023    71
01-09-2023    72
01-10-2023    71
01-11-2023    71
01-12-2023    71
01-01-2024    72
01-02-2024    72
01-03-2024    71
01-04-2024    71
01-05-2024    71
01-06-2024    71
01-07-2024    71
01-08-2024    71
01-09-2024    71
01-10-2024    71
dtype: int64


In [17]:
# Melt the Data.csv file to long format for better analysis
data_df_long = pd.melt(data_df,
                       id_vars=['Category', 'Data Code', 'Data Name'],
                       var_name='Date',
                       value_name='Patient Count')

# Convert 'Date' column to datetime
data_df_long['Date'] = pd.to_datetime(data_df_long['Date'], format='%d-%m-%Y')

# Sort by category and date for easier trend analysis
data_df_long.sort_values(by=['Category', 'Date'], inplace=True)


In [18]:
scaler = MinMaxScaler()
data_df_long['Normalized Patient Count'] = scaler.fit_transform(data_df_long[['Patient Count']])


**Cleaning Schemes.csv data**

In [19]:
# Check for missing values
print(schemes_df.isnull().sum())

# Fill or remove missing values
schemes_df.fillna('Unknown', inplace=True)


Category       0
Scheme Name    0
Description    0
Benefits       0
Flaws          0
Level          0
dtype: int64


In [20]:
# Standardize category names in both datasets for consistency
data_df_long['Category'] = data_df_long['Category'].str.strip().str.lower()
schemes_df['Category'] = schemes_df['Category'].str.strip().str.lower()


In [21]:
import re

def clean_text(text):
    # Remove special characters and multiple spaces
    text = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).strip()
    return text.lower()

# Apply text cleaning
schemes_df['Description'] = schemes_df['Description'].apply(clean_text)
schemes_df['Benefits'] = schemes_df['Benefits'].apply(clean_text)
schemes_df['Flaws'] = schemes_df['Flaws'].apply(clean_text)


In [22]:
# Merge based on category
merged_df = pd.merge(data_df_long, schemes_df, on='Category', how='inner')

# Ensure merged data is clean
print(merged_df.isnull().sum())


Category                    0
Data Code                   0
Data Name                   0
Date                        0
Patient Count               0
Normalized Patient Count    0
Scheme Name                 0
Description                 0
Benefits                    0
Flaws                       0
Level                       0
dtype: int64


In [23]:
# Aggregate by month and category
trend_data = merged_df.groupby(['Category', 'Date']).agg({
    'Patient Count': 'sum',
    'Normalized Patient Count': 'mean'
}).reset_index()

# Example: Calculate percentage change for trends
trend_data['Trend Change'] = trend_data.groupby('Category')['Patient Count'].pct_change()


In [24]:
# Save data for further analysis or model training
data_df_long.to_csv('../dataset/processed_data.csv', index=False)
schemes_df.to_csv('../dataset/processed_schemes.csv', index=False)
merged_df.to_csv('../dataset/merged_data.csv', index=False)
trend_data.to_csv('../dataset/trend_analysis.csv', index=False)
