In [None]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the primary dataset, relative to this notebook. Forward slashes
# are used so the same path resolves on Windows, macOS and Linux.
primary_data_path = '../data/primary/primary_data.parquet'

# Load the primary dataset from its parquet file
primary_data = pd.read_parquet(primary_data_path)

# EDA To-Do List

## 1. Dataset Overview and Characteristics
- [ ] **Determine the number of features and observations**
    - Count the number of rows and columns in the dataset.
    - Check the types of features (categorical, numerical, etc.).
- [ ] **Identify missing values**
    - Count missing values in each column.
    - Analyze patterns or clusters of missing data.
- [ ] **Check for duplicate records**
    - Identify and remove duplicate rows if any exist.

---

## 2. Univariate Visualization and Feature Assessment
- [ ] **Descriptive statistics**
    - Calculate key metrics (mean, median, standard deviation, range, etc.) for numerical features.
    - Evaluate distributions (e.g., skewness, kurtosis).
- [ ] **Distribution and variation analysis**
    - Plot histograms, box plots, and density plots for numerical features.
    - Analyze the range, variation, and scale of each feature.
    - Identify common values and outliers.
- [ ] **Evaluate missing values**
    - Determine if missing values need imputation, removal, or augmentation.
- [ ] **Feature re-indexing and reformatting**
    - Assess the need for changing indices or reformatting features.
- [ ] **Operations such as data imputation or augmentation**
    - Document and apply imputation methods for missing values.
    - Explore data augmentation techniques if applicable.

---

## 3. Multivariate Visualization and Correlation Assessment
- [ ] **Patterns and relationships between features**
    - Plot pairwise relationships (e.g., scatter plots, pair plots) for key features.
    - Visualize heatmaps to assess correlations among numerical features.
- [ ] **Behavior of missing values**
    - Investigate if missing values are correlated with any feature or target variable.
- [ ] **Assess dimensionality reduction and feature selection**
    - Explore the need for dimensionality reduction techniques (e.g., PCA, t-SNE).
    - Identify key features for further modeling or analysis.

---

## Notes
- Ensure that all findings are documented with visualizations and tables where applicable.
- Use insights from EDA to guide further preprocessing, modeling, or reporting steps.

# Basic EDA: Descriptive Analysis


In [None]:
## 1. Function to Print DataFrame Information

def print_dataframe_info(df):
    """
    Print a short structural report for a DataFrame.

    Three sections are written to standard output, each introduced by a
    ``--- <name> ---`` header and followed by blank lines: the shape, the
    column index, and the output of ``df.info()``.

    Parameters:
    df (DataFrame): The DataFrame to describe.

    Returns:
    None
    """
    # Plain attributes are printed as-is; bound methods (``df.info``) are
    # invoked and left to write their own output.
    report = (
        ("Shape", df.shape),
        ("Columns", df.columns),
        ("Info", df.info),
    )

    for heading, payload in report:
        print(f"--- {heading} ---")
        if not callable(payload):
            print(payload)
        else:
            payload()
        print("\n")

# Print the shape, column index and info summary of the loaded primary_data frame
print_dataframe_info(primary_data)


In [None]:
## 2. Numerical Variable Analysis: `age`, `bmi`

# Basic descriptive statistics for numerical variables
numerical_columns = ['age', 'bmi']
numerical_summary = primary_data[numerical_columns].describe()
print("Summary statistics for numerical variables:")
print(numerical_summary)

# Visualizations for numerical variables: one histogram (with KDE overlay) per column
for column in numerical_columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(primary_data[column], kde=True, bins=60, color='blue')
    plt.title(f"{column.capitalize()} Distribution")
    plt.xlabel(column.capitalize())
    plt.ylabel("Frequency")
    # Save BEFORE showing: once plt.show() has rendered the figure, a later
    # plt.savefig() targets a new empty figure and writes a blank image.
    # Forward slashes keep the path portable across operating systems.
    plt.savefig(f'../reports/{column}_distribution.png')
    plt.show()

In [None]:
## 3. Categorical Variable Analysis: `gender`, `city`, `cms_score`, `icd_code`, `icd_description`, `claim_type`
# Basic descriptive statistics for categorical variables
# (The unused `from tkinter import font` import was dropped: nothing in this
# cell uses it, and it fails on machines without Tk installed.)

categorical_columns = ['gender', 'city', 'cms_score', 'icd_code', 'icd_description', 'claim_type']

for column in categorical_columns:
    print(f"--- {column.capitalize()} ---")
    # Read from `primary_data`, the frame loaded at the top of the notebook;
    # `intermediate_data` is never defined and raised NameError on a fresh kernel.
    value_counts = primary_data[column].value_counts()
    print(value_counts)

    # Visualization for categorical variables, bars ordered by descending frequency
    plt.figure(figsize=(18, 6))
    sns.countplot(data=primary_data, x=column, order=value_counts.index)
    plt.title(f"{column.capitalize()} Distribution")
    plt.xticks(rotation=90)
    plt.xlabel(column.capitalize(), fontsize=8)
    plt.ylabel("Count")
    plt.show()

# Advanced EDA: Further Exploration


In [None]:
## 1. Numerical Variable Analysis: `age`, `bmi`

# Expanded descriptive statistics for numerical variables, including tail
# percentiles (1%, 5%, 95%, 99%) to expose extreme values.
# Uses `primary_data`, the frame loaded at the top of the notebook;
# `intermediate_data` is never defined and raised NameError on a fresh kernel.
numerical_columns = ['age', 'bmi']
advanced_numerical_summary = primary_data[numerical_columns].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)
print("Expanded summary statistics for numerical variables:")
print(advanced_numerical_summary)

# Boxplot and violin plot for numerical variables
for column in numerical_columns:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=primary_data[column], color='cyan')
    plt.title(f"{column.capitalize()} Box Plot")
    plt.xlabel(column.capitalize())
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.violinplot(x=primary_data[column], color='magenta')
    plt.title(f"{column.capitalize()} Violin Plot")
    plt.xlabel(column.capitalize())
    plt.show()

In [None]:
## 2. Categorical Variable Analysis: `gender`, `city`, `cms_score`, `icd_code`, `claim_type`

# Expanded categorical analysis with proportions
categorical_columns = ['gender', 'city', 'cms_score', 'icd_code', 'claim_type']

for column in categorical_columns:
    print(f"--- {column.capitalize()} Analysis ---")
    # Read from `primary_data`, the frame loaded at the top of the notebook;
    # `intermediate_data` is never defined and raised NameError on a fresh kernel.
    value_counts = primary_data[column].value_counts()
    # Share of rows per category, expressed as a percentage
    proportions = primary_data[column].value_counts(normalize=True) * 100
    print(f"Value Counts:\n{value_counts}")
    print(f"Proportions (in %):\n{proportions}")

    # Advanced visualizations for categorical variables: absolute counts ...
    plt.figure(figsize=(18, 8))
    sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis")
    plt.title(f"{column.capitalize()} Count Distribution")
    plt.xticks(rotation=90)
    plt.xlabel(column.capitalize())
    plt.ylabel("Count")
    plt.show()

    # ... followed by the same categories as percentages
    plt.figure(figsize=(18, 8))
    sns.barplot(x=proportions.index, y=proportions.values, palette="coolwarm")
    plt.title(f"{column.capitalize()} Proportion Distribution")
    plt.xticks(rotation=90)
    plt.xlabel(column.capitalize())
    plt.ylabel("Proportion (%)")
    plt.show()

In [None]:
## 3. Correlation and Interaction Analysis

# Correlation analysis for numerical variables.
# Uses `primary_data`, the frame loaded at the top of the notebook;
# `intermediate_data` is never defined and raised NameError on a fresh kernel.
# `numerical_columns` comes from the numerical-analysis cell above.
correlation_matrix = primary_data[numerical_columns].corr()
print("Correlation matrix for numerical variables:")
print(correlation_matrix)

# Heatmap for correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

# Pairplot for interactions between numerical variables
# (corner=True draws only the lower triangle; KDE curves on the diagonal)
sns.pairplot(primary_data[numerical_columns], diag_kind='kde', corner=True)
plt.show()

# EDA Bivariate analysis
- age_cat vs [bmi_cat, ICD_10, cms_score] 
- gender vs [bmi_cat, ICD_10, cms_score]
- bmi_cat vs [ICD_10, cms_score]
- age_cat by gender --> demographic graph
- age_cat by gender across city --> demographic graph for each city 


In [None]:
## 3. Age and BMI Group Statistics

# Analyze `age_cat` vs `bmi_cat`, `icd_code`, and `cms_score`.
# Each table counts rows per (age category, other variable) combination.
# Uses `primary_data`, the frame loaded at the top of the notebook;
# `intermediate_data` is never defined and raised NameError on a fresh kernel.
# NOTE(review): assumes primary_data carries the derived `age_cat` and
# `bmi_cat` columns - confirm against the upstream pipeline.
age_bmi_crosstab = pd.crosstab(primary_data['age_cat'], primary_data['bmi_cat'])
print("Age Category vs BMI Category:")
print(age_bmi_crosstab)

age_icd_crosstab = pd.crosstab(primary_data['age_cat'], primary_data['icd_code'])
print("Age Category vs ICD Code:")
print(age_icd_crosstab)

age_cms_summary = pd.crosstab(primary_data['age_cat'], primary_data['cms_score'])
print("Age Category vs CMS Score:")
print(age_cms_summary)

In [None]:
## 4. Gender-Based Analysis

# Analyze `gender` vs `bmi_cat`, `icd_code`, and `cms_score`.
# Each table counts rows per (gender, other variable) combination.
# Uses `primary_data`, the frame loaded at the top of the notebook;
# `intermediate_data` is never defined and raised NameError on a fresh kernel.
# NOTE(review): assumes primary_data carries the derived `bmi_cat` column -
# confirm against the upstream pipeline.
gender_bmi_crosstab = pd.crosstab(primary_data['gender'], primary_data['bmi_cat'])
print("Gender vs BMI Category:")
print(gender_bmi_crosstab)

gender_icd_crosstab = pd.crosstab(primary_data['gender'], primary_data['icd_code'])
print("Gender vs ICD Code:")
print(gender_icd_crosstab)

# This is a frequency table, not an average, so the label says so.
gender_cms_summary = pd.crosstab(primary_data['gender'], primary_data['cms_score'])
print("Gender vs CMS Score:")
print(gender_cms_summary)

In [None]:
## 5. City and Demographics Analysis

def _plot_population_pyramid(data, title):
    """
    Draw a population pyramid (age category by gender) for the given rows.

    Rows are counted per (`age_cat`, `gender`) pair. Counts for gender code
    'M' are plotted as negative widths so they extend to the left of zero,
    and counts for 'F' extend to the right. A gender code that is absent
    from `data` is plotted as 0.

    Parameters:
    data (DataFrame): Rows to count; must contain `age_cat` and `gender`.
    title (str): Title displayed above the plot.
    """
    pyramid = data.groupby(['age_cat', 'gender']).size().unstack(fill_value=0)
    pyramid['Male'] = -pyramid.get('M', 0)  # Negative values for males

    plt.figure(figsize=(10, 8))
    plt.barh(pyramid.index, pyramid['Male'], color='blue', label='Male')
    plt.barh(pyramid.index, pyramid.get('F', 0), color='red', label='Female')
    plt.title(title)
    plt.xlabel("Population Count")
    plt.ylabel("Age Category")
    plt.legend()
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()


# All analyses below use `primary_data`, the frame loaded at the top of the
# notebook; `intermediate_data` is never defined and raised NameError on a
# fresh kernel.
# NOTE(review): assumes primary_data carries the derived `age_cat` column -
# confirm against the upstream pipeline.

# Analyze `age_cat` by `gender`
age_gender_crosstab = pd.crosstab(primary_data['age_cat'], primary_data['gender'])
print("Age Category by Gender:")
print(age_gender_crosstab)

# Same counts via groupby, used as the data source for the stacked bar chart.
# No city grouping is applied here, so the labels no longer mention cities.
age_gender = primary_data.groupby(['age_cat', 'gender']).size().unstack(fill_value=0)
print("Age Category by Gender (stacked bar chart data):")
print(age_gender)

# Visualize demographics
age_gender.plot(kind='bar', stacked=True, figsize=(14, 8))
plt.title("Age Category by Gender")
plt.xlabel("Age Category")
plt.ylabel("Count")
plt.legend(title="Gender")
plt.show()

# Create a population pyramid by gender for the full dataset
_plot_population_pyramid(primary_data, "Population Pyramid by Gender")

# Create population pyramids by gender for each city
for city, city_data in primary_data.groupby('city'):
    _plot_population_pyramid(city_data, f"Population Pyramid by Gender in {city}")


# Similarity Scores

In this section we will attempt to cluster patients based on their similarity across the following variables: [age, bmi, gender, city, icd_10 code]
- Ideally we would do this over a time period (the last x months); however, in this case we don't have this information
- In values like bmi we 