## Setup

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the Data

In [None]:
enc = pd.read_csv('Encounter data without provider notes (3).csv', low_memory = False)
enc.head()

## Data Cleaning

### Changing the column keys to be the column names

In [None]:
# Row 0 of the raw file carries the real column names: use it as the header,
# then remove that row from the data.
header_row = enc.loc[0]
enc.rename(columns = header_row, inplace = True)
enc.drop(index = 0, inplace = True)

In [None]:
enc.info()

In [None]:
enc.columns

### Editing the icd9 column

In [None]:
# dashes in the icd9 column become NaN so missing values are represented consistently
icd9_col = 'icd9encounterdiagdescr'
enc[icd9_col] = enc[icd9_col].replace({'-': np.nan})
enc.head()

### Fixing merge issue

Merge issue: the dataset contains one row per diagnosis rather than one row per encounter, so each patient encounter is repeated once for every diagnosis recorded for it. This was discovered because some patients had an unexpectedly large number of rows.

In [None]:
# An encounter is identified by the patient and the date of the visit.
encounter_key = ['patientid', 'cln enc date']
diagnosis_columns = ['icd10encounterdiagdescr', 'icd9encounterdiagdescr',
                     'patientsnomedproblemcodedesc', 'enc srv diag']


def _diagnoses_per_encounter(column):
    """Collapse the non-null values of `column` into one set per encounter."""
    has_value = enc[column].notnull()
    return enc[has_value].groupby(encounter_key)[column].apply(set).reset_index()


# condensing diagnosis rows to a set of diagnoses for each encounter
enc_icd10 = _diagnoses_per_encounter('icd10encounterdiagdescr')
enc_icd9 = _diagnoses_per_encounter('icd9encounterdiagdescr')
enc_codedesc = _diagnoses_per_encounter('patientsnomedproblemcodedesc')
enc_diag = _diagnoses_per_encounter('enc srv diag')

# keeping a single row per patient encounter, without the per-row diagnosis columns
enc1 = enc.drop_duplicates(encounter_key).reset_index()
enc1.drop(columns = diagnosis_columns, inplace = True)

# attaching the condensed diagnosis columns back, one left merge per column
cond = enc1
for condensed in (enc_icd10, enc_icd9, enc_codedesc, enc_diag):
    cond = pd.merge(cond, condensed, on = encounter_key, how = 'left')

In [None]:
# making sure no data was lost by checking the shape of both datasets.
# they should have the same number of rows and cond should have an additional 4 diagnoses columns

print(enc1.shape)
print(cond.shape)

### Fixing mixed datatypes

In [None]:
cond.info()

In [None]:
cond.head()

In [None]:
# checking to see which columns have mixed datatypes

from pandas.api.types import infer_dtype

# print "<column name> - <inferred dtype>" for every column of cond
columns = cond.columns
for name in columns:
    print(' - '.join([name, infer_dtype(cond[name])]))

In [None]:
# patient ids and zip codes are identifiers, not quantities: store every value as a string
for id_like in ('patientid', 'patient zip'):
    cond[id_like] = cond[id_like].map(str)

In [None]:
# known bad entries mapped to their corrected values (exact matches only)
zip_fixes = {'2472': '37184', '37355-1424': '37355'}
city_fixes = {'TULLAHOMATULLAHOMA': 'TULLAHOMA', 'MANCH': 'MONTEAGLE'}

# replace incorrect zip code entries
cond['patient zip'] = cond['patient zip'].replace(zip_fixes)

# replace incorrect city entries
cond['patient city'] = cond['patient city'].replace(city_fixes)

In [None]:
# missing federal poverty levels are replaced by the mean of the known ones
fpl_col = 'ptnt  fpl'

# every value must be numeric before a mean can be taken
cond[fpl_col] = cond[fpl_col].map(float)

# the mean ignores nulls, so it only reflects the known values
mean_fpl = cond[fpl_col].mean()
cond[fpl_col] = cond[fpl_col].fillna(mean_fpl)

In [None]:
# placeholder written into each column wherever the value is null
null_placeholders = {
    # registration date: 'unknown'
    'patientregd': 'unknown',
    # reason for inactive status, race, ethnicity, language: 'unspecified'
    'ptnt rsn fr nctv stts': 'unspecified',
    'race': 'unspecified',
    'ethnicity': 'unspecified',
    'patient lang': 'unspecified',
    # if a value is null in the patient deceased column then they are still alive ??
    'ptnt dcsd ysn': 'still alive',
    # if a value is null in the outgoing referral column the patient has not gotten a referral ??
    'auth refto prvdr': 'no referral',
}

for column, placeholder in null_placeholders.items():
    cond[column] = cond[column].fillna(placeholder)

In [None]:
# filling in null patient county of residence values based on patient city
county_col = 'ptnt cnty f rsdnc'

# cities of the patients whose county is null
# (per the original analysis they all live in Watertown, which is in Wilson county)
missing_county = cond[county_col].isnull()
cond.loc[missing_county, 'patient city'].value_counts()

# filling all null counties with Wilson county
cond[county_col] = cond[county_col].fillna('Wilson')

In [None]:
cond.info()

In [None]:
# fixing null values for the diagnosis columns (icd9, problem description, srv diagnosis)

# NOTE: will do this once we talk to Emilie

### Adding columns

In [None]:
# CREATING THE AGE COL
## description: a column that represents the patients age at the time of the clinic encounter

from datetime import datetime

# Parse both date columns (format 'mm/dd/yyyy')
dob = pd.to_datetime(cond['patientdob'], format='%m/%d/%Y')
encdate = pd.to_datetime(cond['cln enc date'], format='%m/%d/%Y')

# Age in completed years, computed from the calendar rather than from
# `days // 365.25`: the day-count shortcut is off by one around birthdays
# (e.g. exactly one year = 365 days -> 365 // 365.25 == 0).
# True where the birthday has not yet been reached in the encounter year.
birthday_pending = (encdate.dt.month < dob.dt.month) | (
    (encdate.dt.month == dob.dt.month) & (encdate.dt.day < dob.dt.day)
)
age_years = encdate.dt.year - dob.dt.year - birthday_pending.astype(int)

# Create the new column and make the ages ints
# (as before, a missing date makes int() raise instead of silently storing NaN)
cond['age'] = [int(x) for x in age_years]

# creating year column

In [None]:
# CREATING THE YEAR COL
## description: a column that shows what year the patient encounter took place

# function that returns the year from a date format of 'mm/dd/year'
def dayToYear(day):
    """Return the text after the last '/' in `day` (the year, as a string)."""
    return day.rsplit('/', 1)[-1]

# creating the new column: the encounter year (kept as a string) for every row
cond['enc year'] = cond['cln enc date'].map(dayToYear)

In [None]:
import pandas as pd
# Create 'year' column (registration year) from 'patientregd' column.
# Null registration dates were filled with the string 'unknown' earlier in the
# notebook, so unparseable entries are coerced to NaT (year becomes NaN)
# instead of making pd.to_datetime raise.
cond['year'] = pd.to_datetime(cond['patientregd'], errors='coerce').dt.year


# distance

In [None]:
# CREATING THE DISTANCE COL
## description: a column that represents the distance between patients and the clinic (calculated by zip code)

import pgeocode

# function that gets the distance between two zip codes using the pgeocode package
def get_distance(x, y):
    """Query pgeocode's US `GeoDistance` for the distance from `x` to every zip in `y`.

    `y` must expose `.values` (e.g. a pandas Series). The result of
    `query_postal_code` is returned unchanged (kilometres, going by the
    original variable name `distance_in_kms`).
    """
    return pgeocode.GeoDistance('us').query_postal_code(x, y.values)

# creating the new column: distance from zip code 37388 to each patient zip
cond['distance'] = get_distance('37388', cond['patient zip'])

In [None]:
# Convert distance from km to miles.
# The conversion is applied to `cond`: `patients` does not exist yet at this
# point of the notebook and is (re)built from `cond` afterwards, so converting
# `patients` here either fails or is lost when `patients` is rebuilt.
# NOTE(review): run this cell once only - re-running it converts twice.
cond['distance'] = cond['distance'] * 0.621371


In [None]:
patients.head()

In [None]:
# One row per patient: for every listed column keep the patient's first value.
patient_columns = [
    'patientsex', 'age', 'patientdob', 'patientregd', 'status',
    'ptnt rsn fr nctv stts', 'ptnt dcsd ysn', 'ptnt  fpl',
    'patient city', 'patient zip', 'ptnt cnty f rsdnc',
    'race', 'ethnicity', 'patient lang', 'appttype', 'prvdr',
    'icd10encounterdiagdescr', 'icd9encounterdiagdescr', 'enc srv diag',
    'patientsnomedproblemcodedesc', 'distance',
]
patients = cond.groupby('patientid').agg(dict.fromkeys(patient_columns, 'first')).reset_index()

In [None]:
# Parse the registration date; anything that is not a valid date becomes NaT
registered_on = pd.to_datetime(patients['patientregd'], errors='coerce')
patients['patientregd'] = registered_on

# The registration year goes into its own 'year' column
patients['year'] = registered_on.dt.year

In [None]:
# One row per patient, keeping the first value of every listed column.
# The original dict literal ended with a bare 'distance' key without a value,
# which is a SyntaxError. 'distance' is left out of the aggregation here
# because a later cell merges it in from `cond`.
patients = cond.groupby('patientid').agg({
    'patientsex': 'first',
    'age': 'first',
    'patientdob': 'first',
    'patientregd': 'first',
    'status': 'first',
    'ptnt rsn fr nctv stts': 'first',
    'ptnt dcsd ysn': 'first',
    'ptnt  fpl': 'first',
    'patient city': 'first',
    'patient zip': 'first',
    'ptnt cnty f rsdnc': 'first',
    'race': 'first',
    'ethnicity': 'first',
    'patient lang': 'first',
    'appttype': 'first',
    'prvdr': 'first',
    'icd10encounterdiagdescr': 'first',
    'icd9encounterdiagdescr': 'first',
    'enc srv diag': 'first',
    'patientsnomedproblemcodedesc': 'first',
}).reset_index()

In [None]:
# Turn 'patientregd' into datetimes (invalid dates become NaT) ...
regd_as_datetime = pd.to_datetime(patients['patientregd'], errors='coerce')

# ... then store both the parsed date and its year
patients['patientregd'] = regd_as_datetime
patients['year'] = patients['patientregd'].dt.year

In [None]:

# Attach each patient's distance from `cond`, matching on 'patientid'.
# `cond` holds one row per encounter, so it is first reduced to one row per
# patient; merging it as-is would repeat every patient once per encounter.
# An existing 'distance' column is dropped before merging so the result never
# contains 'distance_x'/'distance_y' and the cell can be re-run safely.
patient_distance = cond[['patientid', 'distance']].drop_duplicates('patientid')
patients = pd.merge(patients.drop(columns='distance', errors='ignore'),
                    patient_distance, on='patientid', how='left')


In [None]:
patients.shape

In [None]:
# remove (in place) every patient whose distance is greater than 100
too_far = patients['distance'] > 100
patients.drop(index=patients.index[too_far], inplace=True)

In [None]:
import pandas as pd
import plotly.express as px

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Registration date as datetime, and its year in a 'registration_year' column
patients['patientregd'] = pd.to_datetime(patients['patientregd'])
patients['registration_year'] = patients['patientregd'].dt.year

# Registrations per year, as a two-column table ordered by year
clinic_usage = (
    patients['registration_year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Number of Registrations')
    .sort_values('Year')
)

# Line graph of the yearly registration counts
fig = px.line(clinic_usage, x='Year', y='Number of Registrations',
              labels={'Year': 'Year', 'Number of Registrations': 'Number of Registrations'},
              title='Clinic Usage Over Years')

fig.show()


In [None]:
patients.shape

In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

import plotly.offline as pyo
import plotly.io as pio
pio.renderers.default = 'notebook'
pyo.init_notebook_mode(connected=True)

def plot_gender_distribution(year):
    """Show a pie chart of 'patientsex' counts for patients whose 'year' equals `year`."""
    in_year = patients['year'] == year
    gender_counts = patients.loc[in_year, 'patientsex'].value_counts()
    px.pie(values=gender_counts, names=gender_counts.index,
           title=f"Gender Distribution for Year {year}").show()

# Slider over the range of years present in `patients`, starting at the earliest
first_year, last_year = patients['year'].min(), patients['year'].max()
year_slider = widgets.IntSlider(value=first_year, min=first_year, max=last_year,
                                step=1, description='Year')

widgets.interactive(plot_gender_distribution, year=year_slider)


In [None]:
# Create a function to plot the interactive bar chart of the most common diagnoses
def plot_top_10_diagnoses(year):
    """Show a bar chart of the 10 most frequent SNOMED diagnoses for `year`.

    Each row of `patients` whose 'year' equals `year` contributes one count to
    every diagnosis in its 'patientsnomedproblemcodedesc' set; cells that are
    not sets (e.g. NaN) are ignored.
    """
    from collections import Counter

    # Filter the DataFrame based on the selected year
    filtered_data = patients[patients['year'] == year]

    # Count, in a single pass, how many rows list each diagnosis
    diagnoses_counts = Counter()
    for patient_diag in filtered_data['patientsnomedproblemcodedesc']:
        if isinstance(patient_diag, set):
            diagnoses_counts.update(patient_diag)

    # The 10 most frequent diagnoses, most frequent first
    popular_diag = diagnoses_counts.most_common(10)

    def description(diag):
        # entries look like '<code>: <description>'; fall back to the whole
        # entry when there is no ': ' separator instead of raising IndexError
        return diag.partition(': ')[2] or diag

    # Create data for the bar chart
    data = pd.DataFrame({'Diagnosis': [description(diag) for diag, count in popular_diag],
                         'Number of Patients': [count for diag, count in popular_diag]})

    # Create an interactive bar chart
    fig = px.bar(data, x='Diagnosis', y='Number of Patients',
                 labels={'Diagnosis': 'Diagnosis', 'Number of Patients': 'Number of Patients'},
                 title=f'Top 10 Diagnoses for Year {year}')

    fig.show()

# Create a slider widget to select the year
year_slider = widgets.IntSlider(min=patients['year'].min(), max=patients['year'].max(),
                                step=1, value=patients['year'].min(),
                                description='Year')

# Link the slider to the plot function

widgets.interactive(plot_top_10_diagnoses, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

import plotly.offline as pyo
import plotly.io as pio
pio.renderers.default = 'notebook'
pyo.init_notebook_mode(connected=True)
# Create a function to plot the top 10 conditions
def plot_top_10_conditions(year):
    """Show a bar chart of the 10 conditions with the most distinct patients in `year`.

    Uses the encounters in `cond` whose 'enc year' equals `year`. Every
    condition in an encounter's 'patientsnomedproblemcodedesc' set is
    attributed to that encounter's patient, and each patient is counted once
    per condition.
    """
    condition_col = 'patientsnomedproblemcodedesc'

    # 'enc year' is stored as text while the slider supplies an int, so compare numerically
    enc_years = pd.to_numeric(cond['enc year'], errors='coerce')
    filtered_data = cond.loc[enc_years == year, ['patientid', condition_col]]

    # Keep only rows that hold a set of conditions (others are NaN), then
    # expand to one row per (patient, condition). Sets are not hashable, so
    # they cannot be used as group keys directly.
    has_conditions = filtered_data[condition_col].map(lambda cell: isinstance(cell, set)).astype(bool)
    per_condition = filtered_data[has_conditions].copy()
    per_condition[condition_col] = per_condition[condition_col].map(list)
    per_condition = per_condition.explode(condition_col)

    # Count the unique patients for each condition and keep the 10 largest
    conditions_counts = per_condition.groupby(condition_col)['patientid'].nunique()
    conditions_counts = conditions_counts.sort_values(ascending=False)[:10]

    # Create data for the plot
    data = pd.DataFrame({'Condition': conditions_counts.index,
                         'Number of Patients': conditions_counts.values})

    # Create an interactive bar plot
    fig = px.bar(data, x='Condition', y='Number of Patients',
                 labels={'Condition': 'Condition', 'Number of Patients': 'Number of Patients'},
                 title=f'Top 10 Conditions for Year {year}')

    fig.show()

# Create a slider widget to select the year (bounds taken from the numeric encounter years)
encounter_years = pd.to_numeric(cond['enc year'], errors='coerce')
year_slider = widgets.IntSlider(min=int(encounter_years.min()), max=int(encounter_years.max()),
                                step=1, value=int(encounter_years.min()),
                                description='Year')

# Link the slider to the plot function
widgets.interactive(plot_top_10_conditions, year=year_slider)


In [None]:
patients.head()

In [None]:
cond.head()

In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Bar chart of patient counts per ethnicity for one year
def plot_ethnicity_distribution(year):
    """Show how many rows of `patients` with 'year' == `year` fall in each 'ethnicity' value."""
    in_year = patients['year'] == year
    ethnicity_counts = patients.loc[in_year, 'ethnicity'].value_counts()

    # Two-column table for plotting: the category and its count
    data = pd.DataFrame({'Ethnicity': ethnicity_counts.index,
                         'Number of Patients': ethnicity_counts.values})

    px.bar(data, x='Ethnicity', y='Number of Patients',
           labels={'Ethnicity': 'Ethnicity', 'Number of Patients': 'Number of Patients'},
           title=f'Patients Ethnicity Distribution for Year {year}').show()

# Slider spanning the years present in `patients`, starting at the earliest
earliest, latest = patients['year'].min(), patients['year'].max()
year_slider = widgets.IntSlider(value=earliest, min=earliest, max=latest,
                                step=1, description='Year')

# Re-draw the chart whenever the slider moves
widgets.interactive(plot_ethnicity_distribution, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

def plot_age_distribution(year):
    """Show a 20-bin histogram of 'age' for rows of `patients` whose 'year' equals `year`."""
    selected = patients[patients['year'] == year]

    fig = px.histogram(selected, x='age', nbins=20, width=600,
                       labels={'age': 'Age'},
                       title=f'Age Distribution for Year {year}')
    fig.update_layout(showlegend=False)
    fig.show()

# Slider spanning the years present in `patients`, starting at the earliest
year_low = patients['year'].min()
year_high = patients['year'].max()
year_slider = widgets.IntSlider(value=year_low, min=year_low, max=year_high,
                                step=1, description='Year')

# Re-draw the histogram whenever the slider moves
widgets.interactive(plot_age_distribution, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Line graph of how many patients share each distance value, for one year
def plot_distance_line_graph(year):
    """Plot the number of `patients` rows per distinct 'distance' value for `year`."""
    selected = patients[patients['year'] == year]

    # one row per distinct distance, with the number of rows at that distance
    per_distance = selected.groupby('distance').size()
    grouped_data = per_distance.reset_index(name='count')

    px.line(grouped_data, x='distance', y='count',
            labels={'distance': 'Distance Traveled', 'count': 'Patient Count'},
            title=f'Patient Count vs Distance Traveled for Year {year}').show()

# Slider spanning the years present in `patients`, starting at the earliest
year_bounds = (patients['year'].min(), patients['year'].max())
year_slider = widgets.IntSlider(value=year_bounds[0], min=year_bounds[0], max=year_bounds[1],
                                step=1, description='Year')

# Re-draw the graph whenever the slider moves
widgets.interactive(plot_distance_line_graph, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Histogram of the 'distance' column for one year
def plot_distance_histogram(year):
    """Show a 20-bin histogram of 'distance' for rows of `patients` with 'year' == `year`."""
    selected = patients[patients['year'] == year]

    px.histogram(selected, x='distance', nbins=20,
                 title=f'Distance Traveled in Miles {year}',
                 labels={'distance': 'Distance', 'count': 'Patient Count'}).show()

# Slider spanning the years present in `patients`, starting at the earliest
min_year = patients['year'].min()
max_year = patients['year'].max()
year_slider = widgets.IntSlider(value=min_year, min=min_year, max=max_year,
                                step=1, description='Year')

# Re-draw the histogram whenever the slider moves
widgets.interactive(plot_distance_histogram, year=year_slider)


In [None]:
pip install plotly

In [None]:
pip install ipywidgets

In [None]:
patients.head()

In [None]:

# Attach each patient's distance from `cond`, matching on 'patientid'.
# `cond` holds one row per encounter, so it is first reduced to one row per
# patient; merging it as-is would repeat every patient once per encounter.
# `patients` normally already has a 'distance' column at this point, so it is
# dropped before merging; otherwise the merge would produce
# 'distance_x'/'distance_y' and later `patients['distance']` lookups would fail.
distance_by_patient = cond[['patientid', 'distance']].drop_duplicates('patientid')
patients = pd.merge(patients.drop(columns='distance', errors='ignore'),
                    distance_by_patient, on='patientid', how='left')


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Scatter plot of distance per patient for one year
def plot_distance_traveled(year):
    """Scatter 'distance' against 'patientid' for `year`, coloured by sex and sized by age."""
    selected = patients[patients['year'] == year]

    axis_labels = {'distance': 'Distance Traveled', 'patientid': 'Patient ID', 'age': 'Age'}
    fig = px.scatter(selected, x='patientid', y='distance',
                     color='patientsex', size='age',
                     color_discrete_sequence=px.colors.qualitative.Plotly,
                     labels=axis_labels,
                     title=f'Distance Traveled by Each Patient for Year {year}')
    fig.show()

# Slider spanning the years present in `patients`, starting at the earliest
start_year, end_year = patients['year'].min(), patients['year'].max()
year_slider = widgets.IntSlider(value=start_year, min=start_year, max=end_year,
                                step=1, description='Year')

# Re-draw the plot whenever the slider moves
widgets.interactive(plot_distance_traveled, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Create a function to plot the interactive scatter plot
def plot_distance_vs_age(year):
    """Scatter 'distance' against age range for rows of `patients` with 'year' == `year`.

    Ages outside every range (below 18, or missing) keep the empty label ''.
    """
    # Filter the DataFrame based on the selected year
    filtered_data = patients[patients['year'] == year].copy()

    # Define the age ranges (inclusive on both ends).
    # The last range starts at 65: it previously started at 71, which left
    # ages 65-70 without any label.
    age_ranges = [(18, 25), (26, 34), (35, 44), (45, 54), (55, 64), (65, float('inf'))]

    # Assign age range labels based on age values
    filtered_data['age_range'] = ''
    for start, end in age_ranges:
        in_range = filtered_data['age'].between(start, end)
        filtered_data.loc[in_range, 'age_range'] = f'{start}-{end}'

    # Plot the interactive scatter plot
    fig = px.scatter(filtered_data, x='age_range', y='distance', color='age_range',
                     labels={'age_range': 'Age Range', 'distance': 'Distance'},
                     title=f'Relationship Between Distance and Age Range (Year {year})')

    fig.show()

# Create a slider widget to select the year
year_slider = widgets.IntSlider(min=patients['year'].min(), max=patients['year'].max(),
                                step=1, value=patients['year'].min(),
                                description='Year')

# Link the slider to the plot function
widgets.interactive(plot_distance_vs_age, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Bar plot of the mean distance per age range for one year
def plot_distance_vs_age(year):
    """Show the mean 'distance' per age range for rows of `patients` with 'year' == `year`.

    Rows whose age falls in no range keep the empty label '' and form their own group.
    """
    selected = patients[patients['year'] == year].copy()

    # Age ranges, inclusive on both ends
    age_ranges = [(0,17),(18, 25), (26, 35), (36, 50), (51, 70), (71, float('inf'))]

    # Label every row with the range its age falls into
    selected['age_range'] = ''
    for start, end in age_ranges:
        matches = (selected['age'] >= start) & (selected['age'] <= end)
        selected.loc[matches, 'age_range'] = f'{start}-{end}'

    # Mean distance within each age range
    avg_distance = selected.groupby('age_range')['distance'].mean().reset_index()

    px.bar(avg_distance, x='age_range', y='distance',
           labels={'age_range': 'Age Range', 'distance': 'Average Distance'},
           title=f'Average Distance by Age Range (Year {year})').show()

# Slider spanning the years present in `patients`, starting at the earliest
lowest_year, highest_year = patients['year'].min(), patients['year'].max()
year_slider = widgets.IntSlider(value=lowest_year, min=lowest_year, max=highest_year,
                                step=1, description='Year')

# Re-draw the plot whenever the slider moves
widgets.interactive(plot_distance_vs_age, year=year_slider)


In [None]:
min_age = patients['age'].min()
print (min_age)

In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

import plotly.offline as pyo
import plotly.io as pio
pio.renderers.default = 'notebook'
pyo.init_notebook_mode(connected=True)

def plot_gender_distribution(year):
    """Show a bar chart of 'patientsex' counts for rows of `patients` with 'year' == `year`."""
    in_year = patients['year'] == year
    # two-column table: the sex value and how many rows have it
    gender_counts = (
        patients.loc[in_year, 'patientsex']
        .value_counts()
        .rename_axis('Gender')
        .reset_index(name='Count')
    )
    px.bar(gender_counts, x='Gender', y='Count', title=f"Gender Distribution for Year {year}").show()

# Slider spanning the years present in `patients`, starting at the earliest
slider_min, slider_max = patients['year'].min(), patients['year'].max()
year_slider = widgets.IntSlider(value=slider_min, min=slider_min, max=slider_max,
                                step=1, description='Year')

widgets.interactive(plot_gender_distribution, year=year_slider)


In [None]:
patients.shape

In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Assuming 'patients' is your dataset, if it's different, please replace it accordingly
# For example: patients = pd.read_csv('your_dataset.csv')

# Drop rows with missing registration years
patients_cleaned = patients.dropna(subset=['registration_year'])

# Define the age ranges (inclusive on both ends)
age_ranges = [(0, 17), (18, 25), (26, 35), (36, 50), (51, 70), (71, float('inf'))]

# Create a function to plot the interactive count plot
def plot_age_count(year):
    """Show how many patients registered in `year` fall into each age range."""
    filtered_data = patients_cleaned[patients_cleaned['registration_year'] == year].copy()

    # Assign age range labels based on age values.
    # The bin edges are the lower bounds of the ranges, so bins must be closed
    # on the LEFT (right=False): [0, 18), [18, 26), ... [71, inf).
    # With the default right-closed bins, age 18 was labelled '0-17' (and
    # likewise 26, 36, 51, 71 fell one range too low) and age 0 got no label.
    filtered_data['age_range'] = pd.cut(filtered_data['age'],
                                        bins=[x[0] for x in age_ranges] + [age_ranges[-1][1]],
                                        labels=['0-17', '18-25', '26-35', '36-50', '51-70', '71+'],
                                        right=False)

    fig = px.histogram(filtered_data, x='age_range', title=f"Age Range Count for Year {year}")
    fig.update_layout(bargap=0.1)
    fig.show()

# Get the unique registration years from the cleaned dataset
years = patients_cleaned['registration_year'].unique()

# Create a slider widget to select the year
year_slider = widgets.IntSlider(min=int(years.min()), max=int(years.max()), step=1, value=int(years.min()), description='Year')

# Link the slider widget to the plot function
widgets.interactive(plot_age_count, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

def plot_top_10_diagnoses(year):
    """Show a bar chart of the 10 most frequent SNOMED diagnoses for `year`.

    Each row of `patients` whose 'year' equals `year` contributes one count to
    every diagnosis in its 'patientsnomedproblemcodedesc' set; cells that are
    not sets (e.g. NaN) are ignored. Shaded bands labelled with age ranges are
    drawn over the first six x positions.
    """
    from collections import Counter

    # Filter the DataFrame based on the selected year
    filtered_data = patients[patients['year'] == year]

    # Count, in a single pass, how many rows list each diagnosis
    diagnoses_counts = Counter()
    for patient_diag in filtered_data['patientsnomedproblemcodedesc']:
        if isinstance(patient_diag, set):
            diagnoses_counts.update(patient_diag)

    # The 10 most frequent diagnoses, most frequent first
    popular_diag = diagnoses_counts.most_common(10)

    def description(diag):
        # entries look like '<code>: <description>'; fall back to the whole
        # entry when there is no ': ' separator instead of raising IndexError
        return diag.partition(': ')[2] or diag

    # Create data for the bar chart
    data = pd.DataFrame({'Diagnosis': [description(diag) for diag, count in popular_diag],
                         'Number of Patients': [count for diag, count in popular_diag]})

    # Create an interactive bar chart
    fig = px.bar(data, x='Diagnosis', y='Number of Patients',
                 labels={'Diagnosis': 'Diagnosis', 'Number of Patients': 'Number of Patients'},
                 title=f'Top 10 Diagnoses for Year {year}')
    
    # Add age range annotations to the plot
    # NOTE(review): the bands are positioned by list index (0..5) on the
    # diagnosis axis, so they are not derived from any patient's age - confirm
    # what this overlay is meant to convey.
    age_ranges = [(0, 17), (18, 25), (26, 35), (36, 50), (51, 70), (71, float('inf'))]
    for i, age_range in enumerate(age_ranges):
        fig.add_shape(type="rect",
                      xref="x", yref="paper",
                      x0=i-0.5, x1=i+0.5,
                      y0=0, y1=1,
                      fillcolor="LightSalmon",
                      opacity=0.2,
                      layer="below"
                      )
        fig.add_annotation(text=str(age_range),
                           x=i, y=1.02,
                           showarrow=False,
                           font=dict(size=10),
                           xanchor='center', yanchor='bottom')
    
    fig.update_layout(yaxis=dict(title='Number of Patients'),
                      xaxis=dict(title='Diagnosis'),
                      bargap=0.2)
    fig.show()

# Create a slider widget to select the year
year_slider = widgets.IntSlider(min=patients['year'].min(), max=patients['year'].max(),
                                step=1, value=patients['year'].min(),
                                description='Year')

# Link the slider to the plot function
widgets.interactive(plot_top_10_diagnoses, year=year_slider)


In [None]:
import pandas as pd
import plotly.express as px

# Top 10 diagnoses by age range.
# 'patientsnomedproblemcodedesc' holds a SET of diagnoses per patient, and sets
# cannot be counted or matched directly (value_counts / isin need hashable
# values), so the column is expanded to one row per (patient, diagnosis) first.
diagnosis_col = 'patientsnomedproblemcodedesc'

# Define the age ranges (inclusive on both ends)
age_ranges = [(0, 17), (18, 25), (26, 35), (36, 50), (51, 70), (71, float('inf'))]
age_labels = ['{}-{}'.format(low, high) for low, high in age_ranges]

# Create a new column 'Age Range' based on the age ranges.
# Edges sit half a year below each lower bound so whole-number ages fall in the
# range that names them. The source column is 'age' (lower case).
age_bins = [low - 0.5 for low, _ in age_ranges] + [age_ranges[-1][1]]
patients['Age Range'] = pd.cut(patients['age'], bins=age_bins, labels=age_labels)

# One row per (patient, diagnosis); rows without a diagnosis set are skipped
has_diagnoses = patients[diagnosis_col].map(lambda cell: isinstance(cell, set)).astype(bool)
per_diagnosis = patients.loc[has_diagnoses, ['patientid', 'Age Range', diagnosis_col]].copy()
per_diagnosis[diagnosis_col] = per_diagnosis[diagnosis_col].map(list)
per_diagnosis = per_diagnosis.explode(diagnosis_col).drop_duplicates()

# Get the top 10 diagnoses based on how many patients have them
top_10_diagnoses = per_diagnosis[diagnosis_col].value_counts().nlargest(10).index

# Filter the expanded rows for the top 10 diagnoses
filtered_patients = per_diagnosis[per_diagnosis[diagnosis_col].isin(top_10_diagnoses)]

# Create an interactive visualization using Plotly
fig = px.histogram(filtered_patients, x='Age Range', color=diagnosis_col,
                   title='Top 10 Diagnoses by Age Range',
                   labels={'Age Range': 'Age Range', 'count': 'Count'})

# Order the x axis by the age-range labels actually used in the data
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': age_labels},
                  barmode='group')

fig.show()


In [None]:
patients.head()

In [None]:
pip install jupyter-dash dash

In [None]:
from jupyter_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

In [None]:
patients["distance"]

In [None]:
pip install ipywidgets

In [None]:
pip install plotly

In [None]:
# Diagnosis descriptions (text only, without the code prefix) used by the
# following cells to keep only the rows of df1 whose description is in this list.
top_10_diagnoses=['Hypertensive disorder',
 'Hyperlipidemia',
 'Diabetes mellitus',
 'Depressive disorder',
 'Hypercholesterolemia',
 'Anxiety',
 'Vitamin D deficiency',
 'Hypothyroidism',
 'Smokes tobacco daily',
 'Gastroesophageal reflux disease']

In [None]:
# Unique patients per diagnosis and age range for encounter year 2019.
# Requires enc['age'] and enc['enc year'], which are created in other cells.
age_ranges = ['18-32', '33-48', '49-63']
enc['age'] = enc['age'].astype(int)

# Label each row with its age range; ages outside 18-63 are labelled 'other'
enc['age range'] = enc['age'].apply(lambda x: age_ranges[0] if 18 <= x <= 32 else (
    age_ranges[1] if 33 <= x <= 48 else (
        age_ranges[2] if 49 <= x <= 63 else 'other')))

# dropna() returns a new frame; calling dropna(inplace=True) on a column
# selection of `enc` operates on a slice and triggers SettingWithCopyWarning
df = enc[['patientsnomedproblemcodedesc', 'age range', 'patientid', 'enc year']].dropna()

# Distinct patients per (diagnosis, age range, encounter year)
df1 = df.groupby(['patientsnomedproblemcodedesc', 'age range', 'enc year'])['patientid'].nunique().reset_index()

# Keep only the description part of '<code>: <description>'
df1['patientsnomedproblemcodedesc'] = df1['patientsnomedproblemcodedesc'].apply(lambda diag: diag.split(': ')[1])
df1 = df1[df1['patientsnomedproblemcodedesc'].isin(top_10_diagnoses)]

# 'enc year' is stored as text, hence the string comparison
df2=df1[df1['enc year']=='2019']
sns.barplot(data = df2, x = 'patientsnomedproblemcodedesc', y = 'patientid', hue = 'age range')
plt.xticks(rotation = 60, ha = 'right') 

In [None]:
enc.columns

In [None]:
# function that returns the year from a date format of 'mm/dd/year'
def dayToYear(day):
    """Split `day` on '/' and return the final piece (the year, as a string)."""
    *_, year = day.split('/')
    return year
# add an 'enc year' column holding the year part of every encounter date
enc['enc year'] = enc['cln enc date'].map(dayToYear)

In [None]:
import pandas as pd
import datetime

# Works on the DataFrame named 'enc'
# Parse the birth dates into datetimes
enc['patientdob'] = pd.to_datetime(enc['patientdob'])
# Time elapsed between each birth date and the moment this cell runs
_time_since_birth = datetime.datetime.now() - enc['patientdob']
# Floor-divide by a 365.25-day year to get the age in whole years
enc['age'] = _time_since_birth // pd.Timedelta(days=365.25)


In [None]:
# Preview the first rows of `enc`
enc.head()


In [None]:
# year extractor for dates formatted as 'mm/dd/year'
def dayToYear(day):
    """Return the last '/'-separated piece of `day`."""
    pieces = day.rsplit('/', 1)
    return pieces[-1]
# build the 'enc year' column from the encounter dates, one year string per row
enc['enc year'] = list(map(dayToYear, enc['cln enc date']))

In [None]:
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display

# Interactive chart: distinct patients per top-10 diagnosis, stacked by age
# range, for the encounter year picked with a slider.
# Reads `enc` (needs the 'age' and 'enc year' columns) and `top_10_diagnoses`.

# Define age ranges
age_ranges = ['18-32', '33-48', '49-63']

# Convert 'age' column to integer
enc['age'] = enc['age'].astype(int)

# Assign age ranges based on 'age' column; ages outside 18-63 become 'other'
enc['age range'] = enc['age'].apply(
    lambda x: age_ranges[0] if 18 <= x <= 32
    else age_ranges[1] if 33 <= x <= 48
    else age_ranges[2] if 49 <= x <= 63
    else 'other'
)

# Select the needed columns and drop incomplete rows. dropna() returns a new
# frame, so nothing is modified in place on a column selection of `enc`.
df = enc[['patientsnomedproblemcodedesc', 'age range', 'patientid', 'enc year']].dropna()

# Group the data and calculate unique patient count
df1 = df.groupby(['patientsnomedproblemcodedesc', 'age range', 'enc year'])['patientid'].nunique().reset_index()

# Keep the text after the first ': '. Values without that separator become NaN
# (and are removed by the top-10 filter below) instead of raising IndexError.
df1['patientsnomedproblemcodedesc'] = df1['patientsnomedproblemcodedesc'].str.split(': ', n=1).str[1]

# Restrict to the top-10 diagnoses
df1 = df1[df1['patientsnomedproblemcodedesc'].isin(top_10_diagnoses)]

# Remove NaN values from 'enc year' column
df1 = df1.dropna(subset=['enc year'])

# Get the available years for the slider
available_years = df1['enc year'].unique()

# Check if available_years is not empty
if available_years.size == 0:
    print("No data available for the selected diagnoses.")
else:
    # Create a slider widget for selecting the year
    year_slider = widgets.IntSlider(
        min=int(available_years.min()),
        max=int(available_years.max()),
        value=int(available_years.min()),
        description='Year:',
        continuous_update=False
    )

    # Figure that is rebuilt for every selected year, plus the output area it
    # is rendered into. fig.show() only draws a snapshot, so the figure has to
    # be shown again after each change for the display to follow the slider.
    fig = go.Figure()
    plot_output = widgets.Output()

    def update_plot(year):
        """Rebuild the traces and layout of `fig` for the given year."""
        # 'enc year' holds text while the slider yields an int, so both sides
        # are compared as strings; an int-vs-str comparison never matches.
        df_filtered = df1[df1['enc year'].astype(str) == str(year)]
        fig.data = []  # Clear previous data

        # One bar trace per age range
        for age_range in age_ranges:
            df_age = df_filtered[df_filtered['age range'] == age_range]
            fig.add_trace(go.Bar(
                x=df_age['patientsnomedproblemcodedesc'],
                y=df_age['patientid'],
                name=age_range
            ))

        fig.update_layout(
            title=f"Diagnoses by Age Range ({year})",
            xaxis_title='Diagnosis',
            yaxis_title='Number of Patients',
            barmode='stack'
        )

    def render_plot():
        """Replace the content of `plot_output` with the current state of `fig`."""
        plot_output.clear_output(wait=True)
        with plot_output:
            fig.show()

    # Define a callback function to update the plot when the slider value changes
    def on_slider_change(change):
        update_plot(change.new)
        render_plot()

    # Set the callback function for the slider widget
    year_slider.observe(on_slider_change, names='value')

    # Display the widgets, then draw the plot for the initial year
    display(widgets.VBox([year_slider, plot_output]))
    update_plot(year_slider.value)
    render_plot()


In [None]:
# Preview the first rows of `enc`
enc.head()

In [None]:
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display

# Interactive chart: distinct patients per top-10 diagnosis, stacked by age
# range, for the encounter year picked with a slider.
# Reads `enc` (needs the 'age' and 'enc year' columns) and `top_10_diagnoses`.

# Define age ranges
age_ranges = ['18-32', '33-48', '49-63']

# Convert 'age' column to integer
enc['age'] = enc['age'].astype(int)

# Assign age ranges based on 'age' column; ages outside 18-63 become 'other'
enc['age range'] = enc['age'].apply(
    lambda x: age_ranges[0] if 18 <= x <= 32
    else age_ranges[1] if 33 <= x <= 48
    else age_ranges[2] if 49 <= x <= 63
    else 'other'
)

# Drop rows with missing values in the necessary columns
# NOTE(review): this removes the rows from `enc` itself, so every later cell
# sees the reduced frame - confirm that is intended.
enc.dropna(subset=['patientsnomedproblemcodedesc', 'age range', 'patientid', 'enc year'], inplace=True)

# Group the data and calculate unique patient count
df1 = enc.groupby(['patientsnomedproblemcodedesc', 'age range', 'enc year'])['patientid'].nunique().reset_index()

# Keep the text after the first ': '. Values without that separator become NaN
# (and are removed by the top-10 filter below) instead of raising IndexError.
df1['patientsnomedproblemcodedesc'] = df1['patientsnomedproblemcodedesc'].str.split(': ', n=1).str[1]

# Filter the DataFrame to include only the top 10 diagnoses
df1 = df1[df1['patientsnomedproblemcodedesc'].isin(top_10_diagnoses)]

# Remove NaN values from 'enc year' column (reassigned rather than dropped in
# place, because `df1` is a filtered selection at this point)
df1 = df1.dropna(subset=['enc year'])

# Get the available years for the slider
available_years = df1['enc year'].unique()

# Check if available_years is not empty
if available_years.size == 0:
    print("No data available for the selected diagnoses.")
else:
    # Create a slider widget for selecting the year
    year_slider = widgets.IntSlider(
        min=int(available_years.min()),
        max=int(available_years.max()),
        value=int(available_years.min()),
        description='Year:',
        continuous_update=False
    )

    # Figure that is rebuilt for every selected year, plus the output area it
    # is rendered into. fig.show() only draws a snapshot, so the figure has to
    # be shown again after each change for the display to follow the slider.
    fig = go.Figure()
    plot_output = widgets.Output()

    def update_plot(year):
        """Rebuild the traces and layout of `fig` for the given year."""
        # 'enc year' holds text while the slider yields an int, so both sides
        # are compared as strings; an int-vs-str comparison never matches.
        df_filtered = df1[df1['enc year'].astype(str) == str(year)]
        fig.data = []  # Clear previous data

        # One bar trace per age range
        for age_range in age_ranges:
            df_age = df_filtered[df_filtered['age range'] == age_range]
            fig.add_trace(go.Bar(
                x=df_age['patientsnomedproblemcodedesc'],
                y=df_age['patientid'],
                name=age_range
            ))

        fig.update_layout(
            title=f"Diagnoses by Age Range ({year})",
            xaxis_title='Diagnosis',
            yaxis_title='Number of Patients',
            barmode='stack'
        )

    def render_plot():
        """Replace the content of `plot_output` with the current state of `fig`."""
        plot_output.clear_output(wait=True)
        with plot_output:
            fig.show()

    # Define a callback function to update the plot when the slider value changes
    def on_slider_change(change):
        update_plot(change.new)
        render_plot()

    # Set the callback function for the slider widget
    year_slider.observe(on_slider_change, names='value')

    # Display the widgets, then draw the plot for the initial year
    display(widgets.VBox([year_slider, plot_output]))
    update_plot(year_slider.value)
    render_plot()
