In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
import sqlite3
import json

In [None]:
from scipy import stats as ss
import statsmodels.api as sm

In [None]:
# Load the notebook configuration from the JSON file one directory up.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of depending on the platform's locale default.
with open("../config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [None]:
# Open the SQLite database whose path is stored under "database_path"
# in the config entry at index 1.
database_path = config[1]["database_path"]
conn_object = sqlite3.connect(database_path)

In [None]:
# Columns that are parsed as dates while loading; all use the same
# "%Y-%m-%d" format string.
date_columns = [
    "CLM_FROM_DT_INP",
    "CLM_THRU_DT_INP",
    "CLM_ADMSN_DT_INP",
    "NCH_BENE_DSCHRG_DT_INP",
    "Next_CLM_ADMSN_DT_INP",
    "CLM_FROM_DT_OUT",
    "CLM_THRU_DT_OUT",
    "BENE_BIRTH_DT",
    "BENE_DEATH_DT",
]

# Read every row and column of the final_readmission_df table into a DataFrame.
final_data = pd.read_sql_query(
    "select * from final_readmission_df",
    con=conn_object,
    parse_dates={column: {"format": "%Y-%m-%d"} for column in date_columns},
)

# L2. Formulate one hypothesis on this sample data that you would like to test, or that would be beneficial for targeted stakeholders to know, in order to validate your solution.

> Hypothesis 1 : Readmission and patient gender are related

$$
\begin{aligned}
H_{0} &: \ Readmission\_within\_30days\_INP \ and \ BENE\_SEX\_IDENT\_CD \ are \ not \ related \\
H_{a} &: \ Readmission\_within\_30days\_INP \ and \ BENE\_SEX\_IDENT\_CD \ are \ related
\end{aligned}
$$

In [None]:
# Contingency table of observed counts: rows are the BENE_SEX_IDENT_CD values,
# columns are the Readmission_within_30days_INP values.
Observed_df = pd.crosstab(
    index=final_data["BENE_SEX_IDENT_CD"],
    columns=final_data["Readmission_within_30days_INP"],
)

In [None]:
# Chi-square test of independence on the observed contingency table.
# The result unpacks to: test statistic, p-value, degrees of freedom and the
# table of expected frequencies.
# NOTE: SciPy's default (correction=True) applies Yates' continuity correction
# when the table has exactly one degree of freedom.
(chi2, p, dof, expected) = ss.chi2_contingency(Observed_df)

In [None]:
# Report the chi-square statistic and its p-value.
chi2_report = f"Chi2 statistics is {chi2} & p value is {p}"
print(chi2_report)

$$
p\text{-value} \geq 0.05 \ \Rightarrow \ \text{Failed to reject } H_{0}
$$

# L3. Provide summary statistics and inferences about the data using statistics.

In [None]:
# Names of the columns that are cast to the pandas "category" dtype.
# Candidates that were commented out and remain excluded: BENE_AGE_CAT,
# AT_PHYSN_NPI_OUT, AT_PHYSN_NPI_INP, OP_PHYSN_NPI_INP.
categorical_features = [
    # BENE_* beneficiary columns
    "BENE_SEX_IDENT_CD", "BENE_RACE_CD", "BENE_ESRD_IND",
    # SP_* columns
    "SP_ALZHDMTA", "SP_CHF", "SP_CHRNKIDN", "SP_CNCR",
    "SP_COPD", "SP_DEPRESSN", "SP_DIABETES", "SP_ISCHMCHT",
    "SP_OSTEOPRS", "SP_RA_OA", "SP_STRKETIA",
    "BENE_STATE_COUNTY_CODE",
    # *_INP provider, diagnosis and procedure category columns
    "PRVDR_NUM_CAT_INP",
    "ADMTNG_ICD9_DGNS_CD_CAT_INP",
    "ICD9_DGNS_CD_1_CAT_INP", "ICD9_DGNS_CD_2_CAT_INP", "ICD9_DGNS_CD_3_CAT_INP",
    "ICD9_DGNS_CD_4_CAT_INP", "ICD9_DGNS_CD_5_CAT_INP", "ICD9_DGNS_CD_6_CAT_INP",
    "ICD9_DGNS_CD_7_CAT_INP", "ICD9_DGNS_CD_8_CAT_INP", "ICD9_DGNS_CD_9_CAT_INP",
    "ICD9_PRCDR_CD_1_CAT_INP",
    # *_OUT provider, HCPCS and diagnosis category columns
    "PRVDR_NUM_CAT_OUT",
    "HCPCS_CD_1_CAT_OUT", "HCPCS_CD_2_CAT_OUT", "HCPCS_CD_3_CAT_OUT",
    "ICD9_DGNS_CD_1_CAT_OUT", "ICD9_DGNS_CD_2_CAT_OUT",
    # Readmission flag used in the hypothesis test
    "Readmission_within_30days_INP",
]

In [None]:
# Cast every column named in categorical_features to the "category" dtype,
# writing the converted columns back into final_data.
final_data[categorical_features] = (
    final_data[categorical_features]
    .astype("category")
)

In [None]:
# Summary statistics with describe()'s default column selection, which leaves
# out category-dtype columns in a mixed-dtype frame.
summary_stats = final_data.describe()
summary_stats

In [None]:
# Pairwise relationship plots for the first four numeric columns.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, :4]
    .pipe(sns.pairplot)
)

In [None]:
# Pairwise relationship plots for the numeric columns at positions 4 through 8.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, 4:9]
    .pipe(sns.pairplot)
)

In [None]:
# Count plot for the first 14 category-dtype columns. The columns are melted to
# long form so each column name is an x-axis group and each level is a hue.
plt.figure(figsize=(20, 10))
sns.countplot(
    data=final_data.select_dtypes(include="category").iloc[:, :14].melt(),
    x="variable",
    hue="value",
)
plt.xticks(rotation=90)

In [None]:
# Count plot for the category-dtype columns at positions 17 through 19, melted
# to long form (column name on the x-axis, level as hue).
plt.figure(figsize=(20, 10))
sns.countplot(
    data=final_data.select_dtypes(include="category").iloc[:, 17:20].melt(),
    x="variable",
    hue="value",
)
plt.xticks(rotation=90)

In [None]:
# Count plot for the category-dtype columns at positions 20 through 23, melted
# to long form (column name on the x-axis, level as hue).
plt.figure(figsize=(20, 10))
sns.countplot(
    data=final_data.select_dtypes(include="category").iloc[:, 20:24].melt(),
    x="variable",
    hue="value",
)
plt.xticks(rotation=90)

In [None]:
# Skewness of every numeric column.
(
    final_data
    .select_dtypes(include="number")
    .skew()
)

> The numeric columns are highly skewed, mostly to the right (positive skew).

> A log or square-root transform should be applied to reduce the skewness.

In [None]:
# Box plots of the first four numeric columns; outlier points are drawn as
# red-filled squares.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, :4]
    .boxplot(flierprops={"markerfacecolor": "r", "marker": "s"})
)
plt.xticks(rotation=90)

In [None]:
# Box plots of the numeric columns at positions 5, 7, 8, 9 and 10; outlier
# points are drawn as red-filled squares.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, [5, 7, 8, 9, 10]]
    .boxplot(flierprops={"markerfacecolor": "r", "marker": "s"})
)
plt.xticks(rotation=90)

In [None]:
# Box plots of the numeric columns at positions 4 and 6; outlier points are
# drawn as red-filled squares.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, [4, 6]]
    .boxplot(flierprops={"markerfacecolor": "r", "marker": "s"})
)
plt.xticks(rotation=90)

In [None]:
# Box plots of the numeric columns at positions 11, 12 and 14; outlier points
# are drawn as red-filled squares.
(
    final_data
    .select_dtypes(include="number")
    .iloc[:, [11, 12, 14]]
    .boxplot(flierprops={"markerfacecolor": "r", "marker": "s"})
)
plt.xticks(rotation=90)

> Almost all of the numerical columns have outliers on the high side, above the third quartile (Q3).