In [1]:
new_column_names = {
    'Age at delivery': 'Age',
    'Marital status (0 = unmarried, 1 = married)': 'MaritalStatus',
    'Ethnicity (0 = not hispanic/latino, 1 = hispanic/latino)': 'Ethnicity',
    'Race (1 = American Indian, 2 = Asian, 3 = Black/African American, 4 = Caucasian/White, 5 = Pacific Islander, 6 = Mixed, 7 = Other': 'Race',
    'Primary Language': 'PrimaryLanguage',
    'Translation Service Utilized (0 = none, 1 = yes)': 'TranslationService',
    'PCP on file (0 = none, 1 = yes)': 'PCPonFile',
    'Insurance Coverage (0 = none, 1 = Medicaid/Medicare, 2 = Other)': 'InsuranceCoverage',
    'Substance Use (0 = none, 1 = any ICD in ICD chart)': 'SubstanceUse',
    'Educational Attainment (0 = none listed, 1 = less than high school, 2 = high school graduate, 3 = some college, 4 = Bachelor and above': 'Education',
    'Employment Status (0 = not listed, 1 = unemployed, 2 = employed)': 'Employment',
    'History of Prenatal Care (0 = none, if yes list how many visits) ': 'PrenatalCareHistory',
    'Maternal Pathologies (0 = none, If present list ICD)': 'MaternalPathologies',
    'Pregnancy Complications (0 = none, If present list ICD)': 'PregnancyComplications',
    'Parity (0 = nulliparous, 1 = 1 prior child, etc.)': 'Parity'
}

# Rename the columns

In [2]:
import pandas as pd

# read xlsx file

df_out_of_hospital = pd.read_excel('data.xlsx', sheet_name='UOHB Maternal Characteristics')
df_in_hospital = pd.read_excel('data.xlsx', sheet_name='IHB Maternal Characteristics')

# rename column to "Pregnancy Complications"
df_out_of_hospital.rename(columns=new_column_names, inplace=True)
df_in_hospital.rename(columns=new_column_names, inplace=True)

df_out_of_hospital['PregnancyComplicationsCount'] = df_out_of_hospital['PregnancyComplications'].apply(lambda x: len(str(x).split(',')) if x else 0)
df_in_hospital['PregnancyComplicationsCount'] = df_in_hospital['PregnancyComplications'].apply(lambda x: len(str(x).split(',')) if x else 0)

df_out_of_hospital['PregnancyComplications'] = df_out_of_hospital['PregnancyComplications'].apply(lambda x: 1 if x else 0)
df_in_hospital['PregnancyComplications'] = df_in_hospital['PregnancyComplications'].apply(lambda x: 1 if x else 0)

In [3]:
import pandas as pd

# read xlsx file

df_baby_out_of_hospital = pd.read_excel('data.xlsx', sheet_name='UOHB Neonates Characteristics')
df_baby_in_hospital = pd.read_excel('data.xlsx', sheet_name='IHB Neonates Characteristics')

# rename column to "Pregnancy Complications"
#df_baby_out_of_hospital.rename(columns=new_column_names, inplace=True)
#df_baby_in_hospital.rename(columns=new_column_names, inplace=True)

df = pd.concat([df_out_of_hospital, df_in_hospital], keys=['Out of Hospital', 'In Hospital'])
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'OutofHospital'}, inplace=True)
df['OutofHospital'] = df['OutofHospital'].apply(lambda x: 1 if x == 'Out of Hospital' else 0)


In [4]:
df = pd.concat([df_in_hospital, df_out_of_hospital], keys=['Out of Hospital', 'In Hospital'])
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'OutofHospital'}, inplace=True)
#df.Parity = df.Parity.apply(lambda x: f"{x} prior children" if x > 1 else f"No prior child")
df['OutofHospital'] = df['OutofHospital'].apply(lambda x: 1 if x == 'Out of Hospital' else 0)
mapping_dictionaries = {
    'MaritalStatus': {
        0: 'unmarried',
        1: 'married'
    },
    'Ethnicity': {
        0: 'not hispanic/latino',
        1: 'hispanic/latino'
    },
    'Race': {
        1: 'American Indian',
        2: 'Asian',
        3: 'Black/African American',
        4: 'Caucasian/White',
        5: 'Pacific Islander',
        6: 'Mixed',
        7: 'Other'
    },
    'TranslationService': {
        0: 'none',
        1: 'yes'
    },
    'PCPonFile': {
        0: 'none',
        1: 'yes'
    },
    'InsuranceCoverage': {
        0: 'none',
        1: 'Medicaid/Medicare',
        2: 'Other'
    },
    'SubstanceUse': {
        0: 'none',
        1: 'any ICD in ICD chart'
    },
    'Education': {
        0: 'none listed',
        1: 'less than high school',
        2: 'high school graduate',
        3: 'some college',
        4: 'Bachelor and above'
    },
    'Employment': {
        0: 'not listed',
        1: 'unemployed',
        2: 'employed'
    },
}

# Replacing numerical values with categories for each column in the DataFrame
for column, mapping in mapping_dictionaries.items():
    df[column] = df[column].replace(mapping)

df.Race.replace({0: "Other"}, inplace=True)

In [5]:
df

Unnamed: 0,OutofHospital,level_1,Age,MaritalStatus,Ethnicity,Race,PrimaryLanguage,TranslationService,PCPonFile,InsuranceCoverage,SubstanceUse,Education,Employment,PrenatalCareHistory,MaternalPathologies,PregnancyComplications,Parity,PregnancyComplicationsCount
0,1,0,34,unmarried,hispanic/latino,Other,Spanish,yes,none,none,none,none listed,unemployed,"Yes, unknown number of visits",O13.9,1,2,2
1,1,1,29,married,not hispanic/latino,Black/African American,Urdu,none,yes,Medicaid/Medicare,none,none listed,unemployed,"Yes, unknown number of visits",E66.9,0,4,0
2,1,2,30,unmarried,hispanic/latino,Other,Spanish,yes,yes,none,none,none listed,unemployed,8,"J45, E11.9, I10",0,3,0
3,1,3,19,unmarried,hispanic/latino,Other,English,none,yes,Other,none,none listed,unemployed,13,"O26.613, K83.1, A74.9, O09.899, Z28.39, O99.019",0,0,0
4,1,4,24,unmarried,hispanic/latino,Other,English,none,none,Medicaid/Medicare,none,none listed,unemployed,9,"O23.40, Z86.50, E66.9",1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0,61,36,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,none listed,employed,0,0,0,6,0
134,0,62,38,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,none listed,unemployed,7,0,1,3,1
135,0,63,33,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,high school graduate,employed,5,F32.A J45.909 A59.9,1,3,1
136,0,64,27,married,not hispanic/latino,Asian,Spanish,none,none,none,none,none listed,unemployed,6,H90.2,0,3,0


## Chi-Square Test of Independence

This test is used to determine if there is a significant association between two categorical variables. In our case, we can create a Rx2 contingency table where one axis represents the location of birth (in hospital or out of hospital) and the other axis represents whether there was a birth complication (yes or no). Then, we can apply the Chi-Square Test to determine if the occurrence of birth complications is independent of the location of birth.

In [2]:
import pandas as pd
from scipy.stats import chi2_contingency, fisher_exact

# List of variables to check for association with 'OutofHospital'
variables = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education", "Employment", "PregnancyComplications"]

# Loop through each variable and perform the Chi-Square Test
results = {}
for variable in variables:
    # Create a contingency table
    contingency_table = pd.crosstab(df['OutofHospital'], df[variable])
    
    # Perform the Chi-Square Test of Independence
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Store the results
    results[variable] = {'Chi-Square Value': chi2, 'P-Value': p, 'Degrees of Freedom': dof, 'Expected Frequencies Table': expected}

# Output the results in a beautiful way
print("{:<25} {:<20} {:<20} {:<20}".format("Variable", "Chi-Square Value", "P-Value", "Degrees of Freedom"))
for variable, result in results.items():
    print("{:<25} {:<20} {:<20} {:<20}".format(variable, result['Chi-Square Value'], result['P-Value'], result['Degrees of Freedom']))


NameError: name 'df' is not defined

### Interpretation:

* Chi-Square Value: A larger value indicates a greater difference between the observed counts and what would be expected if the variables were independent.

* P-Value: Tells you whether or not the association is statistically significant. Typically, if the p-value is less than 0.05, we might conclude that there is a significant association between the variables.

* Degrees of Freedom: This is equal to (number of rows - 1) x (number of columns - 1) in the contingency table. It helps in determining the critical value for the chi-square distribution.

For interpretation, we would usually focus on the P-Value. If it's below a significance level (commonly 0.05), it suggests that the observed distribution is significantly different from what you'd expect if the variables were independent, indicating an association.

For the variable `Race`, `Ethnicity` and `PrimaryLanguage`, the very low p-value indicates that there is a statistically significant association between race and whether the birth occurred in or out of the hospital. This suggests that race and the language might play a role in the choice or circumstances of the birth location.

For the variable `PregnancyComplications`, the p-value is much greater than 0.05, suggesting that there is no statistically significant association between pregnancy complications and birth location. Pregnancy complications do not appear to be a determining factor in whether a birth occurs in or out of the hospital.


In [7]:
import pandas as pd
import plotly.express as px

# Assuming your dataframe is named df, and has columns 'OutofHospital', 'PrimaryLanguage' and 'Ethnicity'

# Plot for PrimaryLanguage
fig1 = px.histogram(df, x="PrimaryLanguage", color="OutofHospital",
                    barmode="group", nbins=len(df["PrimaryLanguage"].unique()),
                    labels={"PrimaryLanguage": "Primary Language", "OutofHospital": "Out of Hospital"},
                    title="Number of Births In and Out of Hospital by Primary Language")

fig1.show()


# Plot for Ethnicity
fig2 = px.histogram(df, x="Ethnicity", color="OutofHospital",
                    barmode="group", nbins=len(df["Ethnicity"].unique()),
                    labels={"Ethnicity": "Ethnicity", "OutofHospital": "Out of Hospital"},
                    title="Number of Births In and Out of Hospital by Ethnicity",
)

fig2.show()

# export plotly figures to html
import plotly.io as pio
pio.write_html(fig1, file='fig1.html', auto_open=False)
pio.write_html(fig2, file='fig2.html', auto_open=False)


In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

# Assuming your dataframe is named df, and has columns 'OutofHospital', 'Race', 'Age', 'PrimaryLanguage', 'Ethnicity', 'Education', 'Employment', 'PregnancyComplications', 'Parity'

# List of variables to plot
variables = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education", "Employment", "PregnancyComplications", "PregnancyComplicationsCount"]

# Function to create bar chart for a specific variable
def create_trace(variable):
    count_data = df.groupby([variable, 'OutofHospital']).size().reset_index(name='count')
    return {
        'x': count_data[count_data['OutofHospital'] == 1][variable],
        'y': count_data[count_data['OutofHospital'] == 1]['count'],
        'name': 'Out of Hospital',
        'type': 'bar'
    }, {
        'x': count_data[count_data['OutofHospital'] == 0][variable],
        'y': count_data[count_data['OutofHospital'] == 0]['count'],
        'name': 'In Hospital',
        'type': 'bar'
    }

# Create figure
fig = go.Figure()

# Add traces for each variable
for var in variables:
    trace1, trace2 = create_trace(var)
    fig.add_trace(trace1)
    fig.add_trace(trace2)

# Create a dropdown menu
buttons = []
for i, var in enumerate(variables):
    visibility = [False] * len(variables) * 2
    visibility[i*2] = True
    visibility[i*2 + 1] = True
    buttons.append(dict(label=var,
                        method="update",
                        args=[{"visible": visibility},
                              {"title": f"Number of Births In and Out of Hospital by {var}",
                               "xaxis": {"title": var},
                               "yaxis": {"title": "Number of Births"}}]))

# Update layout with dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            active=3,  # Set default view to 'Ethnicity'
            buttons=buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.7,
            xanchor="left",
            y=1.3,
            yanchor="top"
        ),
    ],
    barmode="group"
)

# Set initial title
fig.update_layout(title=f"Number of Births In and Out of Hospital by Ethnicity")

# Set initial visibility
visibility = [False] * len(variables) * 2
visibility[3*2] = True
visibility[3*2 + 1] = True
fig.update_traces(visible=False)
fig.data[3*2].visible = True
fig.data[3*2 + 1].visible = True

# Show figure
fig.show()

# Write to HTML
pio.write_html(fig, file='fig.html', auto_open=False)

NameError: name 'df' is not defined

In [9]:
# Plot an histogram on the target variable
import plotly.express as px

fig = px.histogram(df, x="PregnancyComplicationsCount", nbins=10,
                    labels={"PregnancyComplicationsCount": "Number of Pregnancy Complications"},
                    title="Histogram of Number of Pregnancy Complications")

fig.show()

# Write to HTML
pio.write_html(fig, file='histogram.html', auto_open=False)


## Logistic Regresion

Logistic regression is a statistical model that uses a logistic function to model a binary dependent variable. In the context of predicting whether a baby is born in or out of a hospital, it provides a method to estimate the probability of an event (birth location) given a set of related variables, such as race, age, primary language, ethnicity, and education of the parent. Features are assigned coefficients, which reflect the strength and direction of their relationship with the outcome. A positive coefficient suggests that as the feature value increases, the likelihood of the event (in this case, a baby being born out of hospital) also increases, while a negative coefficient suggests the opposite. Thus, logistic regression allows us to identify and quantify the influence of various factors on the outcome, providing insights into relevant and correlated features.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
# Assuming that the dataframe is already loaded in variable df

# Preprocessing
# Select variables of interest
variables_of_interest = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education", "Employment", "Parity", "PregnancyComplicationsCount"]
X = df[variables_of_interest]

# One hot encoding for categorical variables
categorical_vars = ["Race", "PrimaryLanguage", "Ethnicity", "Education", "Employment"]
one_hot_encoder = OneHotEncoder(drop='first')
one_hot_encoded = one_hot_encoder.fit_transform(X[categorical_vars])
encoded_features = one_hot_encoder.get_feature_names_out(categorical_vars)
encoded_df = pd.DataFrame(one_hot_encoded.toarray(), columns=encoded_features)

# Combine the one-hot-encoded variables with the numeric variables
X = pd.concat([X.drop(categorical_vars, axis=1), encoded_df], axis=1)

# Standardize numerical features
scaler = StandardScaler()
X[['Age']] = scaler.fit_transform(X[['Age']])
X['Parity'] = scaler.fit_transform(X[['Parity']])
X['PregnancyComplicationsCount'] = scaler.fit_transform(X[['PregnancyComplicationsCount']])

# Target variable
y = df['OutofHospital']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression|
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Evaluate feature importance
feature_importance = pd.DataFrame({"Feature": X.columns, "Coefficient": log_reg.coef_[0]})
feature_importance = feature_importance.sort_values(by="Coefficient", key=abs, ascending=False).reset_index(drop=True)

print("Feature Importances:")
print(feature_importance)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predictions on the test set
y_pred = log_reg.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Feature Importances:
                            Feature  Coefficient
0           PrimaryLanguage_English    -0.716497
1              Race_Caucasian/White    -0.589510
2     Ethnicity_not hispanic/latino    -0.479872
3             Employment_unemployed     0.472605
4    PrimaryLanguage_Haitian Creole    -0.446886
5           PrimaryLanguage_Bengali     0.408865
6              PrimaryLanguage_Urdu     0.385484
7           PrimaryLanguage_Spanish     0.374652
8    Education_high school graduate     0.291967
9         PrimaryLanguage_Portugese     0.246905
10           PrimaryLanguage_French     0.244259
11  Education_less than high school     0.237993
12                       Race_Other     0.235588
13            Education_none listed    -0.221615
14            Employment_not listed     0.213527
15                           Parity    -0.213189
16                       Race_Asian    -0.212959
17      PregnancyComplicationsCount    -0.194228
18           Education_some college     0.192872

The logistic regression model reveals a variety of key insights about the influences on whether a baby is born inside or outside of a hospital. The most influential feature is whether the primary language is English, with a negative coefficient of -0.716497. This suggests that when English is the primary language, it is less likely for the baby to be born outside of a hospital. Similar trends are seen with the race of being Caucasian/White and the ethnicity of not being Hispanic/Latino, which also have negative coefficients, indicating these groups are less likely to have out-of-hospital births. On the other hand, certain features increase the likelihood of out-of-hospital birth. For example, if the individual is unemployed, the probability of the birth taking place outside of a hospital increases.

From the model, it can be observed that certain features increase the probability of a baby being born outside of a hospital. Individuals who are unemployed (coefficient of 0.472605) have an increased likelihood of giving birth outside a hospital. Also, people whose primary language is either Bengali, Urdu, or Spanish also show a higher tendency to have out-of-hospital births, as indicated by the positive coefficients of these variables. Similarly, having an education level of a high school graduate or less than high school, also increases the chances of out-of-hospital births. **These features suggest that socio-economic factors, language, and education level significantly influence the location of birth, with lower socio-economic status and non-English primary languages increasing the likelihood of out-of-hospital births.**






The accuracy of the model, at 0.68, indicates a moderate ability to correctly classify whether a birth takes place in or out of the hospital based on the available features. The F1 score, a balance of precision and recall, is 0.71, which suggests a reasonably balanced model.

In [None]:
import plotly.express as px

# Feature Importance
feature_importance = pd.DataFrame({"Feature": X.columns, "Coefficient": log_reg.coef_[0]})
feature_importance = feature_importance.sort_values(by="Coefficient")

fig = px.bar(feature_importance, x='Coefficient', y='Feature', orientation='h',
             title='Feature Importances',
             labels={'Coefficient':'Coefficient', 'Feature':'Feature'},
             color='Coefficient',
             color_continuous_scale=px.colors.diverging.RdBu,
             height=800)

fig.show()

# export to html
pio.write_html(fig, file='feature_importance.html', auto_open=False)



In [17]:
df[variables_of_interest]

Unnamed: 0,PrimaryLanguage
0,Spanish
1,Urdu
2,Spanish
3,English
4,English
...,...
133,English
134,English
135,English
136,Spanish


In [61]:
from sklearn.calibration import LabelEncoder
import statsmodels.api as sm

# Select variables of interest
variables_of_interest = [ "Education","Ethnicity", "Employment", "PregnancyComplications", "Parity"] 
X = df[variables_of_interest]
X = pd.get_dummies(df[variables_of_interest], drop_first=True, dtype=float)
y = df['OutofHospital']

# Fit the model
logit_model = sm.Logit(y, sm.add_constant(X))
result = logit_model.fit()

# Print the summary, including standard errors
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.597334
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          OutofHospital   No. Observations:                  138
Model:                          Logit   Df Residuals:                      128
Method:                           MLE   Df Model:                            9
Date:                Sun, 25 Jun 2023   Pseudo R-squ.:                  0.1371
Time:                        16:25:07   Log-Likelihood:                -82.432
converged:                       True   LL-Null:                       -95.524
Covariance Type:            nonrobust   LLR p-value:                  0.001906
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const                               0.9407      0.971      0.969  