In [1]:
new_column_names = {
    'Age at delivery': 'Age',
    'Marital status (0 = unmarried, 1 = married)': 'MaritalStatus',
    'Ethnicity (0 = not hispanic/latino, 1 = hispanic/latino)': 'Ethnicity',
    'Race (1 = American Indian, 2 = Asian, 3 = Black/African American, 4 = Caucasian/White, 5 = Pacific Islander, 6 = Mixed, 7 = Other': 'Race',
    'Primary Language': 'PrimaryLanguage',
    'Translation Service Utilized (0 = none, 1 = yes)': 'TranslationService',
    'PCP on file (0 = none, 1 = yes)': 'PCPonFile',
    'Insurance Coverage (0 = none, 1 = Medicaid/Medicare, 2 = Other)': 'InsuranceCoverage',
    'Substance Use (0 = none, 1 = any ICD in ICD chart)': 'SubstanceUse',
    'Educational Attainment (0 = none listed, 1 = less than high school, 2 = high school graduate, 3 = some college, 4 = Bachelor and above': 'Education',
    'Employment Status (0 = not listed, 1 = unemployed, 2 = employed)': 'Employment',
    'History of Prenatal Care (0 = none, if yes list how many visits) ': 'PrenatalCareHistory',
    'Maternal Pathologies (0 = none, If present list ICD)': 'MaternalPathologies',
    'Pregnancy Complications (0 = none, If present list ICD)': 'PregnancyComplications',
    'Parity (0 = nulliparous, 1 = 1 prior child, etc.)': 'Parity'
}

# Rename the columns

In [2]:
import pandas as pd

# read xlsx file

df_out_of_hospital = pd.read_excel('data.xlsx', sheet_name='UOHB Maternal Characteristics')
df_in_hospital = pd.read_excel('data.xlsx', sheet_name='IHB Maternal Characteristics')

# rename column to "Pregnancy Complications"
df_out_of_hospital.rename(columns=new_column_names, inplace=True)
df_in_hospital.rename(columns=new_column_names, inplace=True)

df_out_of_hospital['PregnancyComplications'] = df_out_of_hospital['PregnancyComplications'].apply(lambda x: 1 if x else 0)
df_in_hospital['PregnancyComplications'] = df_in_hospital['PregnancyComplications'].apply(lambda x: 1 if x else 0)

In [26]:
df = pd.concat([df_in_hospital, df_out_of_hospital], keys=['Out of Hospital', 'In Hospital'])
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'OutofHospital'}, inplace=True)
df.Parity = df.Parity.apply(lambda x: f"{x} prior children" if x > 1 else f"No prior child")
df['OutofHospital'] = df['OutofHospital'].apply(lambda x: 1 if x == 'Out of Hospital' else 0)
mapping_dictionaries = {
    'MaritalStatus': {
        0: 'unmarried',
        1: 'married'
    },
    'Ethnicity': {
        0: 'not hispanic/latino',
        1: 'hispanic/latino'
    },
    'Race': {
        1: 'American Indian',
        2: 'Asian',
        3: 'Black/African American',
        4: 'Caucasian/White',
        5: 'Pacific Islander',
        6: 'Mixed',
        7: 'Other'
    },
    'TranslationService': {
        0: 'none',
        1: 'yes'
    },
    'PCPonFile': {
        0: 'none',
        1: 'yes'
    },
    'InsuranceCoverage': {
        0: 'none',
        1: 'Medicaid/Medicare',
        2: 'Other'
    },
    'SubstanceUse': {
        0: 'none',
        1: 'any ICD in ICD chart'
    },
    'Education': {
        0: 'none listed',
        1: 'less than high school',
        2: 'high school graduate',
        3: 'some college',
        4: 'Bachelor and above'
    },
    'Employment': {
        0: 'not listed',
        1: 'unemployed',
        2: 'employed'
    },
}

# Replacing numerical values with categories for each column in the DataFrame
for column, mapping in mapping_dictionaries.items():
    df[column] = df[column].replace(mapping)

df.Race.replace({0: "Other"}, inplace=True)

In [4]:
df

Unnamed: 0,OutofHospital,level_1,Age,MaritalStatus,Ethnicity,Race,PrimaryLanguage,TranslationService,PCPonFile,InsuranceCoverage,SubstanceUse,Education,Employment,PrenatalCareHistory,MaternalPathologies,PregnancyComplications,Parity
0,0,0,34,unmarried,hispanic/latino,Other,Spanish,yes,none,none,none,none listed,unemployed,"Yes, unknown number of visits",O13.9,1,2 prior children
1,0,1,29,married,not hispanic/latino,Black/African American,Urdu,none,yes,Medicaid/Medicare,none,none listed,unemployed,"Yes, unknown number of visits",E66.9,0,4 prior children
2,0,2,30,unmarried,hispanic/latino,Other,Spanish,yes,yes,none,none,none listed,unemployed,8,"J45, E11.9, I10",0,3 prior children
3,0,3,19,unmarried,hispanic/latino,Other,English,none,yes,Other,none,none listed,unemployed,13,"O26.613, K83.1, A74.9, O09.899, Z28.39, O99.019",0,No prior child
4,0,4,24,unmarried,hispanic/latino,Other,English,none,none,Medicaid/Medicare,none,none listed,unemployed,9,"O23.40, Z86.50, E66.9",1,No prior child
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,1,61,36,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,none listed,employed,0,0,0,6 prior children
134,1,62,38,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,none listed,unemployed,7,0,1,3 prior children
135,1,63,33,unmarried,not hispanic/latino,Black/African American,English,none,yes,Medicaid/Medicare,none,high school graduate,employed,5,F32.A J45.909 A59.9,1,3 prior children
136,1,64,27,married,not hispanic/latino,Asian,Spanish,none,none,none,none,none listed,unemployed,6,H90.2,0,3 prior children


## Chi-Square Test of Independence

This test is used to determine if there is a significant association between two categorical variables. In our case, we can create a Rx2 contingency table where one axis represents the location of birth (in hospital or out of hospital) and the other axis represents whether there was a birth complication (yes or no). Then, we can apply the Chi-Square Test to determine if the occurrence of birth complications is independent of the location of birth.

In [5]:
import pandas as pd
from scipy.stats import chi2_contingency, fisher_exact

# List of variables to check for association with 'OutofHospital'
variables = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education", "Employment", "PregnancyComplications", "Parity"]

# Loop through each variable and perform the Chi-Square Test
results = {}
for variable in variables:
    # Create a contingency table
    contingency_table = pd.crosstab(df['OutofHospital'], df[variable])
    
    # Perform the Chi-Square Test of Independence
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Store the results
    results[variable] = {'Chi-Square Value': chi2, 'P-Value': p, 'Degrees of Freedom': dof, 'Expected Frequencies Table': expected}

# Output the results in a beautiful way
print("{:<25} {:<20} {:<20} {:<20}".format("Variable", "Chi-Square Value", "P-Value", "Degrees of Freedom"))
for variable, result in results.items():
    print("{:<25} {:<20} {:<20} {:<20}".format(variable, result['Chi-Square Value'], result['P-Value'], result['Degrees of Freedom']))


Variable                  Chi-Square Value     P-Value              Degrees of Freedom  
Race                      23.775378787878786   0.0002397868866226024 5                   
Age                       29.67097681359045    0.32913837848747485  27                  
PrimaryLanguage           21.2409516346847     0.011622476813629868 9                   
Ethnicity                 16.046235992025856   6.181438337887989e-05 1                   
Education                 3.341938364665638    0.5023150401311642   4                   
Employment                1.3853856749311297   0.5002272231619186   2                   
PregnancyComplications    0.06609748803827747  0.7971059642091333   1                   
Parity                    4.21471394984326     0.7547411063634557   7                   


### Interpretation:

* Chi-Square Value: A larger value indicates a greater difference between the observed counts and what would be expected if the variables were independent.

* P-Value: Tells you whether or not the association is statistically significant. Typically, if the p-value is less than 0.05, we might conclude that there is a significant association between the variables.

* Degrees of Freedom: This is equal to (number of rows - 1) x (number of columns - 1) in the contingency table. It helps in determining the critical value for the chi-square distribution.

For interpretation, we would usually focus on the P-Value. If it's below a significance level (commonly 0.05), it suggests that the observed distribution is significantly different from what you'd expect if the variables were independent, indicating an association.

For the variable `Race`, `Ethnicity` and `PrimaryLanguage`, the very low p-value indicates that there is a statistically significant association between race and whether the birth occurred in or out of the hospital. This suggests that race and the language might play a role in the choice or circumstances of the birth location.

For the variable `PregnancyComplications`, the p-value is much greater than 0.05, suggesting that there is no statistically significant association between pregnancy complications and birth location. Pregnancy complications do not appear to be a determining factor in whether a birth occurs in or out of the hospital.


In [6]:
import pandas as pd
import plotly.express as px

# Assuming your dataframe is named df, and has columns 'OutofHospital', 'PrimaryLanguage' and 'Ethnicity'

# Plot for PrimaryLanguage
fig1 = px.histogram(df, x="PrimaryLanguage", color="OutofHospital",
                    barmode="group", nbins=len(df["PrimaryLanguage"].unique()),
                    labels={"PrimaryLanguage": "Primary Language", "OutofHospital": "Out of Hospital"},
                    title="Number of Births In and Out of Hospital by Primary Language")

fig1.show()


# Plot for Ethnicity
fig2 = px.histogram(df, x="Ethnicity", color="OutofHospital",
                    barmode="group", nbins=len(df["Ethnicity"].unique()),
                    labels={"Ethnicity": "Ethnicity", "OutofHospital": "Out of Hospital"},
                    title="Number of Births In and Out of Hospital by Ethnicity",
)

fig2.show()

# export plotly figures to html
import plotly.io as pio
pio.write_html(fig1, file='fig1.html', auto_open=False)
pio.write_html(fig2, file='fig2.html', auto_open=False)


In [7]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

# Assuming your dataframe is named df, and has columns 'OutofHospital', 'Race', 'Age', 'PrimaryLanguage', 'Ethnicity', 'Education', 'Employment', 'PregnancyComplications', 'Parity'

# List of variables to plot
variables = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education", "Employment", "PregnancyComplications", "Parity"]

# Function to create bar chart for a specific variable
def create_trace(variable):
    count_data = df.groupby([variable, 'OutofHospital']).size().reset_index(name='count')
    return {
        'x': count_data[count_data['OutofHospital'] == 1][variable],
        'y': count_data[count_data['OutofHospital'] == 1]['count'],
        'name': 'Out of Hospital',
        'type': 'bar'
    }, {
        'x': count_data[count_data['OutofHospital'] == 0][variable],
        'y': count_data[count_data['OutofHospital'] == 0]['count'],
        'name': 'In Hospital',
        'type': 'bar'
    }

# Create figure
fig = go.Figure()

# Add traces for each variable
for var in variables:
    trace1, trace2 = create_trace(var)
    fig.add_trace(trace1)
    fig.add_trace(trace2)

# Create a dropdown menu
buttons = []
for i, var in enumerate(variables):
    visibility = [False] * len(variables) * 2
    visibility[i*2] = True
    visibility[i*2 + 1] = True
    buttons.append(dict(label=var,
                        method="update",
                        args=[{"visible": visibility},
                              {"title": f"Number of Births In and Out of Hospital by {var}",
                               "xaxis": {"title": var},
                               "yaxis": {"title": "Number of Births"}}]))

# Update layout with dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            active=3,  # Set default view to 'Ethnicity'
            buttons=buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ],
    barmode="group"
)

# Set initial title
fig.update_layout(title=f"Number of Births In and Out of Hospital by Ethnicity")

# Set initial visibility
visibility = [False] * len(variables) * 2
visibility[3*2] = True
visibility[3*2 + 1] = True
fig.update_traces(visible=False)
fig.data[3*2].visible = True
fig.data[3*2 + 1].visible = True

# Show figure
fig.show()

# Write to HTML
pio.write_html(fig, file='fig.html', auto_open=False)

## Logistic Regresion

0                       Other
1      Black/African American
2                       Other
3                       Other
4                       Other
                ...          
133    Black/African American
134    Black/African American
135    Black/African American
136                     Asian
137    Black/African American
Name: Race, Length: 138, dtype: object

In [32]:
df.PrimaryLanguage.unique()

array(['Spanish', 'Urdu', 'English', 'Portugese', 'Bengali',
       'Haitian Creole', 'French Creole', 'French', 'Portuguese',
       'Arabic'], dtype=object)

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
# Assuming that the dataframe is already loaded in variable df

# Preprocessing
# Select variables of interest
variables_of_interest = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education"]
X = df[variables_of_interest]

# One hot encoding for categorical variables
categorical_vars = ["Race", "PrimaryLanguage", "Ethnicity", "Education"]
one_hot_encoder = OneHotEncoder(drop='first')
one_hot_encoded = one_hot_encoder.fit_transform(X[categorical_vars])
encoded_features = one_hot_encoder.get_feature_names_out(categorical_vars)
encoded_df = pd.DataFrame(one_hot_encoded.toarray(), columns=encoded_features)

# Combine the one-hot-encoded variables with the numeric variables
X = pd.concat([X.drop(categorical_vars, axis=1), encoded_df], axis=1)

# Standardize numerical features
scaler = StandardScaler()
X[['Age']] = scaler.fit_transform(X[['Age']])

# Target variable
y = df['OutofHospital']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Evaluate feature importance
feature_importance = pd.DataFrame({"Feature": X.columns, "Coefficient": log_reg.coef_[0]})
feature_importance = feature_importance.sort_values(by="Coefficient", key=abs, ascending=False)

print("Feature Importances:")
print(feature_importance)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predictions on the test set
y_pred = log_reg.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Best Parameters:
{'C': 0.1, 'penalty': 'l2'}
F1 Score (Test Set): 0.70




25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/raphael.attias/data analysis/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/raphael.attias/data analysis/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/raphael.attias/data analysis/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 54,

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import make_scorer, f1_score

# Load data
# Assuming that the dataframe is already loaded in variable df

# Preprocessing
# Select variables of interest
variables_of_interest = ["Race", "Age", "PrimaryLanguage", "Ethnicity", "Education"]
X = df[variables_of_interest]

# One hot encoding for categorical variables
categorical_vars = ["Race", "PrimaryLanguage", "Ethnicity", "Education"]
one_hot_encoder = OneHotEncoder(drop='first')
one_hot_encoded = one_hot_encoder.fit_transform(X[categorical_vars])
encoded_features = one_hot_encoder.get_feature_names_out(categorical_vars)
encoded_df = pd.DataFrame(one_hot_encoded.toarray(), columns=encoded_features)

# Combine the one-hot-encoded variables with the numeric variables
X = pd.concat([X.drop(categorical_vars, axis=1), encoded_df], axis=1)

# Standardize numerical features
scaler = StandardScaler()
X[['Age']] = scaler.fit_transform(X[['Age']])

# Target variable
y = df['OutofHospital']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Define parameter grid for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

# Define scoring metric
scorer = make_scorer(f1_score)

# Perform grid search with cross-validation
grid_search = GridSearchCV(log_reg, param_grid=param_grid, cv=5, scoring=scorer)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:")
print(best_params)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred)

print(f'F1 Score (Test Set): {f1:.2f}')
