**DATA 2050 PREPROCESSING**
--

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import math
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
import pickle
from sklearn.inspection import permutation_importance

**CREATING IMPUTED DFs FOR R**
--

In [15]:
data = pd.read_excel('../2050_Data_4-26.xlsx', sheet_name='Data Table')

df = data.copy()

df_cleaned = data.copy().dropna(subset=['birth_wt_pct']).reset_index(drop=True)

# Change FT to 40.0 in gest_age
df_cleaned['gest_age'] = df_cleaned['gest_age'].replace('FT', 40.0)
df_cleaned = df_cleaned[df_cleaned['6mo_feeding_type'] != 'Both Breast and Formula'].reset_index(drop=True)

# Adding new columns to df_cleaned

df_cleaned['birth-1yr_wt_diff'] = df_cleaned.apply(lambda row: row['1yr_wt_pct'] - row['birth_wt_pct'] if pd.notna(row['1yr_wt_pct']) and pd.notna(row['3yr_wt_pct']) else None, axis=1)
df_cleaned['1yr-3yr_wt_diff'] = df_cleaned.apply(lambda row: row['3yr_wt_pct'] - row['1yr_wt_pct'] if pd.notna(row['1yr_wt_pct']) and pd.notna(row['3yr_wt_pct']) else None, axis=1)

  df_cleaned['gest_age'] = df_cleaned['gest_age'].replace('FT', 40.0)


In [16]:
y = df_cleaned['birth_wt_pct']
X = df_cleaned[['race', 'sex', 'gest_age', 'insurance_type', \
                 '4mo_feeding_type', '6mo_feeding_type', '1yr_wt_pct', '3yr_wt_pct', '3yr_bmi_pct', 'age_on_obes', 'birth-1yr_wt_diff', '1yr-3yr_wt_diff']]

**SAVING DATA WITHOUT ENCODING**

In [17]:
df_linreg_not_scaled_with_miss = X.copy()
df_linreg_not_scaled_with_miss['target'] = y
df_linreg_not_scaled_with_miss.to_csv('df_regression_not_scaled_with_miss.csv', index=False)

In [18]:
X.columns

Index(['race', 'sex', 'gest_age', 'insurance_type', '4mo_feeding_type',
       '6mo_feeding_type', '1yr_wt_pct', '3yr_wt_pct', '3yr_bmi_pct',
       'age_on_obes', 'birth-1yr_wt_diff', '1yr-3yr_wt_diff'],
      dtype='object')

**REMOVING CATEGORICAL MISSING VALUES**

In [19]:
from sklearn.impute import SimpleImputer

std_ftrs = ['gest_age', '1yr_wt_pct', '3yr_wt_pct', '3yr_bmi_pct', 'age_on_obes', 'birth-1yr_wt_diff', '1yr-3yr_wt_diff']
onehot_ftrs = ['race', 'sex', 'insurance_type', '4mo_feeding_type', '6mo_feeding_type']

one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# Standard scaler 
std_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('std', std_transformer, std_ftrs),
        ('ohot', one_hot_transformer, onehot_ftrs)])

final_scaler = StandardScaler()

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('final scaler', final_scaler)])


# fit_transform the training set
X_prep = clf.fit_transform(X)
feature_names = clf.get_feature_names_out()

df_linreg = pd.DataFrame(data=X_prep,columns=feature_names)
print(df_linreg.shape)



(62, 24)


In [20]:
df_linreg['target'] = y
df_linreg.to_csv('df_regression_not_imp.csv', index=False)

In [21]:
# Iterative imputing the standard features
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

mask = ['std__gest_age', 'std__1yr_wt_pct', 'std__3yr_wt_pct', 'std__3yr_bmi_pct', 'std__age_on_obes']

print(df_linreg[mask].head())

imputer = IterativeImputer(estimator = RandomForestRegressor(n_estimators=10), random_state=42)
X_impute = imputer.fit_transform(df_linreg)
df_linreg_imp = pd.DataFrame(data=X_impute, columns = df_linreg.columns)

print(df_linreg_imp[mask].head())

   std__gest_age  std__1yr_wt_pct  std__3yr_wt_pct  std__3yr_bmi_pct  \
0       0.085150         0.996720         0.529706          0.547621   
1       0.543552        -1.650156         0.066377         -0.626290   
2       0.543552         0.573355              NaN               NaN   
3      -2.588867         0.178779        -0.712312          0.080332   
4       0.619953              NaN              NaN               NaN   

   std__age_on_obes  
0         -0.671198  
1          1.796996  
2          1.021278  
3          1.373877  
4          1.902776  
   std__gest_age  std__1yr_wt_pct  std__3yr_wt_pct  std__3yr_bmi_pct  \
0       0.085150         0.996720         0.529706          0.547621   
1       0.543552        -1.650156         0.066377         -0.626290   
2       0.543552         0.573355        -1.448439         -1.491782   
3      -2.588867         0.178779        -0.712312          0.080332   
4       0.619953        -1.340704        -1.087774         -2.078606   

  



In [22]:
df_linreg_imp['target'] = y
df_linreg_imp.to_csv('df_regression_imp.csv', index=False)

**NOT SCALED DF**

In [23]:
data = pd.read_excel('../2050_Data_4-26.xlsx', sheet_name='Data Table')

df = data.copy()

df_cleaned = data.copy().dropna(subset=['birth_wt_pct']).reset_index(drop=True)

# Change FT to 40.0 in gest_age
df_cleaned['gest_age'] = df_cleaned['gest_age'].replace('FT', 40.0)

df_cleaned = df_cleaned[df_cleaned['6mo_feeding_type'] != 'Both Breast and Formula'].reset_index(drop=True)

# Adding new columns to df_cleaned

df_cleaned['birth-1yr_wt_diff'] = df_cleaned.apply(lambda row: row['1yr_wt_pct'] - row['birth_wt_pct'] if pd.notna(row['1yr_wt_pct']) and pd.notna(row['3yr_wt_pct']) else None, axis=1)
df_cleaned['1yr-3yr_wt_diff'] = df_cleaned.apply(lambda row: row['3yr_wt_pct'] - row['1yr_wt_pct'] if pd.notna(row['1yr_wt_pct']) and pd.notna(row['3yr_wt_pct']) else None, axis=1)

  df_cleaned['gest_age'] = df_cleaned['gest_age'].replace('FT', 40.0)


In [24]:
y = df_cleaned['birth_wt_pct']
X = df_cleaned[['race', 'sex', 'gest_age', 'insurance_type', '6mo_feeding_type', \
                 '4mo_feeding_type', '1yr_wt_pct', '3yr_wt_pct', '3yr_bmi_pct', 'age_on_obes', 'birth-1yr_wt_diff', '1yr-3yr_wt_diff']]

In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

std_ftrs = ['gest_age', '1yr_wt_pct', '3yr_wt_pct', '3yr_bmi_pct', 'age_on_obes', 'birth-1yr_wt_diff', '1yr-3yr_wt_diff']
onehot_ftrs = ['race', 'sex', 'insurance_type', '6mo_feeding_type', '4mo_feeding_type']

one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# Collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('ohot', one_hot_transformer, onehot_ftrs),
        ('std', FunctionTransformer(lambda x: x, validate=False),std_ftrs)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor)])

# fit_transform the training set
X_prep = clf.fit_transform(X)
one_hot_feature_names = clf.named_steps['preprocessor'].transformers_[0][1].named_steps['onehot'].get_feature_names_out(onehot_ftrs)
feature_names = list(std_ftrs) + list(one_hot_feature_names)

df_linreg = pd.DataFrame(data=X_prep,columns=feature_names)
print(df_linreg.shape)

(62, 24)


In [26]:
# Iterative imputing the standard features
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

mask = std_ftrs

print(df_linreg[mask].head())

imputer = IterativeImputer(estimator = RandomForestRegressor(n_estimators=1), random_state=42)
X_impute = imputer.fit_transform(df_linreg)
df_linreg_imp_notscaled = pd.DataFrame(data=X_impute, columns = df_linreg.columns)

print(df_linreg_imp_notscaled[mask].head())

   gest_age  1yr_wt_pct  3yr_wt_pct  3yr_bmi_pct  age_on_obes  \
0       1.0         0.0         0.0          0.0          0.0   
1       0.0         1.0         0.0          0.0          0.0   
2       0.0         0.0         0.0          1.0          0.0   
3       1.0         0.0         0.0          0.0          0.0   
4       0.0         1.0         0.0          0.0          0.0   

   birth-1yr_wt_diff  1yr-3yr_wt_diff  
0                1.0              0.0  
1                1.0              0.0  
2                0.0              1.0  
3                1.0              0.0  
4                0.0              1.0  
   gest_age  1yr_wt_pct  3yr_wt_pct  3yr_bmi_pct  age_on_obes  \
0       1.0         0.0         0.0          0.0          0.0   
1       0.0         1.0         0.0          0.0          0.0   
2       0.0         0.0         0.0          1.0          0.0   
3       1.0         0.0         0.0          0.0          0.0   
4       0.0         1.0         0.0         



In [27]:
df_linreg_imp_notscaled['target'] = y
df_linreg_imp_notscaled.to_csv('df_linreg_imp_notscaled.csv', index=False)