In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
import pandas as pd
import numpy as np

# Read the raw training CSV. The file is decoded as latin1 rather than
# pandas' default UTF-8.
file_path = 'train_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

def convert_to_int_bool_or_percentage(value):
    """
    Convert one cell value to an integer, a boolean, or a decimal fraction.

    Rules, tried in this order:
      * Missing values (as judged by ``pd.isnull``) give ``np.nan``.
      * Strings that end in ``%`` once surrounding whitespace is ignored
        give the number divided by 100, e.g. ``" 45% "`` -> ``0.45``.
      * Anything ``float()`` accepts is truncated toward zero to an ``int``,
        e.g. ``"3.7"`` -> ``3``.
      * ``"true"`` / ``"false"`` (case-insensitive, whitespace ignored)
        give the matching ``bool``.
      * Everything else gives ``np.nan`` so the caller can decide how to
        impute. This includes infinities, which have no integer form.

    Parameters
    ----------
    value : object
        A single scalar cell, typically as produced by ``pd.read_csv``.

    Returns
    -------
    int, float, bool or np.nan
    """
    if pd.isnull(value):
        return np.nan
    try:
        if isinstance(value, str):
            text = value.strip()
            if text.endswith('%'):
                # Trim whitespace before removing the percent sign; stripping
                # '%' from the raw string leaves "45% " untouched and the
                # value would be lost as NaN.
                return float(text.strip('%')) / 100.0
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError comes from int(float('inf')); treat it like any other
        # unparseable value instead of aborting the whole cleaning pass.
        str_val = str(value).strip().lower()
        if str_val in ('true', 'false'):
            return str_val == 'true'
    return np.nan

# Apply the converter to every cell. DataFrame.applymap is deprecated in
# favour of DataFrame.map (pandas >= 2.1); use the new name when it exists so
# the cell runs without a FutureWarning, and fall back on older installs.
if hasattr(pd.DataFrame, 'map'):
    df_cleaned = df.map(convert_to_int_bool_or_percentage)
else:
    df_cleaned = df.applymap(convert_to_int_bool_or_percentage)

# Replace every missing/unparseable cell with 0 (assignment instead of
# inplace=True so re-running the cell is idempotent).
# NOTE(review): after this step the saved file contains no NaN, so the median
# fill in the modelling cell has nothing to replace -- confirm that 0 is the
# intended stand-in for missing values.
df_cleaned = df_cleaned.fillna(0)

# Persist the cleaned table for the modelling step.
output_path = 'cleaned_train_data_with_percentages.csv'
df_cleaned.to_csv(output_path, index=False)

print(f"Data cleaned and saved to {output_path}")


Data cleaned and saved to cleaned_train_data_with_percentages.csv


  df_cleaned = df.applymap(convert_to_int_bool_or_percentage)


In [16]:
# Load the cleaned dataset
file_path = 'cleaned_train_data_with_percentages.csv'  # Replace with your file path
data = pd.read_csv(file_path, encoding='latin1')

# Single source of truth for the column roles. The four target columns are
# averaged into one regression target below; everything else is a predictor.
target_columns = ['ND_IntentRetLB1', 'ND_IntentRetLB2', 'ND_IntentRetLB3', 'ND_IntentRetLB4']
feature_columns = [
    'LabAccred',
    'TotNumCycles1', 'TotNumCycles2', 'TotNumCycles3', 'TotNumCycles4',
    'CycleFertPres1', 'CycleFertPres2', 'CycleFertPres3', 'CycleFertPres4', 'CycleFertPresAll',
]
columns = feature_columns + target_columns
data = data[columns]

# Fill any remaining gaps with the per-column median.
data = data.fillna(data.median())

# Define predictors and target variable (row-wise mean of the target columns)
X = data.drop(columns=target_columns)
y = data[target_columns].mean(axis=1)

# One seed for both the split and the forest keeps the run reproducible.
RANDOM_STATE = 42

# Split into training and testing sets (80 % / 20 %)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.007047693833680553
R-squared: 0.5499381399040061


In [25]:
# Placeholder input: a single row with every predictor set to 1, listed in
# the same order as the columns the model was trained on.
predictor_names = [
    'LabAccred',
    'TotNumCycles1', 'TotNumCycles2', 'TotNumCycles3', 'TotNumCycles4',
    'CycleFertPres1', 'CycleFertPres2', 'CycleFertPres3', 'CycleFertPres4',
    'CycleFertPresAll',
]
new_data = pd.DataFrame(
    {name: [1] for name in predictor_names},
    index=[0],  # Single-row DataFrame
)

# Run the trained model on that row and express the result as a percentage
# rounded to 2 decimal places.
new_predictions = model.predict(new_data)
percentage = round(new_predictions[0] * 100, 2)

print("Predicted intent to return live births:", percentage, '%')

Predicted intent to return live births: 27.76 %
