In [3]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import StandardScaler
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Load the dataset (replace with the actual path to your dataset)
data = pd.read_csv('dataset/data.csv')

# Prepare the data for ARIMAX model
exog_columns = ['Age', 'BMI', 'UnusualBleeding', 'NumberofDaysofIntercourse', 'Breastfeeding', 'Numberpreg']
filtered_data_updated = data[['CycleNumber', 'LengthofCycle'] + exog_columns].dropna()

# Ensure all columns for exog and target variables are numeric
filtered_data_updated[exog_columns] = filtered_data_updated[exog_columns].apply(pd.to_numeric, errors='coerce')
filtered_data_updated['LengthofCycle'] = pd.to_numeric(filtered_data_updated['LengthofCycle'], errors='coerce')

# Fill missing values with the median of each column
filtered_data_updated.fillna(filtered_data_updated.median(), inplace=True)

# Split the data into training and testing sets
train_size = int(len(filtered_data_updated) * 0.8)
train_target, test_target = filtered_data_updated['LengthofCycle'][:train_size], filtered_data_updated['LengthofCycle'][train_size:]
train_exog, test_exog = filtered_data_updated[exog_columns][:train_size], filtered_data_updated[exog_columns][train_size:]

# Fit the scaler on the training data (handling potential issues with empty values)
scaler = StandardScaler()
train_exog = train_exog.apply(pd.to_numeric, errors='coerce')  # Ensure all features are numeric
train_exog.fillna(train_exog.median(), inplace=True)  # Fill missing values with the median
scaler.fit(train_exog)

# Save the scaler using pickle
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Define the ARIMAX model function
def arimax_model(p, q, exog_train, target_train):
    model = ARIMA(target_train, exog=exog_train, order=(p, 0, q))
    model_fitted = model.fit()
    return model_fitted

# Fit the ARIMAX model with p=1, q=1 (simplified for quicker fitting)
p, q = 1, 1
best_model_simple = arimax_model(p, q, train_exog, train_target)

# Display the summary of the model
print(best_model_simple.summary())

# Save the trained ARIMAX model using pickle
pickle_model_filename = 'arima_menstrual_cycle_model_simple.pkl'
with open(pickle_model_filename, 'wb') as file:
    pickle.dump(best_model_simple, file)

print(f"Model saved to {pickle_model_filename}")

# --- Prediction Part Below ---

# Load the saved ARIMAX model
arima_model = pickle.load(open('arima_menstrual_cycle_model_simple.pkl', 'rb'))

# Load the saved scaler
scaler = pickle.load(open('scaler.pkl', 'rb'))

# Function to predict the menstrual cycle length based on user input
def predict_cycle_length(user_input, scaler=None):
    # Prepare the data in the same format as used for model training
    user_data = pd.DataFrame([user_input])

    # Remove 'CycleNumber' from user input (it's not a feature for prediction)
    user_data = user_data[['Age', 'BMI', 'UnusualBleeding', 'NumberofDaysofIntercourse', 'Breastfeeding', 'Numberpreg']]

    # Clean the input data (ensure all values are numeric)
    user_data = user_data.apply(pd.to_numeric, errors='coerce')
    user_data.fillna(user_data.median(), inplace=True)

    # If scaling was used during training, apply the scaler to the input data
    if scaler:
        user_data_scaled = scaler.transform(user_data)  # Use transform instead of fit_transform
    else:
        user_data_scaled = user_data  # If no scaling was done during training, use raw input data
    
    # Make a prediction using the ARIMAX model (ensure external variables are included)
    predicted_value = arima_model.predict(start=0, end=0, exog=user_data_scaled)
    
    # Return the predicted cycle length
    return predicted_value[0]  # Access the first value of the prediction

# Example user input (replace with actual inputs)
user_input = {
    'CycleNumber': 1,  # for example, 100th cycle (this will be removed during preprocessing)
    'Age': 28,
    'BMI': 22.5,
    'UnusualBleeding': 1,  # Example binary input: 0 = No, 1 = Yes
    'NumberofDaysofIntercourse': 10,  # Example numerical input
    'Breastfeeding': 1,  # Example binary input: 0 = No, 1 = Yes
    'Numberpreg': 1  # Example input
}

# Call the function with the user input
predicted_cycle_length = predict_cycle_length(user_input, scaler)

# Print the predicted cycle length
print("Predicted Menstrual Cycle Length (Days):", predicted_cycle_length)


                               SARIMAX Results                                
Dep. Variable:          LengthofCycle   No. Observations:                 1332
Model:                 ARIMA(1, 0, 1)   Log Likelihood               -3388.649
Date:                Sat, 22 Feb 2025   AIC                           6797.298
Time:                        17:17:09   BIC                           6849.243
Sample:                             0   HQIC                          6816.765
                               - 1332                                         
Covariance Type:                  opg                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                        34.0619      1.784     19.094      0.000      30.565      37.558
Age                          -0.1716      0.055     -3.123      0.002      -0.279     