In [4]:
import pandas as pd
import statsmodels.api as sm

# Load dataset from CSV file
df = pd.read_csv("test.csv")

# Dummy coding for the two categorical predictors (reference level = 0)
geography_codes = {"British Columbia": 0, "Ontario": 1}
education_codes = {"Bachelor's degree": 0, "High school graduate": 1}

# Strip leading/trailing whitespace, then encode.
# `.map(...).astype("int64")` guarantees integer columns for BOTH predictors
# (the integer dtype no longer depends on `replace` implicitly downcasting an
# object column) and raises right here if a label outside the mapping shows
# up, instead of leaving a string in the design matrix.
df["Geography"] = df["Geography"].str.strip().map(geography_codes).astype("int64")
df["Education"] = df["Education"].str.strip().map(education_codes).astype("int64")

# Create interaction term between 'Year' and 'M_(1)_or_F_(0)'
df["Year_Sex_interaction"] = df["Year"] * df["M_(1)_or_F_(0)"]

# Define predictors and response variable
X = df[["Year", "Geography", "Education", "M_(1)_or_F_(0)", "Year_Sex_interaction"]]
y = df["Weekly_Wage"]

# Fit linear regression model; add_constant supplies the intercept column
model = sm.OLS(y, sm.add_constant(X)).fit()

# Print model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            Weekly_Wage   R-squared:                       0.966
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                     1020.
Date:                Mon, 10 Apr 2023   Prob (F-statistic):          5.96e-129
Time:                        09:30:22   Log-Likelihood:                -940.71
No. Observations:                 184   AIC:                             1893.
Df Residuals:                     178   BIC:                             1913.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -3.408e+04 

In [6]:
import streamlit as st
import pandas as pd
import statsmodels.api as sm
import numpy as np

def load_model(csv_path="test.csv"):
    """Fit and return the weekly-wage OLS model.

    Reads the training data from ``csv_path``, dummy-codes the ``Geography``
    and ``Education`` columns, adds a Year x Sex interaction term, and fits an
    ordinary least squares regression of ``Weekly_Wage`` on
    ``Year``, ``Geography``, ``Education``, ``M_(1)_or_F_(0)`` and
    ``Year_Sex_interaction`` (plus an intercept).

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to train on. Defaults to ``"test.csv"``.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.

    Raises
    ------
    ValueError
        If ``Geography`` or ``Education`` contains a label that is not in the
        coding tables below (the unmapped value becomes NaN and cannot be
        cast to ``int64``).
    """
    geography_codes = {"British Columbia": 0, "Ontario": 1}
    education_codes = {"Bachelor's degree": 0, "High school graduate": 1}

    df = pd.read_csv(csv_path)

    # Strip stray whitespace before encoding. map + astype gives integer
    # columns for both predictors without relying on `replace` downcasting an
    # object column, and fails fast on unexpected labels.
    df["Geography"] = df["Geography"].str.strip().map(geography_codes).astype("int64")
    df["Education"] = df["Education"].str.strip().map(education_codes).astype("int64")

    df["Year_Sex_interaction"] = df["Year"] * df["M_(1)_or_F_(0)"]

    X = df[["Year", "Geography", "Education", "M_(1)_or_F_(0)", "Year_Sex_interaction"]]
    y = df["Weekly_Wage"]

    # add_constant supplies the intercept column ("const")
    return sm.OLS(y, sm.add_constant(X)).fit()

# Fit the model at module level so app() can read it as a global.
# NOTE(review): Streamlit re-executes the script on each widget interaction, so this
# refit presumably happens on every rerun — consider caching if it becomes slow.
model = load_model()

def app():
    """Render the Streamlit UI and show the predicted weekly wage.

    Collects year, region, education level and sex from the input widgets,
    encodes them with the same dummy coding used to fit the module-level
    ``model``, and writes the model's prediction for that single observation
    to the page.
    """
    st.title("Weekly Wage Prediction Model")

    # Input controls
    year = st.slider("Year:", min_value=1997, max_value=2019, value=2000)
    geography = st.selectbox("Region:", options=["British Columbia", "Ontario"])
    education = st.selectbox("Education Level:", options=["Bachelor's degree", "High school graduate"])
    sex = st.radio("Sex:", options=["Female", "Male"])

    # Same category -> code tables as the training step
    geography_codes = {"British Columbia": 0, "Ontario": 1}
    education_codes = {"Bachelor's degree": 0, "High school graduate": 1}
    is_male = 1 if sex == "Male" else 0

    # Encode before building the frame so every column is integer from the
    # start (no dependence on `replace` downcasting an object column).
    # Column order mirrors the fitted design matrix: "const" first, then the
    # predictors in training order.
    newdata = pd.DataFrame({
        "const": [1],
        "Year": [year],
        "Geography": [geography_codes[geography]],
        "Education": [education_codes[education]],
        "M_(1)_or_F_(0)": [is_male],
        "Year_Sex_interaction": [year * is_male],
    })

    # Make prediction using the model
    prediction = model.predict(newdata)

    # Display prediction
    st.write("Predicted weekly wage:", round(prediction[0], 2))

# Entry point: build the UI only when this code runs as the main module.
if __name__ == "__main__":
    app()