In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [None]:
# Read the insurance records from the CSV that sits next to the notebook,
# then print a concise structural summary of the resulting frame.
df = pd.read_csv(filepath_or_buffer='insurance_data.csv')
df.info()

In [None]:
# comments:
# (1) there is a small number of rows with missing values - they can be dropped
# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner
# (4) the questions part should only print answers based on your solution

#### computations

In [None]:
from sklearn.preprocessing import StandardScaler

# Discard every record (row) that has at least one missing field
df = df.dropna(axis=0, how='any')

def calc_pearsons_corr(df, pair):
    """Return Pearson's correlation coefficient between two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    pair : sequence
        Indexable container whose items 0 and 1 are the two column labels.

    Returns
    -------
    float
        The value produced by ``Series.corr`` for the two selected columns.
    """
    first_series = df[pair[0]]
    second_series = df[pair[1]]
    return first_series.corr(second_series, method='pearson')

# Multi-collinearity check: every unordered pair of the numeric predictors
pair_list = [
    ['age', 'weight'],
    ['age', 'bmi'],
    ['age', 'children'],
    ['bmi', 'weight'],
    ['bmi', 'children'],
    ['weight', 'children'],
]

# One coefficient per line, in the order the pairs are listed above
for first_col, second_col in pair_list:
    print(calc_pearsons_corr(df, [first_col, second_col]))

# Three blank lines to separate this output from whatever is printed next
print('\n\n')

# 'weight' and 'bmi' are treated as a redundant pair: keep only the one that is
# more strongly (linearly) related to 'expenses'.
# Strength of a linear relationship is the magnitude of Pearson's r, so the
# absolute values are compared - otherwise a strong negative correlation would
# lose to a weak positive one.
weight_strength = abs(calc_pearsons_corr(df, ['weight', 'expenses']))
bmi_strength = abs(calc_pearsons_corr(df, ['bmi', 'expenses']))

if weight_strength > bmi_strength:
    # Remove bmi predictor from the data
    df = df.drop(columns=['bmi'])
else:
    # Remove weight predictor from the data (also the choice on a tie)
    df = df.drop(columns=['weight'])


# Perform one-hot encoding for 'gender' and 'smoker' columns.
# drop_first=True removes the first level of each variable so the remaining
# dummies are not perfectly collinear with the intercept.
df_encoded = pd.get_dummies(df, columns=['gender', 'smoker'], drop_first=True, dtype=int)

# The collinearity step earlier in this cell keeps either 'bmi' or 'weight'.
# Use whichever column is still present instead of hardcoding 'bmi', which
# would raise a KeyError whenever 'weight' was the predictor that was kept.
body_measure = 'bmi' if 'bmi' in df_encoded.columns else 'weight'
predictor_cols = ['age', body_measure, 'children', 'gender_male', 'smoker_yes']

# ---- Model 1: predictors in their original units ----
# Define the predictors and the dependent variable
x = df_encoded[predictor_cols]
y = df_encoded['expenses']

x = sm.add_constant(x) # Adding the intercept (b_0)
model = sm.OLS(y, x) # Ordinary least squares (training regime)
result = model.fit() # Model training (computing coefficients)

# `result` is rebound by the scaled fit below; keep a handle on this fit so
# its coefficients (in original units) stay accessible afterwards.
result_unscaled = result

print(result.summary())

# ---- Model 2: standardized predictors ----
# Define the columns to be scaled (same predictors as in the first model)
columns_to_scale = list(predictor_cols)

# Create a temporary DataFrame with the columns to scale
tmp_df = df_encoded[columns_to_scale]

# Initialize the StandardScaler and learn each column's mean / std
scaler = StandardScaler()
scaler.fit(tmp_df)

# Transform the predictors
predictors_scaled = scaler.transform(tmp_df)

# Create a DataFrame with the scaled predictors; names are derived from the
# selected predictors so they always match the columns that were scaled
df_scaled = pd.DataFrame(predictors_scaled, columns=[f'{col}_scaled' for col in columns_to_scale])

# Check for missing values in the scaled DataFrame
if df_scaled.isnull().values.any():
    print("Scaled DataFrame contains missing values. Investigate further.")

# df_scaled was built from a bare array and therefore has a fresh 0..n-1
# index, while df_encoded may still carry the row labels left after dropna().
# Reset both so the predictors and the target share the same index.
df_scaled = df_scaled.reset_index(drop=True)
df_encoded = df_encoded.reset_index(drop=True)

# Define the predictors and the dependent variable
x = df_scaled
y = df_encoded['expenses']

x = sm.add_constant(x)

model = sm.OLS(y, x)
result = model.fit()

print(result.summary())

#### Questions (answer the questions; all computations should precede this part)

#### Question 1

In [None]:
# did you remove any numerical predictor from the data?
# if no - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

#### Question 1 - solution:

In [None]:
# One-sentence answer to Question 1; adjacent literals are joined by Python
# into a single string before printing.
print(
    "I decided to remove the weight predictor from the data to avoid multi-collinearity, "
    "as it is highly correlated with BMI. "
    "Additionally, BMI has a stronger linear relationship with the dependent variable "
    "(medical expenses) than weight does."
)

#### Question 2

In [None]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your solution above
# display your answer as a dataframe (as in assignment 2)

#### Question 2 - solution:

In [None]:
# Hardcoded answer, as the question requests.
# It is stored under its own name rather than `result`, so the fitted
# regression results bound to `result` by the computations cell are not
# overwritten when this cell runs.
age_expense_answer = {'The amount of money a person is likely to spend on medical expenses each additional year of age': ['258.5421']}
result_df = pd.DataFrame(age_expense_answer)
result_df

#### Question 3

In [None]:
# what predictors have a significant contribution to the medical expenses amount?
# report only significant (P<0.05) predictors sorted by their contribution to the prediction from highest to lowest,
# where for each predictor you specify if it has a positive or a negative effect on the medical expenses

# for categorical variables - specify the effect of individual values that appear significant (e.g., "smoker-yes", "smoker-no")

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [None]:
# Predictors reported for Question 3, in the order they are listed in the answer
significant_predictors = ['smoker_yes', 'age', 'bmi', 'children']

# Every listed predictor is reported with a positive effect
data = {
    'predictor': significant_predictors,
    'effect': ['positive'] * len(significant_predictors),
}

# Two-column answer table: (1) predictor, (2) effect
new_df = pd.DataFrame(data)

# Bare last expression so the notebook renders the table
new_df