# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Load the dataset
file_path = '/mnt/data/MS9004_Assignment_2425s2.xlsx'
df = pd.read_excel(file_path)

# Display first few rows (raw data, before any encoding is applied)
display(df.head())

# Convert categorical variables to numeric indicator columns.
# drop_first=True drops one level per categorical variable so the remaining
# indicators are not perfectly collinear with a regression intercept.
# dtype=float is requested explicitly: recent pandas releases emit boolean
# indicators by default, and boolean columns are excluded by
# select_dtypes(include=[np.number]) and can cause statsmodels to reject a
# DataFrame as object-typed.
df = pd.get_dummies(df, drop_first=True, dtype=float)

# Exploratory Data Analysis (EDA)
print("Basic Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nSummary Statistics:")
display(df.describe())

# Pairplot to visualize pairwise relationships between the columns
sns.pairplot(df)
plt.show()

# Correlation Heatmap - Only numeric columns
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=[np.number])  # Select only numeric columns
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

# Separate the target ('risk') from the predictors, then hold out 25% of the
# rows for testing. The fixed random_state keeps the split reproducible.
y = df['risk']
X = df.drop(columns=['risk'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2425,
)

# Standardize the predictors. The scaler is fitted on the training rows only
# and then applied to both partitions, so no test-set statistics leak into
# the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit Multiple Linear Regression Model
# The scaled arrays are wrapped in DataFrames that reuse the original column
# names and row index, so the regression summary labels each coefficient with
# its feature name (instead of x1, x2, ...) and the exog index matches y_train.
# has_constant='add' forces the intercept column to be added; the default
# ('skip') silently omits it when any existing column is already constant.
X_train_const = sm.add_constant(
    pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index),
    has_constant='add',
)
model = sm.OLS(y_train, X_train_const).fit()
print(model.summary())

# Model Evaluation
# The test design matrix is built the same way. Forcing the intercept matters
# most here: a predictor that is constant within the test split (for example a
# rare indicator) would otherwise suppress the intercept column and leave the
# matrix one column short of what the fitted model expects.
X_test_const = sm.add_constant(
    pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index),
    has_constant='add',
)
# Predictions are converted to a plain array, the type produced when the model
# is given array inputs.
y_pred = np.asarray(model.predict(X_test_const))

print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R-squared: {r2_score(y_test, y_pred):.2f}")

# Residuals Analysis
# Residuals are computed on plain arrays so the subtraction is positional.
residuals = np.asarray(y_test) - np.asarray(y_pred)

plt.figure(figsize=(8, 5))
# regplot draws the residuals themselves with a LOWESS trend line.
# sns.residplot is not used because it fits its own regression of y on x and
# plots the residuals of *that* fit; feeding it precomputed residuals would
# display residuals-of-residuals rather than the model's actual errors.
sns.regplot(x=y_pred, y=residuals, lowess=True, line_kws={'color': 'red'})
# Zero reference line: an adequate model scatters residuals evenly around it.
plt.axhline(0, color='gray', linestyle='--', linewidth=1)
plt.xlabel("Predicted Risk")
plt.ylabel("Residuals")
plt.title("Residuals vs. Predicted Values")
plt.show()

# Model Improvement - Mean Centering
# Both partitions are shifted by the training-set column means, so the test
# rows are centered using statistics taken from the training rows only.
train_column_means = X_train.mean()
X_train_centered = X_train.sub(train_column_means)
X_test_centered = X_test.sub(train_column_means)

# Fit PCA
# Keep at most 5 components, capped by the data's dimensions: PCA cannot
# extract more components than min(n_samples, n_features) and raises otherwise.
n_pca_components = min(5, *X_train_scaled.shape)
pca = PCA(n_components=n_pca_components)  # Reducing dimensions
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Fit new model after PCA
# has_constant='add' forces the intercept column to be added even if one of
# the component score columns happens to be constant.
X_train_pca_const = sm.add_constant(X_train_pca, has_constant='add')
model_pca = sm.OLS(y_train, X_train_pca_const).fit()
print(model_pca.summary())

# Score the PCA model on the held-out rows so it can be compared with the
# other models on the same test set.
X_test_pca_const = sm.add_constant(X_test_pca, has_constant='add')
y_pred_pca = model_pca.predict(X_test_pca_const)
print(f"PCA Regression R-squared: {r2_score(y_test, y_pred_pca):.2f}")

# Polynomial Regression
# Expand the standardized predictors to degree 2. The expansion is learned
# from the training matrix and then applied to both partitions.
# include_bias=False leaves out the constant column from the expansion.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Ordinary least squares on the expanded features, scored on the test rows
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
y_pred_poly = model_poly.predict(X_test_poly)

poly_r2 = r2_score(y_test, y_pred_poly)
print(f"Polynomial Regression R-squared: {poly_r2:.2f}")

# Variable Selection - Using only significant predictors
# The feature list is hard-coded; it is taken from the unscaled training
# frame, so the fitted coefficients are in the predictors' original units.
selected_features = ['length', 'cultures', 'nurse']  # Example significant variables
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# has_constant='add' forces the intercept column to be added; the default
# ('skip') omits it when an existing column is already constant.
X_train_selected_const = sm.add_constant(X_train_selected, has_constant='add')
model_selected = sm.OLS(y_train, X_train_selected_const).fit()
print(model_selected.summary())

# Score the reduced model on the held-out rows so it can be compared with the
# other models on the same test set.
X_test_selected_const = sm.add_constant(X_test_selected, has_constant='add')
y_pred_selected = model_selected.predict(X_test_selected_const)
print(f"Selected-Features Regression R-squared: {r2_score(y_test, y_pred_selected):.2f}")

# Final status message: this only prints text; nothing is saved by this line
print("Analysis Complete. Ready for Submission.")