<h1 style="text-align:center">Build Non-Linear Models Part 1</h1>
<h2 style="text-align:center">Kaggle Submission 2</h2>
<p style="text-align:center">Robert Evans</p>
<p style="text-align:center">School of Technology & Engineering, National University</p>
<p style="text-align:center">DDS-8555: Predictive Analysis</p>
<p style="text-align:center">Dr. Mohammad Yavarimanesh</p>
<p style="text-align:center">January 26, 2025</p>

## Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the competition training data and the test data that the submission
# is built from, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e4/train.csv',
        '/kaggle/input/playground-series-s4e4/test.csv',
    )
)

## Preprocess Data

In [None]:
# Drop the id column as it's not useful for modeling.
# errors='ignore' keeps this cell idempotent: `train` is rebound in place, so
# re-running the cell would otherwise raise KeyError because 'id' is already gone.
train = train.drop(columns=['id'], errors='ignore')
# `test` itself is left untouched: its 'id' column is read again when the
# submission frame is built.
X_test = test.drop(columns=['id'])

In [None]:
# Target is the 'Rings' column; the features are every remaining column.
y = train['Rings']
X = train.drop('Rings', axis=1)

In [None]:
# Column groups used by the manual preprocessing steps:
# numerical columns are detected by their float64 dtype, the categorical one is listed by name.
numerical_features = list(X.select_dtypes(include='float64').columns)
categorical_features = ['Sex']

In [None]:
# One-hot encode categorical features.
# drop='first' omits the first category of each feature; sparse_output=False
# returns a dense array so it can be stacked with the scaled numeric columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_categorical_encoded = encoder.fit_transform(X[categorical_features])
# Only transform the test data: re-fitting here would relearn the categories
# from the test set, so the dummy columns could differ in number or order from
# the ones the model is trained on.
X_test_categorical_encoded = encoder.transform(X_test[categorical_features])

In [None]:
# Standardize numerical features.
# The scaler learns its mean and variance from the training features only.
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X[numerical_features])
# Apply the training statistics to the test data; re-fitting on the test set
# would put its features on a different scale from the one the model is trained on.
X_test_numerical_scaled = scaler.transform(X_test[numerical_features])

In [None]:
# Build the final design matrices: scaled numeric columns first, then the
# one-hot columns, joined side by side (column-wise).
X_preprocessed = np.concatenate(
    (X_numerical_scaled, X_categorical_encoded), axis=1
)
X_test_preprocessed = np.concatenate(
    (X_test_numerical_scaled, X_test_categorical_encoded), axis=1
)

## Split Data into Training and Hold-Out (Validation) Sets

In [None]:
# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
# NOTE: this rebinds `X_test`, which until now referred to the Kaggle test
# features; from here on `X_test` / `y_test` are the hold-out split, and the
# Kaggle features live on only in `X_test_preprocessed`.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit on log(1 + y); predictions are mapped back with expm1 after prediction.
y_train_log = y_train.pipe(np.log1p)

## Build the model

In [None]:
# Fit a PCA that keeps every component (n_components=None is the default);
# its explained-variance profile is what the component count is chosen from.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train)
# The hold-out rows are projected with the basis learned from the training rows.
X_test_pca = pca.transform(X_test)

In [None]:
# Pick the smallest number of components whose cumulative explained variance
# reaches the 95% threshold.
explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
# argmax on the boolean mask gives the index of the first True; +1 turns that
# zero-based index into a component count.
optimal_components = (explained_variance_ratio_cumsum >= 0.95).argmax() + 1
print(f"Optimal number of components: {optimal_components}")

In [None]:
# Use only the optimal number of components.
# random_state is pinned because, with the default svd_solver='auto', some
# scikit-learn versions select the randomized SVD solver for tall, narrow data
# when n_components is well below the feature count; without a seed the
# projection (and every downstream number) could vary slightly between runs.
pca_optimal = PCA(n_components=optimal_components, random_state=42)
X_train_pca_optimal = pca_optimal.fit_transform(X_train)
# Hold-out split and Kaggle test features are both projected with the basis
# learned from the training rows.
X_test_pca_optimal = pca_optimal.transform(X_test)
X_test_preprocessed_pca_optimal = pca_optimal.transform(X_test_preprocessed)

In [None]:
# Principal components regression: ordinary least squares on the reduced
# training features, against the log-transformed target.
linear_regression = LinearRegression().fit(X_train_pca_optimal, y_train_log)
# Trailing expression keeps the fitted estimator as the cell's displayed output.
linear_regression

In [None]:
# The model predicts log1p(Rings), so every prediction is passed through expm1
# to return to the original scale.

# Hold-out split predictions (used for evaluation).
y_pred_log = linear_regression.predict(X_test_pca_optimal)
y_pred_pca = np.expm1(y_pred_log)

# Kaggle test-set predictions (used for the submission).
y_test_pred_log = linear_regression.predict(X_test_preprocessed_pca_optimal)
y_test_pred_pca = np.expm1(y_test_pred_log)

In [None]:
# Score the back-transformed hold-out predictions: MSE first, then R^2.
pca_mse, pca_r2 = (
    metric(y_test, y_pred_pca)
    for metric in (mean_squared_error, r2_score)
)

In [None]:
# Report the hold-out metrics and the cumulative variance captured by the
# retained components, one item per line.
print(
    "Principal Components Regression Results:",
    f"Mean Squared Error: {pca_mse}",
    f"R^2 Score: {pca_r2}",
    "Cumulative Explained Variance Ratio by PCA Components:",
    np.cumsum(pca_optimal.explained_variance_ratio_),
    sep="\n",
)

## Visualizations

In [None]:
# Cumulative explained variance of the full PCA, plotted against the number of
# components kept (x axis starts at 1 component).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    np.arange(1, explained_variance_ratio_cumsum.size + 1),
    explained_variance_ratio_cumsum,
    marker='o',
    color='green',
)
ax.set_title("Cumulative Explained Variance by PCA Components")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("Cumulative Explained Variance Ratio")
ax.grid()
plt.show()

In [None]:
# Hold-out predictions against the true ring counts; the dashed red diagonal
# marks where predicted == actual.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_pca, alpha=0.5, color='orange')
ring_min, ring_max = y_test.min(), y_test.max()
ax.plot([ring_min, ring_max], [ring_min, ring_max], 'r--')
ax.set_title("Principal Components Regression: Predicted vs. Actual")
ax.set_xlabel("Actual Rings")
ax.set_ylabel("Predicted Rings")
plt.show()

## Submission

In [None]:
# Two-column submission frame: the test ids alongside the predicted ring
# counts (predictions are attached positionally, row for row).
submission = test[['id']].assign(Rings=y_test_pred_pca)

In [None]:
# Summary statistics of the submission columns, displayed as the cell output
# for a quick look at the predictions before the file is written.
submission.describe()

In [None]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV so only the frame's own columns are written.
submission.to_csv(path_or_buf='submission.csv', index=False)