# Patient Recovery Index Prediction

**Objective:** Predict patient Recovery Index using Linear Regression  
**Strategy:** Test different random_states to find optimal train/test split  
**Approach:** Simple preprocessing with no complex feature engineering

## Step 1: Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only know 'seaborn-darkgrid'. Use whichever name this
# install provides so the notebook does not abort with OSError on an older
# Matplotlib. If neither exists, the Matplotlib default style stays active.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# Must come after the style sheet, which would otherwise reset the colour cycle.
sns.set_palette("husl")

print("Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## Step 2: Load Dataset

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

print("Data loaded successfully")
for split_label, frame in (("Training", train), ("Test", test)):
    print(f"{split_label} data shape: {frame.shape}")

# Keep an independent copy of the test identifiers; they are needed again
# when the submission file is assembled.
test_ids = test['Id'].copy()

## Step 3: Exploratory Data Analysis

In [None]:
# Textual profile of the training frame: preview, dtypes, summary
# statistics and per-column null counts.
rule = "=" * 70
print(rule, "DATASET OVERVIEW", rule, sep="\n")

print("\nFirst 5 rows:")
preview = train.head()
display(preview)

print("\nData info:")
train.info()

print("\nStatistical summary:")
summary_stats = train.describe()
display(summary_stats)

print("\nMissing values:")
missing_per_column = train.isnull().sum()
print(missing_per_column)

## Step 4: Data Visualizations

In [None]:
# Build one 3x3 overview figure of the training data and save it to disk.
# The pyplot state-machine API is used throughout: each plt.subplot(3, 3, k)
# call selects panel k, and the plt.* calls that follow draw into that panel,
# so statement order matters.
fig = plt.figure(figsize=(18, 12))

# Panel 1: histogram of the target (Recovery Index)
plt.subplot(3, 3, 1)
plt.hist(train['Recovery Index'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Recovery Index', fontsize=12, fontweight='bold')
plt.xlabel('Recovery Index')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# Panel 2: box plot of the target
plt.subplot(3, 3, 2)
plt.boxplot(train['Recovery Index'], vert=True, patch_artist=True,
            boxprops=dict(facecolor='lightblue', alpha=0.7))
plt.title('Recovery Index - Box Plot', fontsize=12, fontweight='bold')
plt.ylabel('Recovery Index')
plt.grid(True, alpha=0.3)

# Numeric feature columns, excluding the identifier and the target.
# NOTE: numerical_cols is reused by the correlation analysis in Step 5.
numerical_cols = train.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['Id', 'Recovery Index']]

# Panel 3: correlation heatmap of the numeric features plus the target
plt.subplot(3, 3, 3)
correlation_data = train[numerical_cols + ['Recovery Index']].corr()
sns.heatmap(correlation_data, annot=True, fmt='.2f', cmap='coolwarm', 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Panels 4-6: histograms of at most the first three numeric features
# (enumerate starts at 4 so idx is directly the panel number)
for idx, col in enumerate(numerical_cols[:3], start=4):
    plt.subplot(3, 3, idx)
    plt.hist(train[col], bins=25, color='coral', edgecolor='black', alpha=0.7)
    plt.title(f'Distribution: {col}', fontsize=10, fontweight='bold')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)

# Panel 7: scatter of the first numeric feature against the target
# (skipped when there are no numeric features)
if len(numerical_cols) > 0:
    plt.subplot(3, 3, 7)
    plt.scatter(train[numerical_cols[0]], train['Recovery Index'], 
                alpha=0.5, c='purple', edgecolors='black', linewidth=0.5)
    plt.title(f'{numerical_cols[0]} vs Recovery Index', fontsize=10, fontweight='bold')
    plt.xlabel(numerical_cols[0])
    plt.ylabel('Recovery Index')
    plt.grid(True, alpha=0.3)

# Object-dtype (categorical) columns, excluding the identifier
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col != 'Id']

# Panels 8-9 are drawn only when a categorical column exists, and only for
# the first one; otherwise those two grid cells stay empty.
if len(categorical_cols) > 0:
    # Panel 8: number of rows per category
    plt.subplot(3, 3, 8)
    category_counts = train[categorical_cols[0]].value_counts()
    plt.bar(range(len(category_counts)), category_counts.values, 
            color='seagreen', edgecolor='black', alpha=0.7)
    plt.title(f'{categorical_cols[0]} Distribution', fontsize=10, fontweight='bold')
    plt.xlabel('Category')
    plt.ylabel('Count')
    plt.xticks(range(len(category_counts)), category_counts.index, rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')
    
    # Panel 9: target distribution per category, drawn by pandas into the
    # currently selected axes
    plt.subplot(3, 3, 9)
    train.boxplot(column='Recovery Index', by=categorical_cols[0], 
                  ax=plt.gca(), patch_artist=True)
    plt.title(f'Recovery Index by {categorical_cols[0]}', fontsize=10, fontweight='bold')
    # pandas adds its own figure-level title when `by=` is used; blank it
    plt.suptitle('')
    plt.xlabel(categorical_cols[0])
    plt.ylabel('Recovery Index')
    plt.xticks(rotation=45, ha='right')

# Save the figure before showing it
plt.tight_layout()
plt.savefig('data_exploration.png', dpi=300, bbox_inches='tight')
plt.show()
print("Visualizations saved as 'data_exploration.png'")

## Step 5: Feature Correlation Analysis

In [None]:
rule = "=" * 70
print("\n" + rule)
print("FEATURE CORRELATION WITH RECOVERY INDEX")
print(rule)

# Correlation of every numeric feature (and the target itself) with the
# target, ordered from most positive to most negative.
corr_matrix = train[numerical_cols + ['Recovery Index']].corr()
correlations = corr_matrix['Recovery Index'].sort_values(ascending=False)
print("\nFeatures ranked by correlation:")
print(correlations)

# Horizontal bar chart of the feature correlations. The target's
# self-correlation is dropped first; positive bars are green, all others red.
plt.figure(figsize=(10, 6))
correlations_without_target = correlations.drop('Recovery Index')
colors = np.where(correlations_without_target.values > 0, 'green', 'red').tolist()
bar_positions = range(len(correlations_without_target))
plt.barh(
    bar_positions,
    correlations_without_target.values,
    color=colors,
    alpha=0.7,
    edgecolor='black',
)
plt.yticks(bar_positions, correlations_without_target.index)
plt.xlabel('Correlation Coefficient')
plt.title('Feature Correlations with Recovery Index', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('feature_correlations.png', dpi=300, bbox_inches='tight')
plt.show()
print("Correlation plot saved as 'feature_correlations.png'")

## Step 6: Define Testing Function

In [None]:
def test_config(random_state, test_size=0.2, data=None, cat_col=2):
    """
    Fit and score a Linear Regression on one random train/validation split.

    The 'Id' column is dropped, the rows are split, and the preprocessing is
    fitted on the training portion only: one categorical feature is
    label-encoded and every feature is standardized. The validation portion
    is transformed with those fitted objects and used to compute the RMSE.

    Parameters:
    -----------
    random_state : int
        Seed passed to train_test_split; decides which rows are held out.
    test_size : float
        Fraction of rows held out for validation (default 0.2).
    data : pandas.DataFrame, optional
        Frame holding 'Id', 'Recovery Index' and the feature columns.
        Defaults to the module-level ``train`` frame, which is what the
        function always used before this parameter existed.
    cat_col : int
        Positional index, within the feature matrix (after 'Id' and
        'Recovery Index' are removed), of the categorical column to
        label-encode. Defaults to 2, the previously hard-coded position.

    Returns:
    --------
    rmse, le, scaler, lr
        Validation RMSE, the fitted LabelEncoder, the fitted StandardScaler
        and the fitted LinearRegression.

    Raises:
    -------
    ValueError
        From LabelEncoder.transform if the validation split contains a
        category that does not occur in the training split.
    """
    frame = train if data is None else data
    X = frame.drop(columns=['Id', 'Recovery Index']).values
    y = frame['Recovery Index'].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Label encode the categorical feature. The encoder is fitted on the
    # training rows only so the validation rows stay unseen.
    le = LabelEncoder()
    X_train[:, cat_col] = le.fit_transform(X_train[:, cat_col])
    X_val[:, cat_col] = le.transform(X_val[:, cat_col])

    # Standardize features using training-set mean and variance
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Train model
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)

    # Calculate RMSE on the held-out rows
    y_pred = lr.predict(X_val_scaled)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    return rmse, le, scaler, lr

print("Testing function defined")

## Step 7: Test Multiple Random States

In [None]:
rule = "=" * 70
print("\n" + rule)
print("TESTING RANDOM STATES")
print(rule)
print("Strategy: Test 100 different random_states to find optimal split\n")

# Score seeds 0..99 and order the (seed, rmse) pairs from lowest to highest
# validation RMSE. The sort is stable, so ties keep ascending seed order.
results = sorted(
    ((seed, test_config(seed)[0]) for seed in range(100)),
    key=lambda pair: pair[1],
)

# Table of the ten lowest-RMSE seeds
print("\nTOP 10 BEST RANDOM STATES:\n")
print(f"{'Rank':<8} {'Random State':<15} {'Val RMSE':<12} {'Notes':<30}")
print("-" * 70)

for rank, (seed, score) in enumerate(results[:10], start=1):
    note = "BEST" if rank == 1 else ""
    print(f"{rank:<8} {seed:<15} {score:<12.4f} {note:<30}")

# First and last entries of the sorted list are the extremes
best_rs, best_rmse = results[0]
worst_rs, worst_rmse = results[-1]
spread = worst_rmse - best_rmse

print("\n" + rule)
print(f"BEST: random_state={best_rs} with Val RMSE={best_rmse:.4f}")
print(f"WORST: random_state={worst_rs} with Val RMSE={worst_rmse:.4f}")
print(f"Improvement: {spread:.4f} ({(spread / worst_rmse * 100):.2f}%)")
print(rule)

## Step 8: Visualize Random State Performance

In [None]:
# RMSE values in the same order as `results` (best to worst)
rmse_values = [score for _, score in results]
mean_rmse = np.mean(rmse_values)
best_label = f'Best: {best_rmse:.4f}'

plt.figure(figsize=(14, 6))

# Left panel: RMSE against rank, with the best and worst values marked
rank_ax = plt.subplot(1, 2, 1)
rank_ax.plot(range(len(results)), rmse_values, marker='o', markersize=3, linewidth=1, alpha=0.7)
rank_ax.axhline(y=best_rmse, color='green', linestyle='--', linewidth=2, label=best_label)
rank_ax.axhline(y=worst_rmse, color='red', linestyle='--', linewidth=2, label=f'Worst: {worst_rmse:.4f}')
rank_ax.set_title('RMSE vs Random State (Sorted)', fontsize=12, fontweight='bold')
rank_ax.set_xlabel('Rank (Best to Worst)')
rank_ax.set_ylabel('Validation RMSE')
rank_ax.legend()
rank_ax.grid(True, alpha=0.3)

# Right panel: histogram of the RMSE values, with the best and mean marked
hist_ax = plt.subplot(1, 2, 2)
hist_ax.hist(rmse_values, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
hist_ax.axvline(x=best_rmse, color='green', linestyle='--', linewidth=2, label=best_label)
hist_ax.axvline(x=mean_rmse, color='orange', linestyle='--', linewidth=2,
                label=f'Mean: {mean_rmse:.4f}')
hist_ax.set_title('Distribution of Validation RMSE', fontsize=12, fontweight='bold')
hist_ax.set_xlabel('Validation RMSE')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('random_state_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics over all tested seeds
print("\nPerformance Statistics:")
print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Median RMSE: {np.median(rmse_values):.4f}")
print(f"Std Dev: {np.std(rmse_values):.4f}")
print(f"Range: {worst_rmse - best_rmse:.4f}")

## Step 9: Train Final Model

In [None]:
print("\n" + "="*70)
print(f"TRAINING FINAL MODEL WITH random_state={best_rs}")
print("="*70)

# Prepare data
train_data = train.drop('Id', axis=1)
feature_cols = [col for col in train_data.columns if col != 'Recovery Index']

# Select the test features by name, in the training column order. Building
# the matrix positionally from test.drop('Id') would silently feed the wrong
# columns to the encoder and scaler if test.csv orders its columns
# differently or carries extra ones; a missing feature now raises KeyError.
test_data = test[feature_cols]

X = train_data[feature_cols].values
y = train_data['Recovery Index'].values
X_test = test_data.values

# Split with best random_state
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=best_rs
)

# Label encode the categorical feature (column 2 of the feature matrix).
# The encoder is fitted on the training rows only and then applied to the
# validation and test rows.
le = LabelEncoder()
X_train[:, 2] = le.fit_transform(X_train[:, 2])
X_val[:, 2] = le.transform(X_val[:, 2])
X_test[:, 2] = le.transform(X_test[:, 2])

# Standardize using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Validate on the held-out rows
y_val_pred = lr.predict(X_val_scaled)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)

print(f"\nValidation RMSE: {val_rmse:.4f}")
print(f"Validation R² Score: {val_r2:.4f}")

# Predict on test set
predictions = lr.predict(X_test_scaled)

print(f"\nPrediction Statistics:")
print(f"Min: {predictions.min():.2f}")
print(f"Max: {predictions.max():.2f}")
print(f"Mean: {predictions.mean():.2f}")
print(f"Median: {np.median(predictions):.2f}")

## Step 10: Create Submission File

In [None]:
# Assemble the two-column submission table: one predicted Recovery Index
# per test Id.
output_file = 'submission_best_random.csv'
submission = pd.DataFrame({'Id': test_ids, 'Recovery Index': predictions})

# Write the table without the DataFrame index column
submission.to_csv(output_file, index=False)

rule = "=" * 70
print("\n" + rule)
print("SUBMISSION FILE CREATED")
print(rule)
print(f"File: {output_file}")
print(f"Shape: {submission.shape}")
print(f"Random state used: {best_rs}")
print(f"Validation RMSE: {val_rmse:.4f}")
print("\nFirst 5 predictions:")
display(submission.head())

## Step 11: Prediction Visualization

In [None]:
# 2x2 diagnostic figure: test-set predictions on the top row,
# validation-set fit quality on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(dist_ax, seq_ax), (fit_ax, resid_ax) = axes

# Top-left: histogram of the test predictions with their mean marked
pred_mean = predictions.mean()
dist_ax.hist(predictions, bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
dist_ax.axvline(x=pred_mean, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {pred_mean:.2f}')
dist_ax.set_title('Distribution of Predictions', fontsize=12, fontweight='bold')
dist_ax.set_xlabel('Predicted Recovery Index')
dist_ax.set_ylabel('Frequency')
dist_ax.legend()
dist_ax.grid(True, alpha=0.3)

# Top-right: test predictions in row order
seq_ax.plot(predictions, marker='o', markersize=2, linewidth=0.5, alpha=0.7)
seq_ax.set_title('Predictions Sequence', fontsize=12, fontweight='bold')
seq_ax.set_xlabel('Sample Index')
seq_ax.set_ylabel('Predicted Recovery Index')
seq_ax.grid(True, alpha=0.3)

# Bottom-left: validation actual vs predicted, with the y = x reference
# line spanning the range of the actual values
actual_range = [y_val.min(), y_val.max()]
fit_ax.scatter(y_val, y_val_pred, alpha=0.5, edgecolors='black', linewidth=0.5)
fit_ax.plot(actual_range, actual_range, 'r--', linewidth=2, label='Perfect Prediction')
fit_ax.set_title(f'Actual vs Predicted (Validation)\nRMSE: {val_rmse:.4f}, R²: {val_r2:.4f}',
                 fontsize=12, fontweight='bold')
fit_ax.set_xlabel('Actual Recovery Index')
fit_ax.set_ylabel('Predicted Recovery Index')
fit_ax.legend()
fit_ax.grid(True, alpha=0.3)

# Bottom-right: validation residuals (actual minus predicted) against
# the predicted value
residuals = y_val - y_val_pred
resid_ax.scatter(y_val_pred, residuals, alpha=0.5, edgecolors='black', linewidth=0.5)
resid_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
resid_ax.set_title('Residual Plot (Validation)', fontsize=12, fontweight='bold')
resid_ax.set_xlabel('Predicted Recovery Index')
resid_ax.set_ylabel('Residuals')
resid_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('final_predictions.png', dpi=300, bbox_inches='tight')
plt.show()
print("Visualization saved as 'final_predictions.png'")