# In [None]:
# =============================================================================
# Title: Predicting Critical Temperature for Superconductors Using ML
# Author: [Your Name]
# Date: [Date]
# Description: This script performs an extensive analysis—from exploratory data analysis (EDA),
# feature selection, to building, tuning, and evaluating several regression models and ensemble
# methods—for predicting the critical temperature (critical_temp) of superconductors.
# =============================================================================

# ---------------------------
# 1. Importing Libraries
# ---------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning and Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.metrics import mean_squared_error, r2_score

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor

# External library for advanced gradient boosting (if installed)
import xgboost as xgb

# Suppress every warning category so the printed report stays uncluttered.
# NOTE: this is a blanket filter; library warnings are hidden for the whole run.
import warnings
warnings.filterwarnings("ignore", category=Warning)

# Apply a single seaborn theme up front so all figures share the same look
sns.set(context="notebook", style="whitegrid")

# ---------------------------
# 2. Load and Explore the Data
# ---------------------------
# Read the raw CSV into a DataFrame
data_path = "Superconductor.csv"
df = pd.read_csv(data_path)

# Structural overview: dimensions, column labels and summary statistics
print("Dataset Shape:", df.shape)
print("Dataset Columns:\n", df.columns)
print("Dataset Description:")
summary_stats = df.describe()
print(summary_stats)

# Number of missing entries in each column
missing_per_column = df.isnull().sum()
print("Missing Values:\n", missing_per_column)

# ---------------------------
# 3. Exploratory Data Analysis (EDA)
# ---------------------------
# Histogram (with KDE overlay) of the target column 'critical_temp',
# drawn on an explicit Figure/Axes pair and written to disk before display
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['critical_temp'], kde=True, bins=30, ax=ax)
ax.set_title("Distribution of Critical Temperature")
ax.set_xlabel("Critical Temperature")
ax.set_ylabel("Frequency")
fig.savefig("eda_critical_temp_distribution.png")
plt.show()

# Correlation heatmap for feature relationships.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# text columns in recent pandas releases, so they are filtered out first.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Per-cell annotations are only legible on small matrices; on wide data they
# overlap into noise and make rendering very slow, so they are switched off
# above this size.
MAX_ANNOTATED_COLUMNS = 20
annotate_cells = correlation_matrix.shape[0] <= MAX_ANNOTATED_COLUMNS

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=annotate_cells,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.savefig("eda_correlation_heatmap.png")
plt.show()

# Pairwise scatter plots for a small subset of columns.
# The subset is simply the first six columns of the frame (not a random
# draw), which keeps the grid small enough to read.
sample_columns = df.columns[:6]  # Modify as needed for clarity
sample_frame = df[sample_columns]
sns.pairplot(sample_frame)
plt.savefig("eda_pairplot_sample.png")
plt.show()

# ---------------------------
# 4. Data Preprocessing and Feature Scaling
# ---------------------------
# Split the frame into the feature matrix (X) and the target vector (y)
X = df.drop(columns='critical_temp')
y = df['critical_temp']

# Hold out 20% of the rows (fixed seed) to measure generalization
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardize features: statistics are learned from the training rows only
# and then re-used unchanged on the held-out rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# 5. Feature Selection
# ---------------------------
# Univariate ranking: with k='all' SelectKBest keeps every feature and is used
# here only to obtain an f_regression score per column, sorted best-first.
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X_train_scaled, y_train)
score_table = pd.DataFrame({"Feature": X.columns, "F-Score": selector.scores_})
scores = score_table.sort_values(by="F-Score", ascending=False)
print("SelectKBest Scores:\n", scores)

# Recursive Feature Elimination around a plain linear model, keeping 10 features
lr_model = LinearRegression()
rfe = RFE(estimator=lr_model, n_features_to_select=10)
rfe.fit(X_train_scaled, y_train)
kept_mask = rfe.support_
features_rfe = X.columns[kept_mask]
print("Selected features after RFE:\n", features_rfe)

# ---------------------------
# 6. Model Building and Cross-Validation Setup
# ---------------------------
# Shared 5-fold splitter; rows are shuffled with a fixed seed so the folds are
# reproducible wherever this object is used
cv_strategy = KFold(
    shuffle=True,
    random_state=42,
    n_splits=5,
)

# Define a function to evaluate and print model performance metrics
def evaluate_model(model, X_test, y_test, *, name=None):
    """Score a fitted regressor on the given data and print a short report.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Feature matrix passed to ``model.predict``.
    y_test : array-like
        True target values compared against the predictions.
    name : str, optional
        Label printed in the report. Defaults to the model's class name.
        Pass an explicit label for wrapped estimators (e.g. a ``Pipeline``),
        which would otherwise all be reported under the wrapper's class name.

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed on the supplied data.
    """
    # Keep the historical output unless the caller supplies a label
    label = model.__class__.__name__ if name is None else name

    preds = model.predict(X_test)
    # RMSE is derived from MSE so the error is in the target's own units
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    print(f"Model: {label}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2:.2f}")
    print("-" * 30)
    return rmse, r2

# ---------------------------
# 7. Modeling: Building Various Regressors and Their Ensembles
# ---------------------------

# Registry of candidate regressors, keyed by display name. Built as a single
# literal; insertion order is linear baselines, then tree-based models, then SVR.
models = {
    # (a) Baseline linear models
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    # (b) Non-linear and tree-based models
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Extra Trees": ExtraTreesRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    # (c) Support vector regression
    "SVR": SVR(),
}

# ---------------------------
# 8. Hyperparameter Tuning via GridSearchCV for Selected Models
# ---------------------------
# Search spaces for the two ensembles tuned in this section. Keys are plain
# estimator constructor arguments for RandomForest and GradientBoosting.
rf_param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
gb_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Two-step pipelines: standardize the features, then fit the regressor.
# The step names ('scaler', 'regressor') are what the grid-search parameter
# prefixes refer to.
# NOTE(review): tree-based models are typically unaffected by feature scaling,
# so the scaler step may be redundant here — kept to preserve behavior.
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42)),
])
gb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

def _run_grid_search(label, pipeline, grid):
    """Grid-search `pipeline` over `grid` on the training split and return the fitted search.

    Each key of `grid` is prefixed with 'regressor__' so it addresses the
    pipeline's regressor step. Candidates are ranked by negative MSE under
    the shared cross-validation splitter.
    """
    print(f"Tuning {label}...")
    prefixed_grid = {f"regressor__{param}": values for param, values in grid.items()}
    search = GridSearchCV(
        pipeline,
        param_grid=prefixed_grid,
        cv=cv_strategy,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    print(f"Best parameters for {label}:", search.best_params_)
    return search


# Tune Random Forest first, then Gradient Boosting
rf_grid = _run_grid_search("Random Forest", rf_pipeline, rf_param_grid)
gb_grid = _run_grid_search("Gradient Boosting", gb_pipeline, gb_param_grid)

# Register the best pipeline from each search alongside the untuned models
models["Random Forest Tuned"] = rf_grid.best_estimator_
models["Gradient Boosting Tuned"] = gb_grid.best_estimator_

# ---------------------------
# 9. Evaluating Models Using Cross-Validation
# ---------------------------
model_results = {}
for name, model in models.items():
    # If the model is not a pipeline, include scaling manually
    if name in
