In [19]:
!pip install pandas 
!pip install numpy
!pip install -U scikit-learn




In [20]:
import sys
import numpy as np 
import pandas as pd 
from pathlib import Path 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.compose import ColumnTransformer, make_column_selector as selector 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import Ridge, LinearRegression 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
import joblib


In [43]:
def main(csv_path: str, target_col: str, model_out: str="reg_model.joblib"):
    #----- LOAD 
    df = pd.read_csv(csv_path)
    print(f"\nLoaded: {csv_path}  shape={df.shape}")

    #-----2) Basic cleaning 
    df.columns = (
        df.columns.str.strip().str.replace(r"\s+", "_", regex=True)
    )

    #drop exact duplicates 
    before = len(df)
    df = df.drop_duplicates()
    print(f'Dropped duplicates:{before - len(df)}')

    #ensure target exists and drop row missing target 
    if target_col not in df.columns: 
        raise ValueError(f"Target column '{target_col}' not found. Available: {df.columns.tolist()}")
    
    df = df.dropna(subset=[target_col])

    #---- split features target 
    x = df.drop(columns=[target_col])
    y = df[target_col].astype(float)

    #---feature types 
    num_sel = selector(dtype_include=np.number)
    cat_sel = selector(dtype_exclude=np.number) 

    #-----preprocessing pipelines 
    numeric_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])


    preprocessor = ColumnTransformer([
        ("num", numeric_pipe, num_sel),
        ("cat", categorical_pipe, cat_sel)
    ])

    #-----Chose model 
    model = Ridge(alpha=1.0, random_state=0)
    pipe = Pipeline([
        ("preprocess",preprocessor),
        ("model", model)
    ])

    #----train/test split 
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    #-----fit 
    pipe.fit(X_train, y_train)

    #---evaluate 
    pred = pipe.predict(X_test)
    r2 = r2_score(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred)

    print("\n=== Test Metrics ===")
    print(f"R²   : {r2:.4f}")
    print(f"MAE  : {mae:.4f}")
    print(f"RMSE : {rmse:.4f}")
    
    cv_r2 = cross_val_score(pipe, x, y, cv=5, scoring="r2")
    print(f"\n5-fold CV R²: mean={cv_r2.mean():.4f}  std={cv_r2.std():.4f}")


    #inspect top features, After fitting we can get feature names and coefficients
    try: 
        lin = pipe.named_steps['model']
        if hasattr(lin, 'coef_'):
            #build feature name list from preprocessing
            pre = pipe.named_steps['preprocess']
            num_cols = pre.transformers_[0][2] if isinstance(pre.transformers_[0][2], list) else num_sel(x)
            cat_cols = pre.transformers_[1][2] if isinstance(pre.transformers_[1][2], list) else cat_sel(x)
            cat_feature_names=[] 

            if len(cat_cols) > 0:
                ohe = pre.named_transformers_["cat"].named_steps["onehot"]
                cat_feature_names = ohe.get_feature_names_out(cat_cols).tolist()
        feat_names = list(num_cols) + cat_feature_names

        coefs = pd.DataFrame({
                "feature": feat_names,
                "coef": lin.coef_.ravel() if lin.coef_.ndim > 1 else lin.coef_
            }).sort_values(by='coef', key=lambda s: s.abs(), ascending=False)
        

        print("\n=== Top 10 |coef| features ===")
        print(coefs.head(10).to_string(index=False))





    except Exception as e:
        print(f"\n[Info] Could not list coefficients: {e}")

    joblib.dump(pipe, model_out)
    print(f"\nSaved model to: {model_out}")


    

    

if __name__ == "__main__":
    main("files\sample_regression_data.csv", "salary")




Loaded: files\sample_regression_data.csv  shape=(50, 5)
Dropped duplicates:0

=== Test Metrics ===
R²   : -0.2025
MAE  : 25258.6627
RMSE : 787490254.7327

5-fold CV R²: mean=-0.3034  std=0.3276

=== Top 10 |coef| features ===
                    feature         coef
         city_San Francisco  9637.616838
    education_level_Masters  6382.712473
              city_New York -5673.639160
                city_Austin -3963.977678
education_level_High School -2746.885089
  education_level_Bachelors -2542.333887
           experience_years  1202.939952
        education_level_PhD -1093.493496
                        age  -228.072566

Saved model to: reg_model.joblib


  main("files\sample_regression_data.csv", "salary")
