In [91]:

# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np
# Import Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')
# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)
# Import Scikit Learn Libraries for Machine Learning Model Building
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,learning_curve,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor

# Multicollinearity test and treatment libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from collections import OrderedDict

In [92]:
import logging
def data_ingestion(
    path=r"C:\AIReplacementandskilldataset_predictionModel\data\raw\AIREPLACEMENT.csv",
):
    """Read the raw dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the location this notebook
        originally hard-coded, so ``data_ingestion()`` keeps working; pass a
        different path to run the pipeline on another machine or file.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raises (for example ``FileNotFoundError``).
        The failure is written to the log with its traceback, then re-raised.
    """
    logging.info("Data Ingestion Started...")
    try:
        df = pd.read_csv(path)
    except Exception:
        # Record the failure in the log file before letting it propagate;
        # otherwise the log ends at "Started..." with no hint of what went wrong.
        logging.exception("Data Ingestion Failed for %s", path)
        raise
    logging.info("Data Ingestion Completed Successfully")
    return df

In [93]:
def data_exploration(df):
    """Build a per-column descriptive statistics report for numeric columns.

    Only columns selected by ``df.select_dtypes(include="number")`` are
    profiled. Selecting by "not object" would also pick up categorical,
    datetime and similar columns, for which quantiles, skewness and kurtosis
    are undefined.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns ``Feature``, ``Minimum``,
        ``Maximum``, ``Mean``, ``Median``, ``Mode``, ``25%``, ``75%``, ``IQR``,
        ``Standard Deviation``, ``Skewness``, ``Kurtosis`` and
        ``Outlier Comment``. The columns are present even when ``df`` has no
        numeric columns (the frame is then empty).
    """
    report_columns = [
        "Feature", "Minimum", "Maximum", "Mean", "Median", "Mode",
        "25%", "75%", "IQR", "Standard Deviation", "Skewness", "Kurtosis",
        "Outlier Comment",
    ]

    numerical_cols = df.select_dtypes(include="number").columns
    stats = []
    for col in numerical_cols:
        series = df[col]

        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        # Tukey fences: values beyond 1.5 * IQR from the quartiles are outliers.
        LW = Q1 - 1.5 * IQR
        UW = Q3 + 1.5 * IQR
        has_outliers = ((series < LW) | (series > UW)).any()

        # mode() returns every tied value (sorted); compute it once and
        # report the first, or NaN when the column has no non-null values.
        modes = series.mode()

        stats.append(OrderedDict({
            "Feature": col,
            "Minimum": series.min(),
            "Maximum": series.max(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Mode": modes.iloc[0] if not modes.empty else np.nan,
            "25%": Q1,
            "75%": Q3,
            "IQR": IQR,
            "Standard Deviation": series.std(),
            "Skewness": series.skew(),
            "Kurtosis": series.kurt(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        }))

    return pd.DataFrame(stats, columns=report_columns)

def categorical_summary(df):
    """Summarise every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile.

    Returns
    -------
    pandas.DataFrame
        One row per object-dtype column with ``Feature`` (column name),
        ``Unique Values`` (distinct non-null count), ``Most Frequent`` (first
        mode, or ``None`` when the column has no mode) and ``Missing Values``
        (null count).
    """

    def _describe(name):
        # Profile a single column; mode() lists all tied values, so take the first.
        column = df[name]
        modes = column.mode()
        most_frequent = None if modes.empty else modes.iloc[0]
        return {
            "Feature": name,
            "Unique Values": column.nunique(),
            "Most Frequent": most_frequent,
            "Missing Values": column.isna().sum(),
        }

    object_columns = df.select_dtypes(include='object').columns
    return pd.DataFrame([_describe(name) for name in object_columns])


In [94]:
def split_data(data, target_col, test_size=0.3, random_state=42):
    """Separate features from the target and split both into train/test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, including the target column.
    target_col : str
        Name of the column to predict. It is removed from the feature matrix
        and returned as the target vector.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, optional
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    if target_col not in data.columns:
        raise KeyError(f"Target column {target_col!r} not found in data")

    # Honour the caller-supplied target name instead of a hard-coded column.
    X = data.drop(columns=[target_col])
    y = data[target_col]

    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )


In [95]:
def encode_categorical(X_train, X_test):
    """Label-encode object-dtype feature columns using the training split only.

    Each object-dtype column of ``X_train`` gets its own ``LabelEncoder``
    fitted on the training values. The test split is encoded with the same
    label-to-code mapping; labels that were not seen during fitting are
    encoded as ``-1``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. Neither is modified; copies are returned.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_test_encoded, encoders)`` where ``encoders``
        maps each encoded column name to its fitted ``LabelEncoder``.
    """
    X_train = X_train.copy()
    X_test = X_test.copy()

    cat_cols = X_train.select_dtypes(include="object").columns

    encoders = {}

    for col in cat_cols:
        le = LabelEncoder()

        # Fit ONLY on train so nothing from the test split leaks into the mapping.
        X_train[col] = le.fit_transform(X_train[col])

        # A label's code is its position in classes_. Build the lookup once
        # rather than calling le.transform() for every single test row.
        label_to_code = {label: code for code, label in enumerate(le.classes_)}
        X_test[col] = (
            X_test[col]
            .map(label_to_code)   # labels unseen in training become NaN ...
            .fillna(-1)           # ... and are flagged with the sentinel -1
            .astype("int64")
        )

        encoders[col] = le

    return X_train, X_test, encoders
    

In [96]:
import logging
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and target passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true targets
        the predictions are compared against.

    Returns
    -------
    tuple
        ``(rmse, r2)``. If fitting, predicting or scoring raises, the error
        is logged and ``(nan, nan)`` is returned instead of propagating.
    """
    try:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        mse = mean_squared_error(y_test, predictions)
        scores = (np.sqrt(mse), r2_score(y_test, predictions))
    except Exception as exc:
        # Best effort: one failing model must not abort a whole comparison run.
        logging.error(f"Error training/evaluating model {model.__class__.__name__}: {exc}")
        scores = (np.nan, np.nan)
    return scores

In [97]:
def compare_models(X_train, X_test, y_train, y_test, random_state=42):
    """Train a fixed set of regressors and rank them by test-set R2.

    Each model is fitted and scored through ``train_evaluate_model``.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets, forwarded unchanged to
        ``train_evaluate_model``.
    random_state : int, optional
        Seed given to the estimators that involve randomness (decision tree,
        random forest, gradient boosting, AdaBoost, XGBoost) so that repeated
        runs produce the same leaderboard.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model Name``, ``RMSE`` and ``R2 Score``, sorted by
        ``R2 Score`` in descending order.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Lasso": Lasso(),
        "Ridge": Ridge(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "SVR": SVR(),
        "KNN": KNeighborsRegressor(),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boost": GradientBoostingRegressor(random_state=random_state),
        "Ada Boost": AdaBoostRegressor(random_state=random_state),
        "XG Boost": XGBRegressor(random_state=random_state),
    }

    results = []

    for name, model in models.items():
        rmse, r2 = train_evaluate_model(
            model, X_train, X_test, y_train, y_test
        )
        results.append([name, rmse, r2])

    return pd.DataFrame(
        results, columns=["Model Name", "RMSE", "R2 Score"]
    ).sort_values("R2 Score", ascending=False)

In [98]:
def k_fold_cv(X_train, y_train, folds=10, random_state=42):
    """Cross-validate a fixed set of regressors on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``cross_val_score``.
    folds : int, optional
        Passed to ``cross_val_score`` as ``cv``.
    random_state : int, optional
        Seed given to the estimators that involve randomness (decision tree,
        random forest, gradient boosting, AdaBoost, XGBoost) so that repeated
        runs produce the same scores.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model Name``, ``CV Mean R2`` and ``CV STD`` (mean and
        standard deviation of the per-fold R2), sorted by ``CV Mean R2`` in
        descending order.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Lasso": Lasso(),
        "Ridge": Ridge(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "SVR": SVR(),
        "KNN": KNeighborsRegressor(),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boost": GradientBoostingRegressor(random_state=random_state),
        "Ada Boost": AdaBoostRegressor(random_state=random_state),
        "XG Boost": XGBRegressor(random_state=random_state),
    }

    results = []

    for name, model in models.items():
        scores = cross_val_score(
            model, X_train, y_train, cv=folds, scoring="r2"
        )
        results.append([name, scores.mean(), scores.std()])

    return pd.DataFrame(
        results, columns=["Model Name", "CV Mean R2", "CV STD"]
    ).sort_values("CV Mean R2", ascending=False)

In [99]:
def hyperparameter_tuning(X_train, y_train, folds=5, random_state=42):
    """Grid-search XGBoost and Random Forest regressors on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``GridSearchCV.fit``.
    folds : int, optional
        Passed to ``GridSearchCV`` as ``cv``.
    random_state : int, optional
        Seed given to both base estimators so that the search result is
        repeatable between runs.

    Returns
    -------
    dict
        Maps ``"XGBoost"`` and ``"Random Forest"`` to the corresponding
        ``best_estimator_`` found by the grid search.
    """
    tuning_config = {
        "XGBoost": {
            "model": XGBRegressor(random_state=random_state),
            "params": {
                "eta": [0.1, 0.2, 0.3],
                "max_depth": [3, 5, 7],
                "gamma": [0, 10, 20],
                "reg_lambda": [0, 1]
            }
        },
        "Random Forest": {
            "model": RandomForestRegressor(random_state=random_state),
            "params": {
                "max_depth": [5, 10, 15],
                # NOTE(review): the integer options assume the feature matrix
                # has at least 4 columns - confirm against the dataset.
                "max_features": ["sqrt", "log2", 3, 4]
            }
        }
    }

    best_models = {}

    for name, cfg in tuning_config.items():
        grid = GridSearchCV(
            cfg["model"],
            cfg["params"],
            cv=folds,
            scoring="r2",
            n_jobs=-1
        )
        grid.fit(X_train, y_train)

        # Keep a record of what won; the returned estimator alone does not
        # show the cross-validated score that selected it.
        logging.info(
            "Best params for %s: %s (CV R2 = %.4f)",
            name, grid.best_params_, grid.best_score_,
        )

        best_models[name] = grid.best_estimator_

    return best_models

In [100]:
def post_tuning_cv(best_models, X_train, y_train, folds=5):
    """Cross-validate already-tuned models and rank them by mean R2.

    Parameters
    ----------
    best_models : dict
        Maps a display name to an estimator.
    X_train, y_train
        Training features and target, passed to ``cross_val_score``.
    folds : int, optional
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model Name``, ``CV Mean R2`` and ``CV STD``, sorted by
        ``CV Mean R2`` in descending order.
    """

    def _mean_and_spread(estimator):
        # Per-fold R2 reduced to (mean, standard deviation).
        fold_scores = cross_val_score(
            estimator, X_train, y_train, cv=folds, scoring="r2"
        )
        return fold_scores.mean(), fold_scores.std()

    rows = [
        [label, *_mean_and_spread(estimator)]
        for label, estimator in best_models.items()
    ]
    summary = pd.DataFrame(rows, columns=["Model Name", "CV Mean R2", "CV STD"])
    return summary.sort_values("CV Mean R2", ascending=False)

In [101]:
def final_test_evaluation(best_model, X_train, X_test, y_train, y_test):
    """Refit the chosen model on the training split and score it on the test split.

    Parameters
    ----------
    best_model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted here.
    X_train, y_train
        Training features and target passed to ``best_model.fit``.
    X_test, y_test
        Held-out features passed to ``best_model.predict`` and the true
        targets the predictions are compared against.

    Returns
    -------
    tuple
        ``(rmse, r2)`` on the test split. Errors from fitting, predicting or
        scoring are not caught here.
    """
    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)

    return (
        np.sqrt(mean_squared_error(y_test, predictions)),
        r2_score(y_test, predictions),
    )

In [102]:
def main():
    """Run the whole modelling pipeline and print each stage's results.

    Stages, in order: ingest the CSV, print numerical and categorical
    summaries, split into train/test and label-encode the features, compare
    the baseline models on the test split, cross-validate them on the
    training split, grid-search the tuned models, cross-validate the tuned
    models, and finally score the best tuned model on the held-out test
    split. Progress is written to the log; results go to stdout.
    """
    logging.info("Starting data ingestion...")
    df = data_ingestion()
    logging.info("Data ingestion completed.")

    logging.info("Starting data exploration...")
    numerical_stats_report = data_exploration(df)
    categorical_stats_report = categorical_summary(df)
    logging.info("Data exploration completed.")
    print("Numerical Stats Report:")
    print(numerical_stats_report)
    print("\nCategorical Stats Report:")
    print(categorical_stats_report)

    logging.info("Starting data preprocessing...")
    X_train, X_test, y_train, y_test = split_data(
        data=df,
        target_col="automation_risk_percent",
        test_size=0.3,
        random_state=42
    )
    X_train, X_test, encoders = encode_categorical(X_train, X_test)
    logging.info("Data preprocessing completed.")

    logging.info("Starting model training...")
    results = compare_models(
        X_train, X_test, y_train, y_test
    )
    logging.info("Model training completed.")
    print("\nModel Results:")
    print(results)

    logging.info("Starting k-fold cross-validation...")
    cv_results = k_fold_cv(X_train, y_train)
    logging.info("K-fold cross-validation completed.")
    print("\nK-Fold Cross-Validation Results:")
    print(cv_results)

    logging.info("Starting hyperparameter tuning...")
    best_models = hyperparameter_tuning(X_train, y_train, folds=5)
    logging.info("Hyperparameter tuning completed.")
    print("\nBest Models from Hyperparameter Tuning:")
    print(best_models)

    # Validate the tuned models before touching the test split again.
    logging.info("Starting post-tuning cross-validation...")
    tuned_cv_results = post_tuning_cv(best_models, X_train, y_train, folds=5)
    logging.info("Post-tuning cross-validation completed.")
    print("\nPost-Tuning Cross-Validation Results:")
    print(tuned_cv_results)

    # post_tuning_cv returns rows sorted best-first, so row 0 is the winner.
    best_name = tuned_cv_results.iloc[0]["Model Name"]
    logging.info("Starting final test evaluation for %s...", best_name)
    rmse, r2 = final_test_evaluation(
        best_models[best_name], X_train, X_test, y_train, y_test
    )
    logging.info(
        "Final test evaluation completed: %s RMSE=%.4f R2=%.4f", best_name, rmse, r2
    )
    print(f"\nFinal Test Evaluation ({best_name}): RMSE = {rmse:.4f}, R2 = {r2:.4f}")


if __name__ == "__main__":
    main()

Numerical Stats Report:
                        Feature   Minimum    Maximum          Mean     Median  \
0                        job_id      0.00   14999.00   7499.500000   7499.500   
1                          year   2020.00    2026.00   2022.997200   2023.000   
2       automation_risk_percent      5.00      94.98     46.176347     46.235   
3          ai_replacement_score      4.01     113.07     46.155907     45.675   
4               skill_gap_index      0.00      99.98     50.003708     49.930   
5             salary_before_usd  30003.69  149984.06  89771.375196  89533.050   
6              salary_after_usd  19022.67  191961.21  89870.633937  88787.330   
7         salary_change_percent    -38.37      36.92      0.114268      0.150   
8   skill_demand_growth_percent    -31.88      49.79      5.020461      4.960   
9      remote_feasibility_score     10.01      99.99     54.898078     54.775   
10            ai_adoption_level      0.01      99.98     49.798269     49.435   
11  