In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, classification_report
from sklearn.model_selection import train_test_split
import numpy as np

def a1(file):
    df = pd.read_excel(file)
    
    # Splitting the data into train and test sets
    X = df[['embed_0']]
    y = df['output']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Linear Regression
    reg = LinearRegression().fit(X_train, y_train)
    y_train_pred_reg = reg.predict(X_train)
    y_test_pred_reg = reg.predict(X_test)
    
    # Classification
    y_train_class = y_train.apply(lambda x: 1 if x > 4 else 0)
    y_test_class = y_test.apply(lambda x: 1 if x > 4 else 0)
    clf = LogisticRegression().fit(X_train, y_train_class)
    y_train_pred_clf = clf.predict(X_train)
    y_test_pred_clf = clf.predict(X_test)
    
    return {
        "reg_model": reg,
        "X_train": X_train, "X_test": X_test,
        "y_train": y_train, "y_test": y_test,
        "y_train_pred_reg": y_train_pred_reg, "y_test_pred_reg": y_test_pred_reg,
        "clf_model": clf,
        "y_train_class": y_train_class, "y_test_class": y_test_class,
        "y_train_pred_clf": y_train_pred_clf, "y_test_pred_clf": y_test_pred_clf
    }

def a2(results):
    # Linear Regression Metrics
    mse_train = mean_squared_error(results["y_train"], results["y_train_pred_reg"])
    mse_test = mean_squared_error(results["y_test"], results["y_test_pred_reg"])
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)
    mape_train = mean_absolute_percentage_error(results["y_train"], results["y_train_pred_reg"])
    mape_test = mean_absolute_percentage_error(results["y_test"], results["y_test_pred_reg"])
    r2_train = r2_score(results["y_train"], results["y_train_pred_reg"])
    r2_test = r2_score(results["y_test"], results["y_test_pred_reg"])

    return {
        "mse_train": mse_train, "mse_test": mse_test,
        "rmse_train": rmse_train, "rmse_test": rmse_test,
        "mape_train": mape_train, "mape_test": mape_test,
        "r2_train": r2_train, "r2_test": r2_test
    }

def a3(file):
    df = pd.read_excel(file)
    
    # Splitting the data into train and test sets
    X = df.drop(columns=['output'])  # Use all attributes except 'output'
    y = df['output']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Linear Regression
    reg = LinearRegression().fit(X_train, y_train)
    y_train_pred_reg = reg.predict(X_train)
    y_test_pred_reg = reg.predict(X_test)
    
    # Classification
    y_train_class = y_train.apply(lambda x: 1 if x > 4 else 0)
    y_test_class = y_test.apply(lambda x: 1 if x > 4 else 0)
    clf = LogisticRegression().fit(X_train, y_train_class)
    y_train_pred_clf = clf.predict(X_train)
    y_test_pred_clf = clf.predict(X_test)
    
    results = {
        "reg_model": reg,
        "X_train": X_train, "X_test": X_test,
        "y_train": y_train, "y_test": y_test,
        "y_train_pred_reg": y_train_pred_reg, "y_test_pred_reg": y_test_pred_reg,
        "clf_model": clf,
        "y_train_class": y_train_class, "y_test_class": y_test_class,
        "y_train_pred_clf": y_train_pred_clf, "y_test_pred_clf": y_test_pred_clf
    }
    
    metrics = a2(results)
    
    # Print Linear Regression Results
    print("Linear Regression with Multiple Features:")
    print("Coefficient:", results["reg_model"].coef_)
    print("Intercept:", results["reg_model"].intercept_)
    print("First 5 Predictions (Train):", results["y_train_pred_reg"][:5])
    print("First 5 Predictions (Test):", results["y_test_pred_reg"][:5])
    
    # Print Linear Regression Metrics
    print("\nLinear Regression Metrics with Multiple Features:")
    print("Train MSE:", metrics["mse_train"])
    print("Test MSE:", metrics["mse_test"])
    print("Train RMSE:", metrics["rmse_train"])
    print("Test RMSE:", metrics["rmse_test"])
    print("Train MAPE:", metrics["mape_train"])
    print("Test MAPE:", metrics["mape_test"])
    print("Train R²:", metrics["r2_train"])
    print("Test R²:", metrics["r2_test"])
    
    # Print Classification Results
    print("\nClassification with Multiple Features:")
    print("Coefficient:", results["clf_model"].coef_)
    print("Intercept:", results["clf_model"].intercept_)
    print("First 5 Predictions (Train):", results["y_train_pred_clf"][:5])
    print("First 5 Predictions (Test):", results["y_test_pred_clf"][:5])
    
    # Print Classification Report
    print("\nClassification Report (Train) with Multiple Features:")
    print(classification_report(results["y_train_class"], results["y_train_pred_clf"]))
    print("\nClassification Report (Test) with Multiple Features:")
    print(classification_report(results["y_test_class"], results["y_test_pred_clf"]))

def main():
    file = r"C:\Users\Admin\Downloads\training_mathbert 4.xlsx"
    
    # Train the models and get predictions for single attribute
    print("Single Attribute")
    results = a1(file)
    metrics = a2(results)
    
    # Train the models and get predictions for multiple attributes
    print("\nMultiple Attributes")
    a3(file)

if __name__ == "__main__":
    main()


Single Attribute

Multiple Attributes
Linear Regression with Multiple Features:
Coefficient: [125994.29546869 116163.97739098 125428.8726733  126693.19071667
 127851.71527219 121743.87100719 118826.78054845 124321.19371499
 127388.37865968 130890.98031079 125174.53063691 122514.97808261
 126376.98707324 133173.29365451 124189.061878   128723.07806875
 116439.53676112 116962.44394161 100958.60201518 124427.35081834
  98341.14418311 128603.35393494 125051.82430029 132922.07705101
 121694.27517962 126475.05224921 115693.7377141  126145.63384135
 127910.82085974 173761.5505393  136493.87820196 127655.11681159
 121810.83187495 138242.33733351 121587.24828824 127540.09721321
 122190.30711467 133379.69669525 131739.67334345 134485.72467118
 132795.81772781 123157.36225309 130789.38898051 127206.69510942
 128985.69388029 127404.75417478 124582.84682939 127560.52866177
 138012.46636972 120306.38374853 128507.21381102 126167.09402043
 125040.46508338 124472.5291117  130626.4116541  118389.695908