# Linear Regression

In [None]:

import pandas as pd
import numpy as np


# Load the metrics dataset from a CSV file in the working directory.
data = pd.read_csv('RegressionMetrics.csv')

# Predictor columns used as model inputs, and the column being predicted.
features = [
    'methods_per_class',
    'params_per_method',
    'cyclomatic_complexity',
    'comment_density',
    'loc_per_method',
    'public_methods',
    'nesting_depth',
    'halstead_volume',
]
target = 'overall'

# Feature matrix (one column per entry in `features`) and target vector.
X = data[features]
y = data[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate

# Fit an ordinary least-squares model on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred_lin = lin_reg.predict(X_test)

# Hold-out metrics: R², mean absolute error, and root mean squared error.
r2_lin = r2_score(y_test, y_pred_lin)
mae_lin = mean_absolute_error(y_test, y_pred_lin)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))

print("\n🔵 Linear Regression Performance:")
print(f"R² Score: {r2_lin:.4f}")
print(f"MAE: {mae_lin:.4f}")
print(f"RMSE: {rmse_lin:.4f}")

# 5-fold cross-validation over the full dataset (X, y).
# One cross_validate call scores all three metrics on the same fitted folds,
# instead of refitting every fold once per metric.
cv_scores_lin = cross_validate(
    lin_reg,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'),
)

cv_r2_lin = cv_scores_lin['test_r2'].mean()
# The error scorers are negated by scikit-learn, so flip the sign back.
cv_mae_lin = -cv_scores_lin['test_neg_mean_absolute_error'].mean()
# RMSE here is the square root of the fold-averaged MSE
# (not the average of per-fold RMSE values).
cv_rmse_lin = np.sqrt(-cv_scores_lin['test_neg_mean_squared_error'].mean())

print(f"Cross-validated R²: {cv_r2_lin:.4f}")
print(f"Cross-validated MAE: {cv_mae_lin:.4f}")
print(f"Cross-validated RMSE: {cv_rmse_lin:.4f}")







# Pull the fitted parameters out of the linear model.
intercept = lin_reg.intercept_
coefs = lin_reg.coef_

# Render the model as "overall = b0 + (b1 * f1) + ...", four decimals each.
terms = [f" + ({weight:.4f} * {name})" for weight, name in zip(coefs, features)]
equation = f"overall = {intercept:.4f}" + "".join(terms)

print("\n Linear Regression Equation:")
print(equation)


### Linear Regression


##  Visualizing the coefficients (feature importance)

In [None]:


import seaborn as sns
import matplotlib.pyplot as plt

# Build a two-column table (feature name, coefficient) for plotting.
# Coefficients are read directly from `lin_reg` so the chart always reflects
# the linear model, regardless of what the `coefs` variable is bound to when
# this cell is (re-)run.
coef_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': lin_reg.coef_,
})

# Order the bars by signed coefficient value, most positive first.
# The model is fitted on unscaled features, so the size of each coefficient
# depends on that feature's units.
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

# Horizontal bar chart with a reference line at zero to separate
# positive from negative coefficients.
# NOTE(review): recent seaborn releases may warn about `palette` without
# `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, palette='coolwarm', ax=ax)
ax.set_title('Feature Importance based on Linear Regression Coefficients')
ax.axvline(0, color='black', linewidth=0.8)
ax.grid(axis='x')
plt.show()


# Applying a natural-log transform to the features

In [11]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1. Apply the natural log to every feature.
# The 1e-8 offset keeps log() finite for zero entries (they map to roughly
# -18.4); negative entries would still produce NaN.
X_log = np.log(X + 1e-8)

# 2. Fit a linear regression on the log-transformed features (all rows).
model_log = LinearRegression()
model_log.fit(X_log, y)

# 3. Predict on the same rows the model was fitted on.
y_pred_log = model_log.predict(X_log)

# 4. R² score. This is an in-sample figure (fit and scored on the full
# dataset), so it is not directly comparable with an R² computed on a
# held-out test split.
r2_log = r2_score(y, y_pred_log)
print(f"New R² Score after log transform: {r2_log:.4f}")

# 5. Build and print the equation of the log model.
# Results are stored under log-specific names so this cell does not overwrite
# `coefs`, `intercept`, or `equation` belonging to the untransformed model.
coefs_log = model_log.coef_
intercept_log = model_log.intercept_
feature_names = X.columns

log_terms = [
    f"+ ({weight:.4f} * log({name})) "
    for weight, name in zip(coefs_log, feature_names)
]
equation_log = f"overall = {intercept_log:.4f} " + "".join(log_terms)

print("\nNew Regression Equation:")
print(equation_log)


New R² Score after log transform: 0.4473

New Regression Equation:
overall = 9.4692 + (-1.0243 * log(methods_per_class)) + (0.0279 * log(params_per_method)) + (-1.0536 * log(cyclomatic_complexity)) + (0.2572 * log(comment_density)) + (-0.3954 * log(loc_per_method)) + (1.4633 * log(public_methods)) + (-0.4576 * log(nesting_depth)) + (-1.2559 * log(halstead_volume)) 
