What is Multilinear Regression?

Multilinear regression (more commonly called multiple linear regression) is an extension of simple linear regression in which there is:
				- One target (output) variable
				- Multiple input features (predictor variables)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

fname = "placement.csv"

# Look for the CSV below the current directory first, then below each ancestor
# directory, nearest ancestor first. Every root is searched recursively.
matches = []
for root in [Path(".")] + list(Path.cwd().parents):
	matches.extend(root.rglob(fname))
	if matches:
		# Only the first hit is ever used, so stop at the first root that yields
		# one. Continuing would re-walk the same tree from every ancestor, all
		# the way up to the filesystem root, without changing the selected file.
		break

if matches:
	path = matches[0]
	print(f"Found file at: {path}")
	dataset = pd.read_csv(path)
	print(dataset.head(10))
else:
	# NOTE: `dataset` is not assigned on this branch, so code that uses it later
	# will fail until the file is made available and this cell is re-run.
	print(f"File '{fname}' not found. Checked current and parent directories.")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Multiple linear regression practice (one target, several input features)
# Edit TARGET below to choose a numeric target column (e.g., 'salary').
# This cell uses `dataset` loaded in a previous cell.

import statsmodels.api as sm

# Dataset overview: shape, per-column dtypes, numeric column names, first rows.
print("Dataset shape:", dataset.shape)
print(dataset.dtypes)

# The numeric column names are collected once and reused for both the
# overview printout and the target choice below.
numeric_cols = dataset.select_dtypes(include=["number"]).columns.tolist()
print("\nNumeric columns:", numeric_cols)
print("\nFirst rows:")
display(dataset.head())

# Target selection: the regression target must be numeric, so stop here when
# the dataset offers no numeric column at all.
if not numeric_cols:
	raise ValueError("No numeric columns available for regression in the dataset.")

# Prefer 'salary' when it is numeric; otherwise fall back to the first numeric
# column. Change the assignment here to predict a different column.
if "salary" in numeric_cols:
	TARGET = "salary"
else:
	TARGET = numeric_cols[0]
print(f"\nUsing TARGET = '{TARGET}'. Edit TARGET variable above to try a different target.\n")

# Build the modeling table from a copy so `dataset` itself is left untouched.
df = dataset.copy()

# One-hot encode the categorical predictors; drop_first removes one level per
# category.
encoded_features = pd.get_dummies(df.drop(columns=[TARGET]), drop_first=True)

# Put the target next to the encoded predictors and discard every row that has
# a missing value in any of those columns.
data = pd.concat([encoded_features, df[TARGET]], axis=1).dropna()

# Split the cleaned table back into the target vector and the feature matrix.
y = data[TARGET]
X = data.drop(columns=[TARGET])

n_rows, n_features = X.shape
print(f"Final data for modeling: {n_rows} rows, {n_features} features")

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=42,
)

# Fit the linear model on the training partition only (fit returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train, y_pred_test = (lr.predict(part) for part in (X_train, X_test))

def rmse(a, b):
	"""Return the root-mean-squared error between `a` and `b`.

	Both arguments are passed unchanged to `mean_squared_error`; the square
	root of that value is returned.
	"""
	mse = mean_squared_error(a, b)
	return np.sqrt(mse)

# Report R2 and RMSE for each partition. The trailing space in "Test " keeps
# the two output lines aligned.
for split_label, actual, predicted in (
	("Train", y_train, y_pred_train),
	("Test ", y_test, y_pred_test),
):
	print(f"{split_label} R2:", r2_score(actual, predicted), "RMSE:", rmse(actual, predicted))

# Pair every feature name with its fitted coefficient ...
coef_table = pd.DataFrame({"feature": X.columns, "coefficient": lr.coef_})

# ... and rank by absolute coefficient size, largest first, so the strongest
# effects (of either sign) appear at the top.
coef_df = coef_table.sort_values(
	by="coefficient",
	key=pd.Series.abs,
	ascending=False,
)
display(coef_df.head(20))

# Statsmodels OLS summary (on training data)
# The features were one-hot encoded with pd.get_dummies, which returns boolean
# indicator columns on pandas >= 2.0. A frame mixing boolean and numeric
# columns converts to an object-dtype array, which statsmodels rejects with
# "Pandas data cast to numpy dtype of object". Casting the features to float
# first keeps the design matrix purely numeric without changing any value.
# add_constant then adds the intercept column.
X_train_sm = sm.add_constant(X_train.astype(float))
ols_model = sm.OLS(y_train, X_train_sm).fit()
print("\nOLS summary (training data):")
print(ols_model.summary())

# Diagnostic plots on the held-out data, side by side:
# left  - predicted values against actual values,
# right - distribution of the residuals with a KDE overlay.
fig, (ax_scatter, ax_resid) = plt.subplots(1, 2, figsize=(12, 4))

ax_scatter.scatter(y_test, y_pred_test, alpha=0.7)
ax_scatter.set_xlabel("Actual")
ax_scatter.set_ylabel("Predicted")
ax_scatter.set_title("Actual vs Predicted (test)")

# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test - y_pred_test
sns.histplot(residuals, kde=True, ax=ax_resid)
ax_resid.set_title("Residuals (test)")

fig.tight_layout()

^C
Note: you may need to restart the kernel to use updated packages.
