**Linear Regression**
- Feature engineering
- Log-transformed target
- Proper preprocessing pipeline
- Regularization (Ridge & Lasso)
- Correct evaluation

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## 2. Load Dataset

In [2]:
df = pd.read_csv('C:\\Users\\LEGION\\Downloads\\hackathon_ribo\\hackathon\\notebooks\\Data_Handeling\\transformed_data\\final_data_with_changes.csv')
df.head()

Unnamed: 0,price_total,room_type,is_shared_room,is_private_room,max_guests,is_superhost,is_multi_listing,is_business_listing,cleanliness_score,guest_satisfaction_score,...,Safety_Index,Monthly_Average_Net_salary,Meal_at_Inexpensive_Restaurant,Taxi_price_per_Km,Monthly_Basic_Utilities,Monthly_Rent_One_Bedroom_CC,Monthly_Rent_One_Bedroom_OCC,Monthly_Rent_Three_Bedroom_CC,Monthly_Rent_Three_Bedroom_OCC,proximity_index
0,93.788179,Private room,0,1,2.0,1,1,0,10.0,93.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,0.225448
1,822.169053,Private room,0,1,4.0,1,0,0,8.0,85.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.113955
2,104.351833,Private room,0,1,2.0,1,0,1,9.0,87.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,0.184291
3,1031.986037,Private room,0,1,4.0,1,0,1,9.0,90.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.110275
4,1078.983564,Private room,0,1,2.0,0,0,0,10.0,98.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.036462


## 3. Feature Engineering

In [3]:
# Interaction features
df['guests_per_room'] = df['max_guests'] / (df['num_bedrooms'] + 1)
df['salary_distance'] = df['Monthly_Average_Net_salary'] * df['distance_city_center']

# Log-transform target
df['price_log'] = np.log1p(df['price_total'])

## 4. Train-Test Split

In [4]:
X = df.drop(['price_total', 'price_log'], axis=1)
y = df['price_log']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## 5. Preprocessing Pipeline

In [5]:
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features)
])

## 6. Linear Regression Model

In [6]:
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

lr_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## 7. Evaluation

In [7]:
y_pred_log = lr_pipeline.predict(X_test)
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print('Linear Regression R2:', r2_score(y_test, y_pred_log))
print('MAE:', mean_absolute_error(y_test_actual, y_pred_actual))
print('RMSE:', np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)))

Linear Regression R2: 0.8110540743858836
MAE: 105.28763411568951
RMSE: 366.55571795767986


## 8. Ridge Regression (Regularization)

In [8]:
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

ridge_pipeline.fit(X_train, y_train)
ridge_pred = ridge_pipeline.predict(X_test)
print('Ridge R2:', r2_score(y_test, ridge_pred))

Ridge R2: 0.8103364305436


## 9. Lasso Regression (Feature Selection)

In [9]:
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.001))
])

lasso_pipeline.fit(X_train, y_train)
lasso_pred = lasso_pipeline.predict(X_test)
print('Lasso R2:', r2_score(y_test, lasso_pred))

Lasso R2: 0.8094087534008549


## 10. Cross-Validation

In [10]:
cv_scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
cv_scores.mean()



np.float64(-34.81345888195675)

## 11. Conclusion
- Feature engineering improves linear signal
- Log target stabilizes variance
- Ridge reduces multicollinearity
- Linear Regression remains interpretable