# Linear model

With a prior EDA we preprocess our data and train our model.
Linear models are easy to interpret, but they require more data preprocessing and rely on many hypothesis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import plotly.express as px
import seaborn as sns

# Load dataset
df = pd.read_csv('train.csv') 


## 1.1 convert types

In [53]:
df[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']] = df[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']].astype('object')

## 1.2 drop variables with most values missing

In [54]:
threshold = 0.30  # 30%
df = df.loc[:, df.isna().mean() <= threshold]

threshold = 0.30  # 30%
missing_ratio = df.isna().mean()

# Columns to drop (more than 30% missing)
dropped_columns_most_missing = missing_ratio[missing_ratio > threshold].index.tolist()

# Drop from training data
df = df.loc[:, missing_ratio <= threshold]

## 1.3 Fill missing values with "Missing" for categorical variables

In [55]:
# Identify categorical columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# Fill missing values with "Missing"
df[cat_cols] = df[cat_cols].fillna("Missing")

  df[cat_cols] = df[cat_cols].fillna("Missing")


## 1.4 Fill missing values with the mean for quantitative variables

In [56]:
#save numeric means for missing test values
means = df.select_dtypes(include='number').mean()

#imputer numeric par moyenne dans train
df = df.fillna(df.select_dtypes(include='number').mean())

## 1.5 create variables for clarity HouseAge, YearsSinceRemod...

In [57]:
df_dropped = df.copy()

#'YearBuilt', 'YearRemodAdd, 'MoSold', 'YrSold' convert to quantitative
# List of columns to convert
columns_to_convert = ['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']

# Convert each to integer safely
for col in columns_to_convert:
    df_dropped.loc[:, col] = df_dropped[col].astype(int)

# Set reference year
reference_year = df_dropped['YrSold'].max()

# Create quantitative features safely
df_dropped.loc[:, 'HouseAge'] = reference_year - df_dropped['YearBuilt']
df_dropped.loc[:, 'YearsSinceRemod'] = reference_year -df_dropped['YearRemodAdd']
df_dropped.loc[:, 'TimeIndex'] = (
    (df_dropped['YrSold'] - df_dropped['YrSold'].min()) * 12 + df_dropped['MoSold']
)

df_dropped = df_dropped.drop([
    'YearBuilt', 'YearRemodAdd', 'YrSold', 'MoSold'
], axis=1) 

# Convert object columns to numeric, forcing errors to NaN
cols_to_convert = ['HouseAge', 'YearsSinceRemod', 'TimeIndex']
df_dropped[cols_to_convert] = df_dropped[cols_to_convert].apply(pd.to_numeric, errors='coerce')

## 1.6 Correct skewness

For variables with a significant skewness from the test pandas.DataFrame.skew()

In [58]:
#case 1: only binary no log

#Most houses don't have low-quality finished square footage.
#Very few do, with highly varied amounts.
cols_to_binary_only_0 = ['BsmtHalfBath', 'EnclosedPorch', 'ScreenPorch']

for col in cols_to_binary_only_0:
    df_dropped[f'Has{col}'] = (df_dropped[col] > 0).astype(int)
    df_dropped.drop(columns=[col], inplace=True)

# = 1 or not

df_dropped['HasKitchen'] = (df_dropped['KitchenAbvGr'] == 1).astype(int)
df_dropped.drop(columns=['KitchenAbvGr'], inplace=True)


#case 2: binary+log

cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']  # Replace with actual column names

for col in cols:
    df_dropped[f'Has{col}'] = (df_dropped[col] > 0).astype(int)
    df_dropped[f'{col}_log'] = np.log1p(df_dropped[col])

#case 3: just drop 
df_dropped.drop(columns=['LowQualFinSF','3SsnPorch', 'PoolArea', 'MiscVal'], inplace=True)

# case 4: log only

df_dropped.drop(columns=[ 'TimeIndex'], inplace=True)# cyclique, pas significative

# List of variables to transform
vars_to_log = ['LotFrontage', 'LotArea', 'TotalBsmtSF', 'GrLivArea',  'SalePrice', 'BsmtUnfSF', '1stFlrSF']

# Create log-transformed versions with "_log" suffix
for col in vars_to_log:
    df_dropped[col + '_log'] = np.log1p(df_dropped[col])  # log1p handles zero safely

# Drop the original columns
df_dropped.drop(columns=vars_to_log, inplace=True)

## 1.7 Cap outliers

In [59]:
# cap outliers
#outliers cap

num_features = [col for col in df_dropped.select_dtypes(include='number') if col != 'Id']
 
# Store the limits for each column
caps = {}

for col in num_features:
    q_low = df_dropped[col].quantile(0.01)
    q_high = df_dropped[col].quantile(0.99)
    
    # Save the thresholds
    caps[col] = (q_low, q_high)
    
    # Apply clipping
    df_dropped[col] = df_dropped[col].clip(lower=q_low, upper=q_high)



## 1.8 Categorical variable (merge rare categories into the most frequent)

In [60]:
def clean_categorical_variables(
    df, 
    target_col='SalePrice_log', 
    threshold=0.03, 
    tol=0.10, 
    min_count=30
):
    """
    Cleans all categorical variables:
    - Fills missing values
    - Groups rare non-predictive categories into 'Other'
    - Merges 'Other' into closest price category if too small

    Returns modified DataFrame.
    """
    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    global_mean = df[target_col].mean()
    mappings = {}
    for col in cat_cols:
        # Step 1: Fill missing
        df[col] = df[col].fillna("Missing")

        # Step 2: Frequency and stats
        freq = df[col].value_counts(normalize=True)
        rare_cats = freq[freq < threshold].index
        stats = df.groupby(col)[target_col].agg(['count', 'mean'])

        # Step 3: Decide which rare categories to keep
        keep_rare = []
        group_rare = []

        for cat in rare_cats:
            count = stats.loc[cat, 'count']
            mean = stats.loc[cat, 'mean']
            deviation = abs(mean - global_mean) / global_mean

            if count >= min_count and deviation > tol:
                keep_rare.append(cat)
            else:
                group_rare.append(cat)

        # Step 4: Replace rare with 'Other'
        df[col] = df[col].apply(lambda x: 'Other' if x in group_rare else x)

        final_merge_target = None
        # Step 5: Merge 'Other' if too small

        if 'Other' in df[col].values:
            other_mask = df[col] == 'Other'
            if other_mask.sum() < min_count:
                # Merge 'Other' into the most frequent existing category (excluding 'Other')
                final_merge_target = df.loc[~other_mask, col].value_counts().idxmax()
                df.loc[other_mask, col] = final_merge_target
                print(f"'{col}': 'Other' merged into most frequent category '{final_merge_target}'")

 # Save mapping
        mappings[col] = {
            'group_rare': group_rare,
            'final_merge_target': final_merge_target
        }

    return df, mappings



df_rare_cat, cat_mappings = clean_categorical_variables(df_dropped)


#remove variables with one category

# Step 1: Identify columns dropped from training
cols_dropped_one_cat = df_dropped.columns[df_dropped.nunique(dropna=False) <= 1]
df_dropped = df_dropped.loc[:, df_dropped.nunique(dropna=False) > 1]

# One-hot encode all object or category dtype columns
df_dropped = pd.get_dummies(df_dropped, drop_first=True)

'MSZoning': 'Other' merged into most frequent category 'RL'
'Street': 'Other' merged into most frequent category 'Pave'
'Utilities': 'Other' merged into most frequent category 'AllPub'
'LotConfig': 'Other' merged into most frequent category 'Inside'
'LandSlope': 'Other' merged into most frequent category 'Gtl'
'Condition2': 'Other' merged into most frequent category 'Norm'
'RoofMatl': 'Other' merged into most frequent category 'CompShg'
'ExterQual': 'Other' merged into most frequent category 'TA'
'HeatingQC': 'Other' merged into most frequent category 'Ex'
'GarageQual': 'Other' merged into most frequent category 'TA'


## 1.9 Chose/train model

In [63]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Step 1: Separate features and target
X_train = df_dropped.drop(columns=['Id', 'SalePrice_log'])  # Replace 'Price' with your actual target if named differently
y = df_dropped['SalePrice_log']


Cross-validated RMSE scores: [0.11869494 0.10926476 0.14761173 0.12038232 0.11104632]
Average RMSE: 0.12140001261117614


In [35]:
from sklearn.linear_model import RidgeCV

alphas = np.logspace(-4, 4, 50)
ridge_cv = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv.fit(X_train, y)

print("Best alpha:", ridge_cv.alpha_)



Best alpha: 7.9060432109076855


In [36]:
#the best
from sklearn.linear_model import Ridge
model = Ridge(alpha=7.9060432109076855)

model.fit(X_train, y)

# Step 4: Cross-validation setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 5: Run cross-validation and evaluate using negative RMSE
scores = cross_val_score(model, X_train, y, scoring='neg_root_mean_squared_error', cv=cv)

# Step 6: Print results
print("Cross-validated RMSE scores:", -scores)
print("Average RMSE:", -scores.mean())

Cross-validated RMSE scores: [0.11649896 0.1063896  0.14470909 0.11129576 0.10533639]
Average RMSE: 0.11684595984728216


In [37]:
from sklearn.linear_model import LassoCV
import numpy as np

alphas = np.logspace(-4, 4, 50)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=0)
lasso_cv.fit(X_train, y)

print("Best alpha:", lasso_cv.alpha_)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Best alpha: 0.0006551285568595509


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [14]:
#final
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.0006551285568595509)

model.fit(X_train, y)

# Step 4: Cross-validation setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 5: Run cross-validation and evaluate using negative RMSE
scores = cross_val_score(model, X_train, y, scoring='neg_root_mean_squared_error', cv=cv)

# Step 6: Print results
print("Cross-validated RMSE scores:", -scores)
print("Average RMSE:", -scores.mean())

Cross-validated RMSE scores: [0.11710181 0.10522767 0.14891445 0.10816945 0.1037963 ]
Average RMSE: 0.11664193690414784


### 1.9.1 feature selection

In [38]:
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


# Step 1: Separate features and target
X_train = df_dropped.drop(columns=['Id', 'SalePrice_log'])  # Replace 'Price' with your actual target if named differently
y = df_dropped['SalePrice_log']

# 2. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2, random_state=42)

# 3. Feature selection with LassoCV
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train, y_train)

# 4. Select features with threshold="mean"
selector = SelectFromModel(lasso_cv, threshold="mean", prefit=True)
X_train_sel = selector.transform(X_train)



In [39]:
# 5. RidgeCV model with cross-validation
alphas = np.logspace(-4, 4, 50)
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train_sel, y_train)

# 6. Evaluate on test set
y_pred = ridge_cv.predict(X_train_sel)
r2 = r2_score(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)

print(f"Best alpha (RidgeCV): {ridge_cv.alpha_}")
print(f"Test R² score: {r2:.4f}")
print(f"Test MSE: {mse:.2f}")

Best alpha (RidgeCV): 7.9060432109076855
Test R² score: 0.7928
Test MSE: 0.03


In [40]:
# 7. View selected features
selected_features = X_train.columns[selector.get_support()]
print("Selected features:", selected_features.tolist())

Selected features: ['MSSubClass', 'OverallQual', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'OpenPorchSF', 'HouseAge', 'YearsSinceRemod', 'BsmtUnfSF_log']


In [19]:
# 5. RidgeCV model with cross-validation
alphas = np.logspace(-4, 4, 50)
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train_sel, y_train)

# 6. Evaluate on test set
y_pred = lasso_cv.predict(X_train_sel)
r2 = r2_score(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)

print(f"Best alpha (RidgeCV): {lasso_cv.alpha_}")
print(f"Test R² score: {r2:.4f}")
print(f"Test MSE: {mse:.2f}")


Best alpha (RidgeCV): 0.0004498432668969444
Test R² score: 0.7928
Test MSE: 0.03


# Test set

In [41]:
# test

d_test = pd.read_csv('test.csv') 

## 2.1 convert types

In [42]:
d_test[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']] = d_test[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']].astype('object')

## 2.2 drop variables with most values missing in train

In [43]:
d_test.drop(columns=dropped_columns_most_missing, inplace=True)

## 2.3 Fill missing values with "Missing" for categorical variables

In [44]:
# Identify categorical columns
cat_cols = d_test.select_dtypes(include=['object', 'category']).columns

# Fill missing values with "Missing"
d_test[cat_cols] = d_test[cat_cols].fillna("Missing")

  d_test[cat_cols] = d_test[cat_cols].fillna("Missing")


## 2.4 Fill missing values with the mean for quantitative variables from the train

In [45]:
# Imputation dans le test set (numeric)
d_test.fillna(means, inplace=True)

## 2.5 create variables like the train

In [46]:
#'YearBuilt', 'YearRemodAdd, 'MoSold', 'YrSold' convert to quantitative
# List of columns to convert
columns_to_convert = ['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']

# Convert each to integer safely
for col in columns_to_convert:
    d_test.loc[:, col] = d_test[col].astype(int)

# Set reference year
reference_year = d_test['YrSold'].max()

# Create quantitative features safely
d_test.loc[:, 'HouseAge'] = reference_year - d_test['YearBuilt']
d_test.loc[:, 'YearsSinceRemod'] = reference_year - d_test['YearRemodAdd']


d_test = d_test.drop([
    'YearBuilt', 'YearRemodAdd', 'YrSold', 'MoSold'
], axis=1) 

# Convert object columns to numeric, forcing errors to NaN
cols_to_convert = ['HouseAge', 'YearsSinceRemod']
d_test[cols_to_convert] = d_test[cols_to_convert].apply(pd.to_numeric, errors='coerce')

## 2.6 Correct skewness

In [47]:
#case 1: only binary no log

#Most houses don't have low-quality finished square footage.
#Very few do, with highly varied amounts.
cols_to_binary_only_0 = ['BsmtHalfBath', 'EnclosedPorch', 'ScreenPorch']

for col in cols_to_binary_only_0:
    d_test[f'Has{col}'] = (d_test[col] > 0).astype(int)
    d_test.drop(columns=[col], inplace=True)

# = 1 or not

d_test['HasKitchen'] = (d_test['KitchenAbvGr'] == 1).astype(int)
d_test.drop(columns=['KitchenAbvGr'], inplace=True)


#case 2: binary+log

cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']  # Replace with actual column names

for col in cols:
    d_test[f'Has{col}'] = (d_test[col] > 0).astype(int)
    d_test[f'{col}_log'] = np.log1p(d_test[col])

#case 3: just drop 
d_test.drop(columns=['LowQualFinSF','3SsnPorch', 'PoolArea', 'MiscVal'], inplace=True)

# case 4: log only

# List of variables to transform
vars_to_log = ['LotFrontage', 'LotArea', 'TotalBsmtSF', 'GrLivArea',  'BsmtUnfSF', '1stFlrSF']


# Create log-transformed versions with "_log" suffix
for col in vars_to_log:
    d_test[col + '_log'] = np.log1p(d_test[col])  # log1p handles zero safely

# Drop the original columns
d_test.drop(columns=vars_to_log, inplace=True)

## 2.7 Cap outliers

In [48]:
num_features = [col for col in num_features if col != "SalePrice_log"]

for col in num_features:
    q_low, q_high = caps[col]
    d_test[col] = d_test[col].clip(lower=q_low, upper=q_high)


## 2.8 Categorical variable (merge rare categories into the most frequent in train)

In [49]:
def apply_cat_mapping_to_test(df_test, mappings):
    df_test = df_test.copy()

    for col, info in mappings.items():
        df_test[col] = df_test[col].fillna("Missing")

        # Replace rare categories with 'Other'
        df_test[col] = df_test[col].apply(lambda x: 'Other' if x in info['group_rare'] else x)

        # Merge 'Other' into target if needed
        if info['final_merge_target'] is not None:
            df_test[col] = df_test[col].replace('Other', info['final_merge_target'])

    return df_test
# Apply same mappings to test
d_test = apply_cat_mapping_to_test(d_test, cat_mappings)



# Step 2: Drop same columns from test set
d_test = d_test.drop(columns=cols_dropped_one_cat, errors='ignore')

# 1.9 predict

In [64]:
# One-hot encode test set
X_test = pd.get_dummies(d_test, drop_first=True)
X_test = d_test.drop(columns=['Id'], errors='ignore') 
# Align with training columns (very important!)
X_test_aligned = X_test.reindex(columns=X_train.columns, fill_value=0)


y_pred_log = model.predict(X_test_aligned)
y_pred = np.expm1(y_pred_log)  # reverse np.log1p()
d_test['SalePrice'] = y_pred
d_test[['Id', 'SalePrice']].to_csv("predictions.csv", index=False)

### 1.9.1 Prediction with feature selection

In [50]:
# One-hot encode test set
X_test = pd.get_dummies(d_test, drop_first=True)
X_test = d_test.drop(columns=['Id'], errors='ignore') 
# Align with training columns (very important!)
X_test_aligned = X_test.reindex(columns=X_train.columns, fill_value=0)

X_test_sel = selector.transform(X_test_aligned)

y_pred_log = ridge_cv.predict(X_test_sel)
y_pred = np.expm1(y_pred_log)  # reverse np.log1p()
d_test['SalePrice'] = y_pred
d_test[['Id', 'SalePrice']].to_csv("predictions.csv", index=False)



In [51]:
import os
os.getcwd()

'C:\\Users\\berra\\House Prices'