<a href="https://colab.research.google.com/github/samirasonfack/Kaggle-Competition-for-House-Pricing/blob/main/projet_ML_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless clean-up step that also carries the ordinal-feature metadata.

    ``transform`` strips surrounding whitespace from the text values of
    object-dtype columns and passes every other column through
    ``pd.to_numeric(..., errors='coerce')``.  The instance additionally
    exposes ``ordinal_features`` (the ordinal column names) and
    ``ordinal_mappings`` (the category order of each of those columns).
    """

    def __init__(self):
        # Placeholder attribute: nothing in this class ever assigns it.
        self.ordinal_encoder = None

        # Columns whose categories have a meaningful order
        self.ordinal_features = [
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2',
            'BsmtExposure', 'KitchenQual', 'HeatingQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ]

        # Category order per ordinal column.  When handed to an
        # OrdinalEncoder, a category's position in its list is its code.
        self.ordinal_mappings = {
            'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
            'BsmtFinType1': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'BsmtFinType2': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'FireplaceQu': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'GarageQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'GarageCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex']
        }

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    @staticmethod
    def _strip_text(value):
        """Return ``value`` stripped if it is a ``str``, otherwise unchanged."""
        return value.strip() if isinstance(value, str) else value

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to clean.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which text values of object columns are
            whitespace-stripped and all other columns are coerced with
            ``pd.to_numeric(errors='coerce')``.
        """
        X_transformed = X.copy()

        # Strip text element-wise.  The ``.str`` accessor would replace
        # non-string values with NaN (and raise on a column that holds no
        # strings at all), so only real ``str`` values are touched.  The
        # object dtype is kept so dtype-based column selection downstream
        # sees the same set of columns.
        for col in X_transformed.select_dtypes(include=['object']).columns:
            X_transformed[col] = X_transformed[col].map(self._strip_text).astype(object)

        # Everything that is not object dtype is coerced to numeric;
        # values that cannot be converted become NaN.
        for col in X_transformed.select_dtypes(exclude=['object']).columns:
            X_transformed[col] = pd.to_numeric(X_transformed[col], errors='coerce')

        return X_transformed


# Define preprocessing pipeline for the dataset
def preprocess_pipeline(df, feature_transformer):
    """Build the ``ColumnTransformer`` that encodes and imputes the columns of ``df``.

    Column groups are derived from ``df`` alone, so the function no longer
    depends on a notebook-level ``X_train`` variable:

    * ordinal  -- columns of ``df`` that have an entry in
      ``feature_transformer.ordinal_mappings``; ordinal-encoded with that
      category order, unknown values encoded as ``-1``.
    * nominal  -- the remaining object-dtype columns; one-hot encoded.
    * numeric  -- number-dtype columns except ``SalePrice`` and ``Id``;
      missing values replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes decide the grouping.
    feature_transformer : object
        Provides ``ordinal_mappings``: a dict mapping an ordinal column
        name to its ordered list of categories.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted preprocessor.  No ``remainder`` is passed, so the
        ColumnTransformer default applies to columns in none of the groups.
    """
    ordinal_mappings = feature_transformer.ordinal_mappings

    ordinal_features = [col for col in df.columns if col in ordinal_mappings]
    nominal_features = [
        col for col in df.select_dtypes(include=['object']).columns
        if col not in ordinal_mappings
    ]
    numerical_features = [
        col for col in df.select_dtypes(include=['number']).columns
        if col not in ('SalePrice', 'Id')
    ]

    # One category list per ordinal column, in exactly the order of
    # ``ordinal_features``: OrdinalEncoder matches the two positionally.
    ordinal_categories = [ordinal_mappings[col] for col in ordinal_features]

    preprocessor = ColumnTransformer(
        transformers=[
            ('ordinal', Pipeline([
                ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories,
                                                   handle_unknown='use_encoded_value',
                                                   unknown_value=-1))
            ]), ordinal_features),

            ('nominal', Pipeline([
                ('onehot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))
            ]), nominal_features),

            ('numeric', Pipeline([
                ('imputer', SimpleImputer(strategy='median'))
            ]), numerical_features)
        ]
    )

    return preprocessor


In [None]:
def clean(df):
    """Return a cleaned copy of ``df``; the caller's frame is left untouched.

    Steps:

    1. Object-dtype columns are converted with ``astype(str)`` and
       whitespace-stripped; every other column goes through
       ``pd.to_numeric(errors='coerce')``.
    2. Every column that is not listed as ordinal or nominal below (and is
       not ``Id`` / ``SalePrice``) is treated as numeric: the text ``'NA'``
       is replaced by NaN and the column is coerced to numeric, so values
       that cannot be parsed become NaN.

    NOTE(review): ``astype(str)`` stringifies missing entries as well, so a
    NaN in an object column is no longer NaN afterwards -- confirm that
    this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned columns.
    """
    # Work on a copy so the input frame is not modified in place.
    cleaned = df.copy()

    for col in cleaned.columns:
        if cleaned[col].dtype == 'object':
            cleaned[col] = cleaned[col].astype(str).str.strip()
        else:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    ordinal_features = [
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2',
        'BsmtExposure', 'KitchenQual', 'HeatingQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ]

    # Nominal features (categorical, no order)
    nominal_features = [
        'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
        'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
        'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
    ]

    # Numeric columns are defined by exclusion: anything that is neither
    # ordinal, nominal, nor one of the Id / SalePrice columns.
    non_numeric = set(ordinal_features) | set(nominal_features) | {'Id', 'SalePrice'}
    numeric_columns = [col for col in cleaned.columns if col not in non_numeric]

    for col in numeric_columns:
        cleaned[col] = pd.to_numeric(cleaned[col].replace('NA', np.nan), errors='coerce')

    return cleaned



In [None]:
from sklearn.model_selection import GridSearchCV

# Load the data
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Column headers may carry stray whitespace; normalise them on both frames
for frame in (train, test):
    frame.columns = frame.columns.str.strip()

# Split features from the target
target = "SalePrice"
id_col = "Id"
X_train = train.drop(columns=['Id', 'SalePrice'])
y_train = np.log(train[target])  # the model is trained on the log of the target
X_test = test.drop(columns=[id_col])




In [None]:

# Run the same cleaning routine over the training and the test features
X_train, X_test = clean(X_train), clean(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

# Normalise both frames first: the preprocessor groups columns by dtype
feature_transformer = FeatureTransformer()
X_train, X_test = (feature_transformer.transform(frame) for frame in (X_train, X_test))

preprocessor = preprocess_pipeline(X_train, feature_transformer)

# Preprocessing and regressor in one pipeline, so each CV fold fits the
# encoders and imputer on its own training part only
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Hyper-parameter candidates; the grid's n_estimators replaces the value
# passed to the constructor above
param_grid = dict(
    random_forest__n_estimators=[500],
    random_forest__max_depth=[10, 20],
    random_forest__min_samples_split=[2, 5],
    random_forest__max_features=['sqrt'],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
).fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# y_train is the log of SalePrice, so predictions are mapped back with exp
y_pred = np.exp(best_model.predict(X_test))

Best Parameters: {'random_forest__max_depth': 20, 'random_forest__max_features': 'sqrt', 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 500}




In [None]:
# Write the submission file: the test ids next to their predicted prices
submission = test[[id_col]].assign(**{target: y_pred})
submission.to_csv("submission.csv", index=False)

print("✅ Préprocessing + entraînement Random Forest avec GridSearchCV terminé. Fichier 'submission.csv' généré !")

# Score the fitted model on the data it was trained on (log scale, like y_train)
y_train_pred = best_model.predict(X_train)

# Root mean squared error
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"RMSE on training data: {rmse:.4f}")

# Coefficient of determination
r2 = r2_score(y_train, y_train_pred)
print(f"R² on training data: {r2:.4f}")

✅ Préprocessing + entraînement Random Forest avec GridSearchCV terminé. Fichier 'submission.csv' généré !
RMSE on training data: 0.0526
R² on training data: 0.9827
