<a href="https://colab.research.google.com/github/rsanchezgarc/AI-ML-analytics-IE/blob/main/notebooks/2_ML_Refresher/house_price_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The first cell automatically downloads the data from Kaggle. It requires configuring Kaggle API credentials, so you may prefer to skip it and download the data manually.

In [3]:
import os

os.environ["KAGGLE_USERNAME"] = "rsancg00"
os.environ["KAGGLE_KEY"] = "KGAT_39b4b30d8723b7a880ab8d31c3f836f6"

!kaggle competitions download -c house-prices-advanced-regression-techniques
!unzip house-prices-advanced-regression-techniques.zip

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 280MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

KeyboardInterrupt: 

In [None]:
# Locations of the files extracted from the Kaggle archive
train_path, test_path = 'train.csv', 'test.csv'
data_description_path, sample_submission_path = 'data_description.txt', 'sample_submission.csv'

# Read the training table into memory
data = pd.read_csv(filepath_or_buffer=train_path)

# Preview the top rows (rendered as the cell output)
data.head(n=5)


In [None]:
# Count missing values once per column (in the frame's own column order);
# the sorted view is only used for reporting.
n_missing_per_col = data.isnull().sum()
missing_values = n_missing_per_col.sort_values(ascending=False)
print("Missing values per column:")
print(missing_values[missing_values > 0])

# Visualize missing values
plt.figure(figsize=(12, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

print("N elements ", len(data))
# Display summary statistics
print(data.describe(include='all').T)

# Keep only the columns with fewer than MAX_MISSING missing values.
# The selection is done by column label on the *unsorted* counts: positions in
# the sorted `missing_values` series do not line up with `data.columns`, so
# indexing `data.columns` by those positions would drop the wrong columns.
MAX_MISSING = 400
good_colum_names = n_missing_per_col.index[n_missing_per_col < MAX_MISSING].tolist()
# .copy() makes `data` an independent frame, so the in-place column assignments
# done by later cells do not operate on a slice of the original frame.
data = data[good_colum_names].copy()

In [None]:
# Split the column names by dtype: numeric vs. categorical (object)
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Median for numeric columns, most frequent value for categorical ones
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its group of columns and write the filled values back
num_filled = num_imputer.fit_transform(data[num_cols])
data[num_cols] = num_filled
cat_filled = cat_imputer.fit_transform(data[cat_cols])
data[cat_cols] = cat_filled

# Sanity check: total number of missing cells left in the frame
print("Remaining missing values per column:")
total_missing = data.isnull().sum().sum()
print(total_missing)


In [None]:
# One LabelEncoder per categorical column, kept so the same mapping can be
# re-applied to other data later on
label_encoders = {col: LabelEncoder() for col in cat_cols}

# Fit each encoder on its column and replace the strings by integer codes
for col, encoder in label_encoders.items():
    data[col] = encoder.fit_transform(data[col])

# Preview the encoded table
data.head()

In [None]:
# Name of the column to predict
target = 'SalePrice'

# The target is the label vector; every other column is a feature
y = data[target]
X = data.drop(columns=[target])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition
for label, features in (("Training data shape:", X_train), ("Validation data shape:", X_val)):
    print(label, features.shape)


There is something important that I am not doing here: feature scaling. I skip it because random forests can handle features on different scales, but for models such as linear models or deep learning models, feature scaling (a.k.a. input normalization) is critical.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and fit it on the training split
# (fit returns the estimator itself, so construction and training are chained)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out validation rows
y_pred = rf.predict(X_val)

# Score the predictions against the true validation targets
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Scatter of true vs. predicted values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_val, y_pred, alpha=0.7)
ax.set_xlabel("True Values")
ax.set_ylabel("Predicted Values")
ax.set_title("True vs Predicted Values")
plt.show()


In [None]:
# Pair every feature name with the importance the fitted forest assigned to it,
# then order the table from most to least important
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

# Text listing of the ranking
print("Feature Importances:")
print(feature_importances)

# Horizontal bar chart of the same ranking
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_importances['Feature'], feature_importances['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
ax.invert_yaxis()  # first row of the table is drawn at the top
plt.show()


In [None]:
"""Id,SalePrice
1461,169000.1
1462,187724.1233
1463,175221
etc.
"""

def predict_test_set(model, output_path='submission.csv'):
    """Preprocess the test set like the training data, predict, and write a submission CSV.

    The written file has two columns, in the format::

        Id,SalePrice
        1461,169000.1
        1462,187724.1233

    The function relies on notebook-level objects created by earlier cells:
    `test_path`, `target`, `good_colum_names`, `num_cols`, `cat_cols`,
    `num_imputer`, `cat_imputer` and `label_encoders`.

    Args:
        model: fitted estimator exposing `predict`; it receives the columns in
            `good_colum_names` minus the target column.
        output_path: path of the CSV file to write (defaults to 'submission.csv').

    Returns:
        The submission DataFrame with columns 'Id' and 'SalePrice'.

    Raises:
        ValueError: raised by `LabelEncoder.transform` if a categorical column
            of the test set contains a value that was not seen when fitting.
    """
    test_data = pd.read_csv(test_path)
    ids = test_data["Id"]
    # Placeholder target column: the imputers were fitted on frames that
    # included the target, so they expect it to be present at transform time.
    test_data[target] = 0
    # Same column subset as in training; .copy() so the assignments below
    # modify an independent frame rather than a slice of the raw test data.
    test_data = test_data[good_colum_names].copy()

    # Re-apply the already-fitted imputers and encoders (no re-fitting on test data)
    test_data[num_cols] = num_imputer.transform(test_data[num_cols])
    test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])
    for col in cat_cols:
        test_data[col] = label_encoders[col].transform(test_data[col])
    # The placeholder target has served its purpose; the model must not see it
    test_data = test_data.drop(columns=[target])

    # Make predictions on the test data
    test_predictions = model.predict(test_data)
    # Create a submission DataFrame
    submission_df = pd.DataFrame({'Id': ids.astype(np.int64), 'SalePrice': test_predictions})

    # Save the submission DataFrame to a csv file
    submission_df.to_csv(output_path, index=False)

    # Display the first few rows of the submission file
    print(submission_df.head().to_markdown(index=False, numalign="left", stralign="left"))
    return submission_df

# Assign the result so the returned frame is not echoed a second time as cell output
submission_df = predict_test_set(rf)

Since some people asked, I also added an example of anomaly/outlier detection using an unsupervised method, the isolation forest. In many cases, it makes sense to remove outliers before training a supervised model. It is much better if the outlier-removal method and the supervised method are based on different principles.

In [None]:
#Outlier detection with isolation forest

from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd

# Features and target of the training split are scored together, so the target
# value also contributes to the outlier decision.
# A dedicated name is used instead of reusing `data`, so the full preprocessed
# dataset built by the earlier cells is not silently replaced by the training
# split (which would break re-running those cells).
train_with_target = pd.concat([X_train, y_train], axis=1)

# Isolation Forest flagging 5% of the rows as outliers (contamination=0.05)
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit on the training rows and label them in one step
# (-1 indicates outliers, 1 indicates inliers)
predictions = iso_forest.fit_predict(train_with_target)

# Boolean mask of the rows to keep
inliers = predictions == 1

# Filter out the outliers from both X_train and y_train
X_train_filtered = X_train[inliers]
y_train_filtered = y_train[inliers]
print(len(y_train_filtered), len(y_train))


In [None]:
#Outlier detection with kNN

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Features and target of the training split are scored together.
# A dedicated name is used instead of reusing `data`, so the full preprocessed
# dataset built by the earlier cells is not silently overwritten.
train_with_target = pd.concat([X_train, y_train], axis=1)

# kNN is distance based: without scaling, columns with large magnitudes would
# dominate the Euclidean distance. Standardize every column first.
train_scaled = StandardScaler().fit_transform(train_with_target)

# Initialize the kNN model (you can adjust the number of neighbors)
knn = NearestNeighbors(n_neighbors=5)  # Using 5 nearest neighbors, for example

# Fit the model to the scaled data
knn.fit(train_scaled)

# Calling kneighbors() with no argument returns the neighbors of the fitted
# points themselves, and a point is then not counted as its own neighbor.
# Passing the training matrix explicitly would return each point as its own
# nearest neighbor at distance 0, leaving only 4 real neighbors in the mean.
distances, indices = knn.kneighbors()

# Calculate the mean distance to the nearest neighbors for each data point
mean_distances = distances.mean(axis=1)

# Points in the top 5% of mean distances are considered outliers
threshold = np.percentile(mean_distances, 95)

# Identify outliers: if mean distance is greater than threshold, it's an outlier
outliers = mean_distances > threshold

# Filter out the outliers from both X_train and y_train
X_train_filtered = X_train[~outliers]
y_train_filtered = y_train[~outliers]
print(len(y_train_filtered), len(y_train))

You can now check whether training on X_train_filtered is a good idea or not.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same forest configuration as before, this time fitted on the outlier-filtered
# training rows (fit returns the estimator itself)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_filtered, y_train_filtered
)

# Predict the (unfiltered) validation rows
y_pred = rf.predict(X_val)

# Score against the true validation targets
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Scatter of true vs. predicted values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_val, y_pred, alpha=0.7)
ax.set(xlabel="True Values", ylabel="Predicted Values", title="True vs Predicted Values")
plt.show()


This is an example of how to do automatic hyperparameter tuning

In [None]:
!pip install optuna

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor  # Use RandomForestClassifier if it's a classification problem
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_regression  # not used in this cell
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hyperparameters held fixed during the search. They are NOT part of
# `study.best_params` (which only contains the values suggested by the trial),
# so they are kept here and passed explicitly to both the search models and the
# final model. Otherwise the final model would fall back to the library
# default number of trees instead of the 200 used while tuning.
FIXED_RF_PARAMS = {'n_estimators': 200, 'random_state': 42}


def objective(trial):
    """Optuna objective: 5-fold cross-validated MSE of a random forest on the training split.

    Args:
        trial: the Optuna trial used to sample `max_depth`, `min_samples_leaf`
            and `max_features`.

    Returns:
        The mean squared error averaged over the 5 folds (lower is better).
    """
    # Hyperparameter search space
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 2, 50),  # Maximum depth of trees
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),  # Minimum samples at a leaf node
        'max_features': trial.suggest_categorical('max_features', [1.0, 'sqrt', 'log2']),  # Maximum number of features to consider
    }

    # Random forest with the fixed settings plus the suggested hyperparameters
    model = RandomForestRegressor(**FIXED_RF_PARAMS, **param_grid)

    # scikit-learn reports the *negative* MSE for each fold (higher is better)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(score)
    # Flip the sign back so the study minimizes the mean MSE across folds
    return -score.mean()

# Create a study that minimizes the cross-validated MSE. The sampler is seeded
# so that re-running the notebook explores the same sequence of trials.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)  # You can adjust n_trials based on your computational budget

# Output the best hyperparameters found
print(f"Best hyperparameters: {study.best_params}")

# Train the final model with the best suggested values AND the same fixed
# settings that were used during the search
best_params = study.best_params
best_rf_model = RandomForestRegressor(**FIXED_RF_PARAMS, **best_params)
best_rf_model.fit(X_train, y_train)

# Make predictions
y_pred = best_rf_model.predict(X_val)

# Evaluate performance
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

