<a href="https://colab.research.google.com/github/sainath5466/asdf/blob/main/notebooks/MLP%20regression%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq yellowbrick

In [2]:
# some builtin imports
import re
import warnings
from collections import Counter
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Some usual imports here
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl


# sklearn models
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from sklearn import metrics, model_selection

# visualizations
import shap
from yellowbrick.regressor import residuals_plot, prediction_error

In [3]:
## Customize Matplotlib Parameters
%matplotlib inline
mpl.rcParams['figure.dpi']= 120
mpl.rcParams['figure.edgecolor']= 'black'
mpl.rcParams['axes.linewidth']= .5
# Customize Seaborn Parameters
sns.set()
rc = {
      'font.family': ['serif'],
      'font.serif':'Times New Roman',
      'grid.color': 'gainsboro',
      'grid.linestyle': '-',
}
sns.set_style(rc=rc)
sns.set_context("notebook", font_scale=0.8)

# Load dataset

In [4]:
df = pd.read_csv('house_rentals.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4957 entries, 0 to 4956
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            4957 non-null   object 
 1   listing_type   4957 non-null   object 
 2   name           4957 non-null   object 
 3   price          4957 non-null   int64  
 4   category       4957 non-null   object 
 5   bedrooms       4957 non-null   float64
 6   bathrooms      4957 non-null   float64
 7   floor_area     4957 non-null   float64
 8   location       4957 non-null   object 
 9   condition      4957 non-null   object 
 10  amenities      4957 non-null   object 
 11  region         4957 non-null   object 
 12  locality       4957 non-null   object 
 13  parking_space  4956 non-null   object 
 14  is_furnished   4956 non-null   object 
 15  lat            4956 non-null   float64
 16  lng            4956 non-null   float64
dtypes: float64(5), int64(1), object(11)
memory usage: 65

# Multilayer Perceptron Regressor

In [6]:
def compute_metrics(model, x, y, cv=5):
    preds = model.predict(x)
    score = model.score(x, y)

    ms = {
        "R2": round(score, 3),
        "mse": round(metrics.mean_squared_error(y, preds), 3),
        "rmse": round(np.sqrt(metrics.mean_squared_error(y, preds)), 3),
        "mae": round(metrics.mean_absolute_error(y, preds), 3),
        "adjusted_r2": round(1 - (1 - score) * (len(y) - 1) / (len(y) - x.shape[1] - 1), 3),
    }
    if cv is not None:
        scores_cvs = model_selection.cross_val_score(model, x, y, scoring='r2', cv=cv)
        ms['cv_score'] = round(scores_cvs.mean()*100, 2)

    return pd.DataFrame([ms])

In [8]:
import numpy as np
import pandas as pd
from sklearn import model_selection

# ... (rest of your code)

# train-test split for model evaluation
if 'log1p_price' not in df.columns:  # Check if 'log1p_price' column exists
    df['log1p_price'] = np.log1p(df['price'])  # Create 'log1p_price' column

X = df.drop(['price', 'log1p_price'], axis=1).values
y = df['log1p_price'].values

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)

# ... (rest of your code)

In [9]:
X_train.shape, X_test.shape

((3965, 16), (992, 16))

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming df is your DataFrame

# 1. Identify numerical and categorical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.drop(['price', 'log1p_price'])
categorical_features = df.select_dtypes(include=['object']).columns

# 2. Function to handle URLs in categorical features
def handle_urls(df, categorical_features):
    for feature in categorical_features:
        # Check if the feature contains URLs and is of type object (string)
        if df[feature].dtype == 'object':
            # Convert the column to string type before applying .str methods
            df[feature] = df[feature].astype(str)

            # Replace URLs with a placeholder or extract relevant information
            df[feature] = df[feature].str.replace('http\S+', 'URL_PLACEHOLDER', regex=True)
    return df

# 3. Handle URLs in your DataFrame
df = handle_urls(df, categorical_features)

# 4. Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ])

# 5. Create a pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(random_state=17, max_iter=500)),
])

# 6. Split data into training and testing sets
X = df.drop(['price', 'log1p_price'], axis=1)
y = df['log1p_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

# 7. Fit the pipeline
pipeline.fit(X_train, y_train)

# 8. Make predictions
predictions = pipeline.predict(X_test)

ValueError: Input X contains NaN.
MLPRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Training metrics


In [None]:
train_metrics = compute_metrics(model, X_train, y_train)
train_metrics

### Validation metrics


In [None]:
val_metrics = compute_metrics(model, X_test, y_test)
val_metrics

### Residual plots

In [None]:
visualizer = residuals_plot(
    model, X_train, y_train, X_test, y_test,
)

In [None]:
visualizer = residuals_plot(
    model, X_train, y_train, X_test, y_test, hist=False, qqplot=True
)

The QQ-Plot shows the residuals are normally distributed, because their quantiles when plotted against quantiles of normal distribution forms a straight line.

### Prediction Error

In [None]:
visualizer = prediction_error(model, X_train, y_train, X_test, y_test)

## Actual Predictions

In [None]:
preds = model.predict(X_test)

In [None]:
# Reverse the log transformation
actual_preds = np.expm1(preds)
actual_y_test = np.expm1(y_test)

# Compute metrics on the original scale
def compute_metrics_original_scale(y_true, y_pred):
    return pd.DataFrame([{
        "R2": round(metrics.r2_score(y_true, y_pred), 3),
        "mse": round(metrics.mean_squared_error(y_true, y_pred), 3),
        "rmse": round(np.sqrt(metrics.mean_squared_error(y_true, y_pred)), 3),
        "mae": round(metrics.mean_absolute_error(y_true, y_pred), 3),
    }])

val_metrics_original_scale = compute_metrics_original_scale(actual_y_test, actual_preds)
val_metrics_original_scale

In [None]:
pred_df = pd.DataFrame({"actual": actual_y_test, "pred": actual_preds})
pred_df.head(10)

In [None]:
plt.scatter(pred_df.actual, pred_df.pred, c='crimson')
p1 = max(max(pred_df.actual), max(pred_df.pred))
p2 = min(min(pred_df.actual), min(pred_df.pred))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('Actual Values', fontsize=15)
plt.ylabel('Predicted Values', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
sns.distplot((pred_df.actual-pred_df.pred))
plt.title('Distribution of residuals')
plt.xlabel("Residuals")
plt.show()