# Setup

In [1]:
from snowflake.snowpark import Session
import modin.pandas as pd
import snowflake.snowpark.modin.plugin
import numpy as np

# Obtain a Snowpark session via the builder's getOrCreate().
# NOTE(review): `session` is not referenced again in the cells visible here;
# presumably the Snowpark pandas plugin uses the active session implicitly —
# confirm before removing or renaming it.
session = Session.builder.getOrCreate()

# Visualizing and Predicting House Prices with Plotly and scikit-learn

In [2]:
# Build a small synthetic housing dataset. Seeding NumPy's global RNG first
# makes the draws below repeatable on a fresh kernel.
np.random.seed(42)

# (column name, inclusive low, exclusive high) for each integer column.
# The order matters: every draw advances the seeded global RNG, so the columns
# must be generated in exactly this sequence to reproduce the same values.
# Note that 'price' is drawn independently of the two feature columns.
column_ranges = (
    ('square_feet', 500, 1000),
    ('num_rooms', 2, 5),
    ('price', 100000, 500000),
)

data = {
    name: np.random.randint(low, high, size=100)
    for name, low, high in column_ranges
}

df = pd.DataFrame(data)


In [3]:
import plotly.express as px
# Scatter of price against square footage; the room count drives both the
# marker colour and the marker size.
axis_labels = {
    'square_feet': 'Square Footage',
    'price': 'Price ($)',
    'num_rooms': 'Number of Rooms',
}
fig = px.scatter(
    df,
    x='square_feet',
    y='price',
    color='num_rooms',
    size='num_rooms',
    title="House Price vs Square Feet",
    labels=axis_labels,
)
fig.show()

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Features are the two descriptive columns; the target is the price column.
feature_columns = ['square_feet', 'num_rooms']
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with default settings, fitted on the
# training split only. The fit call is left as the cell's last expression so
# the notebook still displays its result.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [7]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is the square root of the MSE, so it is expressed in the same units as
# the target (dollars).
# NOTE: the synthetic prices are drawn independently of the features, so a
# large error relative to the 100000-500000 price range is expected here.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Root Mean Squared Error (RMSE): 114876.69


In [8]:
# Pair each held-out true price with the model's prediction for that row.
results_df = pd.DataFrame({
    'True Price': y_test,
    'Predicted Price': y_pred
})

# Scatter of predicted against true price.
fig2 = px.scatter(results_df, x='True Price', y='Predicted Price',
                  title="True Price vs Predicted Price",
                  labels={'True Price': 'True Price ($)', 'Predicted Price': 'Predicted Price ($)'},
                 )

# Extent of the true prices, computed once and reused for both endpoints of
# the reference line below instead of re-evaluating min()/max() per coordinate.
# NOTE(review): `pd` is modin.pandas here, so each aggregation presumably runs
# as a separate backend query — confirm; either way two calls suffice.
true_price = results_df['True Price']
price_lo = true_price.min()
price_hi = true_price.max()

# Dashed red y = x line: a point lying on it is a perfect prediction.
fig2.add_shape(
    type='line',
    x0=price_lo, x1=price_hi,
    y0=price_lo, y1=price_hi,
    line=dict(color='Red', dash='dash')
)

fig2.show()

The current operation leads to materialization and can be slow if the data is large!
