In [None]:
!wget https://github.com/mrdbourke/zero-to-mastery-ml/raw/master/data/bluebook-for-bulldozers.zip # download files from GitHub as zip

import os
import zipfile

local_zip = 'bluebook-for-bulldozers.zip'

# Open the archive with a context manager so the file handle is closed even
# if extraction raises part-way through.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
  zip_ref.extractall('.') # extract all data into current working directory

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("/content/bluebook-for-bulldozers/TrainAndValid.csv")

In [None]:
df.head(3)

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.columns

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df["saledate"][:1000], df["SalePrice"][:1000])

In [None]:
df["saledate"][:1000]

In [None]:
df["SalePrice"].plot.hist()

## Since we are dealing with a time series dataset, we should organize the dates as much as possible. We can do that by telling pandas which of our columns contain dates, using the `parse_dates` parameter.


In [None]:
#Import data again but this time parse dates
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which can otherwise leave mixed types in a column.

df = pd.read_csv("/content/bluebook-for-bulldozers/TrainAndValid.csv",
                 low_memory=False,
                 parse_dates=["saledate"])

In [None]:
df.saledate

In [None]:
fig, ax = plt.subplots()
ax.scatter(df["saledate"][:1000], df["SalePrice"][:1000])

#Now the index is properly organized according to Year on X axis

In [None]:
df.head().T

In [None]:
df.head().T

In [None]:
df.saledate.head(20)

## When working with time series data, it's a good practice to sort the data by date

In [None]:
df.sort_values(by=["saledate"], inplace=True, ascending=True)

In [None]:
df.saledate.head(20)

In [None]:
df.head()

In [None]:
df_tmp = df.copy()

## Add datetime parameters for `saledate` column

In [None]:
# Break the parsed sale date into separate calendar features.
sale_dt = df_tmp["saledate"].dt
date_parts = {
  "saleYear": sale_dt.year,
  "saleMonth": sale_dt.month,
  "saleDay": sale_dt.day,
  "saleDayOfWeek": sale_dt.dayofweek,
  "saleDayOfYear": sale_dt.dayofyear,
}

# Assign in dict order so the new columns are appended in the same order.
for column_name, values in date_parts.items():
  df_tmp[column_name] = values

In [None]:
df_tmp.head().T

In [None]:
df_tmp.drop(columns=["saledate"], inplace=True)

In [None]:
df_tmp.state.value_counts()

In [None]:
df_tmp.head().T

## Now let's do some model-driven EDA


---



##- Handling Missing Values/Duplicates
##- Feature Transformation
##- Feature Encoding
##- Converting Categorical cols to Numerical
##- Feature Scaling (if required)

In [None]:
df_tmp.info()

In [None]:
# Pick out the columns stored with the generic 'object' dtype and remember
# their names.
df_categorical_subset = df_tmp.select_dtypes(include=['object'])
categorical_column_names = df_categorical_subset.columns

# Re-type each of those columns as a pandas 'category', one column at a time,
# writing the result back into df_tmp.
for column_name in categorical_column_names:
  df_tmp[column_name] = df_tmp[column_name].astype('category')

In [None]:
df_tmp.info()

- Once data is stored using the pandas category data type, we have a way to access that data in its underlying numeric format.

- The category dtype is designed specifically to optimize storage by mapping string labels to a compact, integer-based representation internally.

- You can access these underlying integer values using the .cat.codes attribute of the pandas Series:

In [None]:
df_tmp.state.cat.codes

In [None]:
##Save preprocessed data

df_tmp.to_csv("preprocessed_data.csv", index=False)



---



---



---



In [None]:
#Identify and deal with missing values

df_tmp.isnull().sum()/len(df_tmp)

In [None]:
df_tmp.isnull().sum()

In [None]:
##Fill numeric missing values first

for label, content in df_tmp.items():
  if pd.api.types.is_numeric_dtype(content):
    print(label)

In [None]:
# Check for which numeric cols have null values

for label, content in df_tmp.items():
  if pd.api.types.is_numeric_dtype(content):
    if pd.isnull(content).sum():
      print(label)

In [None]:
# Fill them with median

# First collect the numeric columns that actually contain missing values...
numeric_with_gaps = [
  label
  for label, content in df_tmp.items()
  if pd.api.types.is_numeric_dtype(content) and content.isnull().any()
]

# ...then flag and fill each one.
for label in numeric_with_gaps:
  column = df_tmp[label]

  #Add a binary column which tells us if the data was missing
  df_tmp[label+"_is_missing"] = column.isnull()

  df_tmp[label] = column.fillna(column.median())

In [None]:
#Check to see how many examples were missing

df_tmp.auctioneerID_is_missing.value_counts()

In [None]:
df_tmp.isnull().sum()



---



---

#Filling and turning categorical variables into numbers

In [None]:
for label, content in df_tmp.items():
  if not pd.api.types.is_numeric_dtype(content):
    print(label)

In [None]:
 for label, content in df_tmp.items():
   # Only non-numeric columns are encoded here
   if pd.api.types.is_numeric_dtype(content):
     continue

   #Add a binary column which tells us if the sample had a missing value
   df_tmp[label+"_is_missing"] = content.isnull()

   #Turn categories into numbers and add +1 since pandas assigns -1 to values that are missing
   df_tmp[label] = pd.Categorical(content).codes+1

In [None]:
df_tmp.info()

In [None]:
df_tmp.head().T[:50]

In [None]:
df_tmp.isnull().sum()

In [None]:
df_tmp["UsageBand"].value_counts()



---


---



---




#Model Building

In [None]:
len(df_tmp)

In [None]:
# X = df_tmp.drop(columns=["SalePrice"],axis =1)
# y = df_tmp["SalePrice"]

In [None]:
# %%time

# from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor(n_jobs=-1, random_state=42)

# model.fit(X, y)

In [None]:
# model.score(X,y) #misleading: this scores the model on the same data it was trained on, so it rewards memorization (overfitting) rather than generalization

##Splitting the data into training and validation sets

In [None]:
#this is how we usually do it in time series data
# Hold out the most recent sales (2012 onwards) for validation and train only
# on sales that happened strictly before them. Using `< 2012` rather than
# `!= 2012` guarantees the two sets can never overlap, even if the data
# contains sales after 2012.

df_val = df_tmp[df_tmp['saleYear'] >= 2012]
df_train = df_tmp[df_tmp['saleYear'] < 2012]

len(df_train), len(df_val)

In [None]:
#Split data into X and y
# Features are every column except the target; the target is `SalePrice`.

X_train = df_train.drop(columns=["SalePrice"])
y_train = df_train["SalePrice"]

X_val = df_val.drop(columns=["SalePrice"])
y_val = df_val["SalePrice"]

## Creating our own Evaluation Function

In [None]:
from sklearn.metrics import root_mean_squared_log_error,mean_squared_log_error, mean_absolute_error,r2_score

def show_scores(model):
  """
  Score a fitted model on the notebook's training and validation splits.

  Predicts on the notebook-level `X_train` and `X_val`, compares the
  predictions with `y_train` and `y_val`, and returns a dict holding the
  MAE, RMSLE and R2 score for each split.
  """
  train_preds, val_preds = (model.predict(features) for features in (X_train, X_val))

  scores = {}
  scores["Training MAE"] = mean_absolute_error(y_train, train_preds)
  scores["Validation MAE"] = mean_absolute_error(y_val, val_preds)
  scores["Training RMSLE"] = root_mean_squared_log_error(y_train, train_preds)
  scores["Validation RMSLE"] = root_mean_squared_log_error(y_val, val_preds)
  scores["Training R2 Score"] = r2_score(y_train, train_preds)
  scores["Validation R2 Score"] = r2_score(y_val, val_preds)
  return scores

## Testing our model on a subset (to tune the hyperparameters)

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=-1, random_state=42, max_samples=10000)

model.fit(X_train, y_train)


In [None]:
show_scores(model)

##Hyperparameter Tuning with RandomizedSearchCV

In [None]:
%%time

from sklearn.model_selection import RandomizedSearchCV

#Different RandomForestRegressor hyperparameters

rf_grid = {
    "n_estimators": np.arange(10, 100, 10),
    "max_depth": [None,3,5,10],
    "min_samples_split": np.arange(2,20,2),
    "min_samples_leaf": np.arange(1,20,2),
    # The integer 1 means "one feature per split"; the float 1.0 means "all
    # features". 1.0 replaces the "auto" option, which has been removed from
    # scikit-learn and meant "all features" for regressors.
    "max_features": [0.5,1,"sqrt",1.0],
    "max_samples": [10000]
}

# random_state makes the sampled hyperparameter combinations reproducible
# across kernel restarts.
rs_model = RandomizedSearchCV(model,
                              param_distributions=rf_grid,
                              n_iter=5,
                              cv = 5,
                              random_state=42,
                              verbose=True)

rs_model.fit(X_train, y_train)

##Train a model with the best hyperparameters

- found after 100 iterations of `RandomizedSearchCV`

In [None]:
%%time

ideal_model = RandomForestRegressor(n_jobs=-1, random_state=42,
                                    min_samples_split=14,
                                    min_samples_leaf=1,
                                    n_estimators=40,
                                    max_features=0.5,
                                    max_samples = None)

ideal_model.fit(X_train, y_train)

In [None]:
show_scores(rs_model) #trained on only 10,000 samples

In [None]:
show_scores(ideal_model)#trained on the whole training dataset

In [None]:
import joblib

# Save the trained model to a file using joblib
joblib.dump(ideal_model, 'ideal_model.joblib')

print("Model saved successfully as 'ideal_model.joblib'.")

## Make predictions on the test data

## Preprocess the data (getting the test dataset in the same format as the training dataset)

In [None]:
df_test = pd.read_csv("/content/bluebook-for-bulldozers/Test.csv", parse_dates=["saledate"])
df_test.head()

In [None]:
def preprocess_data(df):
  """
  Performs transformations on df and returns the transformed df.

  Works on a copy, so the DataFrame passed in is left unchanged.

  Steps:
    1. Split `saledate` into saleYear, saleMonth, saleDay, saleDayOfWeek and
       saleDayOfYear, then drop `saledate`.
    2. Numeric columns that contain missing values get a `<col>_is_missing`
       flag and are filled with the column median (0 if every value in the
       column is missing, since the median is then undefined).
    3. Every non-numeric column gets a `<col>_is_missing` flag and is
       replaced by its pandas category code + 1, so missing values become 0.

  NOTE: medians and category codes are computed from `df` itself, so the
  integer given to a category depends on which values occur in `df`.

  Parameters:
    df: DataFrame with a datetime-typed `saledate` column.

  Returns:
    A new, preprocessed DataFrame.
  """
  # Copy first so none of the assignments below touch the caller's frame
  df = df.copy()

  sale_dt = df["saledate"].dt
  df["saleYear"] = sale_dt.year
  df["saleMonth"] = sale_dt.month
  df["saleDay"] = sale_dt.day
  df["saleDayOfWeek"] = sale_dt.dayofweek
  df["saleDayOfYear"] = sale_dt.dayofyear

  df = df.drop(columns=["saledate"])

  # Snapshot the columns up front: the loop adds `_is_missing` columns, and
  # those must not be processed themselves.
  for label, content in list(df.items()):
    if pd.api.types.is_numeric_dtype(content):
      if content.isnull().any():
        # Add a binary column which tells us if the data was missing
        df[label+"_is_missing"] = content.isnull()
        # Replace the nan with the median (falling back to 0 when the whole
        # column is missing and the median itself is nan)
        median = content.median()
        df[label] = content.fillna(0 if pd.isnull(median) else median)
    else:
      # For categorical columns, add a binary column for missing values
      df[label+"_is_missing"] = content.isnull()
      # Turn categories into numbers and add +1 since pandas assigns -1 to values that are missing
      df[label] = pd.Categorical(content).codes+1

  return df

In [None]:
df_test = preprocess_data(df_test)

In [None]:
df_test.head()

In [None]:
# Compare the column sets once, before anything is added to or removed from
# the test frame.
train_columns = set(X_train.columns)
test_columns = set(df_test.columns)

# Columns the model was trained on but the test frame lacks: missing-value
# flags default to False, everything else to 0.
missing_in_test_cols = train_columns - test_columns
for col in missing_in_test_cols:
    df_test[col] = False if col.endswith("_is_missing") else 0

# Columns the model never saw are dropped.
extra_in_test_cols = test_columns - train_columns
df_test.drop(columns=list(extra_in_test_cols), inplace=True)

# Finally put the columns in the same order as the training features.
df_test = df_test[X_train.columns]

In [None]:
test_preds = ideal_model.predict(df_test)

#Converting the predictions into a DataFrame in the format Kaggle asks for

In [None]:
# One row per test sale: its SalesID next to the predicted SalePrice.
df_preds = pd.DataFrame({
    "SalesID": df_test["SalesID"],
    "SalePrice": test_preds,
})

In [None]:
df_preds

In [None]:
#export prediction data to csv to be submitted on Kaggle

df_preds.to_csv("test_predictions.csv", index = False)

In [None]:
import joblib

# Save the X_train columns to a file
joblib.dump(X_train.columns, 'X_train_columns.joblib')

print("X_train column names saved successfully as 'X_train_columns.joblib'.")

In [None]:
#Find Feature Importance of our best Model


# Task
Create a Streamlit application named `app.py`. This application should:
1. Load the `ideal_model.joblib` and `X_train_columns.joblib` files.
2. Define a `preprocess_data` function that transforms input data similarly to how the training data was processed.
3. Identify the top 5 most important features from the loaded model's feature importances.
4. Create a user interface using Streamlit to collect input for these top 5 features.
5. Process the user input, transform it using `preprocess_data`, and make a prediction using the loaded model.
6. Display the predicted 'SalePrice' to the user.

Finally, provide the complete `app.py` code and instructions on how to run it, assuming `ideal_model.joblib` and `X_train_columns.joblib` are in the same directory.

## Load Model and Feature Names

### Subtask:
Load the `ideal_model.joblib` and `X_train_columns.joblib` files within the `app.py` script. This will be crucial for making predictions and ensuring feature consistency.


**Reasoning**:
To load the saved model and feature names, I need to use the `joblib` library to deserialize the files. I will import joblib and then use `joblib.load()` for each file.



In [None]:
import joblib

# Load the trained machine learning model
ideal_model = joblib.load('ideal_model.joblib')

# Load the list of training column names
X_train_columns = joblib.load('X_train_columns.joblib')

print("Model and feature columns loaded successfully.")

**Reasoning**:
To proceed with the Streamlit application and meet the requirement of identifying the top 5 most important features, I need to access the `feature_importances_` attribute of the loaded `ideal_model` and associate them with their respective column names from `X_train_columns`. This will allow me to rank and select the most influential features.



In [None]:
feature_importances = ideal_model.feature_importances_
feature_names = X_train_columns

# Create a Series for easier sorting and identification
importance_df = pd.Series(feature_importances, index=feature_names)

# Get the top 5 most important features
top_5_features = importance_df.nlargest(5)

print("Top 5 most important features:")
print(top_5_features)

**Reasoning**:
To ensure that new input data is processed in the same way as the training data, I will recreate the `preprocess_data` function within the `app.py` script. This function will handle date feature engineering, fill numeric missing values with medians (and create `_is_missing` columns), and convert categorical features to numerical codes (also with `_is_missing` columns).



In [None]:
import pandas as pd
import numpy as np

def preprocess_data(df, X_train_columns):
    """
    Performs transformations on df to match the training data format.

    Works on a copy, so the DataFrame passed in is left unchanged.

    Steps:
      1. Split `saledate` into saleYear, saleMonth, saleDay, saleDayOfWeek
         and saleDayOfYear, then drop `saledate`.
      2. Numeric columns that contain missing values get a `<col>_is_missing`
         flag and are filled with the column median. When every value is
         missing (always the case for a missing value in a single-row input)
         the median is undefined, so 0 is used instead.
      3. Every non-numeric column gets a `<col>_is_missing` flag and is
         replaced by its pandas category code + 1, so missing values become 0.
      4. Columns are aligned with `X_train_columns`: absent `_is_missing`
         columns are added as False, other absent columns as 0, columns not
         in `X_train_columns` are dropped, and the training order is applied.

    NOTE: medians and category codes are computed from `df` itself, so the
    integer given to a category depends on which values occur in `df`.

    Parameters:
      df: DataFrame with a datetime-typed `saledate` column.
      X_train_columns: iterable of the column names used to train the model,
        in training order.

    Returns:
      A new DataFrame whose columns are exactly `X_train_columns`, in order.
    """
    # Copy first so none of the assignments below touch the caller's frame
    df = df.copy()

    # Feature engineering for saledate
    sale_dt = df["saledate"].dt
    df["saleYear"] = sale_dt.year
    df["saleMonth"] = sale_dt.month
    df["saleDay"] = sale_dt.day
    df["saleDayOfWeek"] = sale_dt.dayofweek
    df["saleDayOfYear"] = sale_dt.dayofyear
    df = df.drop(columns=["saledate"])

    # Snapshot the columns up front: the loop adds `_is_missing` columns, and
    # those must not be processed themselves.
    for label, content in list(df.items()):
        if pd.api.types.is_numeric_dtype(content):
            if content.isnull().any():
                df[label+"_is_missing"] = content.isnull()
                # Fall back to 0 when the median itself is nan
                median = content.median()
                df[label] = content.fillna(0 if pd.isnull(median) else median)
        else:
            # For categorical columns, add a binary column for missing values
            df[label+"_is_missing"] = content.isnull()
            # Turn categories into numbers and add +1 since pandas assigns -1 to values that are missing
            df[label] = pd.Categorical(content).codes+1

    # Align columns with training data: add whatever the model expects but
    # the input lacks.
    train_columns = list(X_train_columns)
    for col in train_columns:
        if col not in df.columns:
            df[col] = False if col.endswith("_is_missing") else 0

    # Selecting by the training columns both drops any extra columns and
    # puts the remaining ones in training order.
    return df[train_columns]

print("Preprocess data function defined.")