<p style="text-align:center">
    <a href="https://skills.network" target="_blank">
    <img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/assets/logos/SN_web_lightmode.png" width="300" alt="Skills Network Logo">
    </a>
</p>


# Test Environment for Generative AI classroom labs

This lab provides a test environment for the codes generated using the Generative AI classroom.

Follow the instructions below to set up this environment for further use.


# Setup


### Install required libraries

In case of a requirement of installing certain python libraries for use in your task, you may do so as shown below.


In [1]:
%pip install seaborn
import piplite

await piplite.install(['nbformat', 'plotly'])

### Dataset URL from the GenAI lab
Use the URL provided in the GenAI lab in the cell below. 


In [1]:
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0271EN-SkillsNetwork/labs/v1/m3/data/used_car_price_analysis.csv"

### Downloading the dataset

Execute the following code to download the dataset in to the interface.

> Please note that this step is essential in JupyterLite. If you are using a downloaded version of this notebook and running it on JupyterLabs, then you can skip this step and directly use the URL in pandas.read_csv() function to read the dataset as a dataframe


In [4]:
from pyodide.http import pyfetch

async def download(url, filename):
    response = await pyfetch(url)
    if response.status == 200:
        with open(filename, "wb") as f:
            f.write(await response.bytes())

path = URL

await download(path, "dataset.csv")
file_name  = "dataset.csv"

---


# Test Environment


In [14]:
# Keep appending the code generated to this cell, or add more cells below this to execute in parts
# Install dependencies as needed:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


# Read data from CSV into a DataFrame
# Replace 'data.csv' with your actual file path
df = pd.read_csv('dataset.csv')

# Remove duplicate rows
df = df.drop_duplicates()

# Fill missing values
# Numeric columns: fill with the column mean
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Categorical/object columns: fill with the most frequent value (mode)
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

# Output the cleaned DataFrame (optional)
print(df)

 # Determine the fuel type column using common names
fuel_col = None
for candidate in ['fuel_type', 'fuel', 'fueltype', 'fuel_type_desc']:
        if candidate in df.columns:
            fuel_col = candidate
            break

 # Fallback: find any column that contains the substring 'fuel'
if fuel_col is None:
       for col in df.columns:
           if 'fuel' in col.lower():
                fuel_col = col
                break

    # Count the number of sales per fuel type
sales_per_fuel = df[fuel_col].value_counts().reset_index()
sales_per_fuel.columns = ['fuel_type', 'sales_count']
print(sales_per_fuel)

# Detect price column from common names, otherwise pick the first column
price_col = None
for cand in ['price', 'price_usd', 'price_currency', 'price_amount']:
       if cand in df.columns:
            price_col = cand
            break
if price_col is None:
       price_col = df.columns[0]

path = 'dataset.csv'  # e.g., 'housing.csv'
compare_results = compare_models_on_dataset(
file_path=path,
target_col='target',  # change to your actual target column
degree=2,
test_size=0.2,
random_state=42
)
    # Optionally inspect raw results in the console
print("\nSummary:")
for r in compare_results:
      print(r)


def compare_models_on_dataset(
    file_path: str,
    target_col: str = 'target',
    degree: int = 2,
    test_size: float = 0.2,
    random_state: int = 42
) -> list:
    """Compare Linear, Polynomial, and Ridge regression models.

    - Evaluates models on a single feature (the first numeric feature) and
      on multiple features (all numeric features).
    - Returns a list of result dictionaries with RMSE and R^2 for each model.
    
    Parameters:
      file_path (str): Path to a CSV file containing the dataset.
      target_col (str): Name of the numeric target column to predict.
      degree (int): Degree for the polynomial features.
      test_size (float): Proportion of data to use as the test set.
      random_state (int): Seed for reproducibility.
    """

    # Load data from CSV
    df = pd.read_csv(file_path)

    # Identify numeric feature columns, excluding the target column
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    if target_col in numeric_cols:
        numeric_cols.remove(target_col)

    # If no numeric features, nothing to compare
    if not numeric_cols:
        print("No numeric feature columns found for modeling.")
        return []

    # Separate features and target
    X_all = df[numeric_cols]
    y = df[target_col]

    # Split data into training and testing sets (same split for all experiments)
    X_train_all, X_test_all, y_train, y_test = train_test_split(
        X_all, y, test_size=test_size, random_state=random_state
    )

    results = []  # collect results for all configurations

    # Evaluate two feature settings: single feature vs all features
    for mode, feature_cols in [('single', [numeric_cols[0]]), ('multiple', numeric_cols)]:
        X_train = X_train_all[feature_cols]
        X_test = X_test_all[feature_cols]

        # Define three model families
        models = {
            'LinearRegression': LinearRegression(),
            'PolynomialDegree{}__SingleOrAll'.format(degree): Pipeline([
                ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
                ('linear', LinearRegression())
            ]),
            'Ridge': Ridge(alpha=1.0)
        }

        for model_name, model in models.items():
            # Fit model on training data
            model.fit(X_train, y_train)

            # Predict on test data
            y_pred = model.predict(X_test)

            # Evaluate performance
            rmse = mean_squared_error(y_test, y_pred, squared=False)
            r2 = r2_score(y_test, y_pred)

            results.append({
                'mode': mode,
                'feature_set': 'single' if mode == 'single' else 'multiple',
                'model': model_name,
                'rmse': rmse,
                'r2': r2
            })

    # Print a compact summary
    for r in results:
        print(
            "Mode: {mode}, Feature set: {feat}, Model: {m}, RMSE: {rmse:.4f}, R2: {r2:.4f}".format(
                mode=r['mode'], feat=r['feature_set'], m=r['model'], rmse=r['rmse'], r2=r['r2']
            )
        )

    return results



    

         model  year transmission  mileage fuelType    tax   mpg  engineSize  \
0       Fiesta  2017    Automatic    15944   Petrol  150.0  57.7         1.0   
1        Focus  2018       Manual     9083   Petrol  150.0  57.7         1.0   
2        Focus  2017       Manual    12456   Petrol  150.0  57.7         1.0   
3       Fiesta  2019       Manual    10460   Petrol  145.0  40.3         1.5   
4       Fiesta  2019    Automatic     1482   Petrol  145.0  48.7         1.0   
...        ...   ...          ...      ...      ...    ...   ...         ...   
17961    B-MAX  2017       Manual    16700   Petrol  150.0  47.1         1.4   
17962    B-MAX  2014       Manual    40700   Petrol   30.0  57.7         1.0   
17963    Focus  2015       Manual     7010   Diesel   20.0  67.3         1.6   
17964       KA  2018       Manual     5007   Petrol  145.0  57.7         1.2   
17965    Focus  2015       Manual     5007   Petrol   22.0  57.7         1.0   

       price  
0      12000  
1      14

<class 'NameError'>: name 'compare_models_on_dataset' is not defined

[Abhishek Gagneja](https://www.linkedin.com/in/abhishek-gagneja-23051987/)


## Change Log


|Date (YYYY-MM-DD)|Version|Changed By|Change Description|
|-|-|-|-|
|2023-12-10|0.1|Abhishek Gagneja|Initial Draft created|


Copyright Â© 2023 IBM Corporation. All rights reserved.
