<a href="https://colab.research.google.com/github/seelamprem/IBM-Capstone-Final-Project-/blob/main/GenAImodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import os

def plot_pivot_pcolor(df, index_col='GPU', column_col='CPU_core', value_col='Price', output_file=None):
    """Draw a pcolor heatmap of the mean of ``value_col`` grouped by two columns.

    The data is pivoted so that the unique values of ``index_col`` become the
    rows (y axis) and the unique values of ``column_col`` become the columns
    (x axis); each cell holds the mean of ``value_col`` for that pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a copy is used internally.
    index_col : str
        Column whose values form the rows of the pivot (y axis).
    column_col : str
        Column whose values form the columns of the pivot (x axis).
    value_col : str
        Column to aggregate. Values that cannot be parsed as numbers are
        coerced to NaN and therefore ignored by the mean.
    output_file : str or None
        When truthy, the figure is written to this path at 300 dpi and closed;
        otherwise the figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If any of the three columns is missing from ``df``, or if the
        resulting pivot table is empty.
    """
    # Validate required columns
    required = (index_col, column_col, value_col)
    if any(col not in df.columns for col in required):
        raise ValueError(f"DataFrame must contain columns: {index_col}, {column_col}, {value_col}")

    # Ensure numeric target (work on a copy so the caller's frame is untouched)
    df_tmp = df.copy()
    df_tmp[value_col] = pd.to_numeric(df_tmp[value_col], errors='coerce')

    # Create pivot table: rows -> index_col, cols -> column_col, values -> value_col (mean)
    pivot = df_tmp.pivot_table(index=index_col, columns=column_col, values=value_col, aggfunc='mean')

    if pivot.empty:
        raise ValueError("Pivot table is empty. No data available for the given grouping.")

    data = pivot.values
    n_rows, n_cols = data.shape
    # Scale the canvas with the grid size, but never below a readable minimum.
    fig_height = max(4, n_rows * 0.5)
    fig_width = max(6, n_cols * 1.0)
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))

    # Use a masked array so (index, column) pairs without data are left blank
    masked = np.ma.masked_invalid(data)
    mesh = ax.pcolor(masked, cmap='viridis', edgecolors='k', linewidths=0.02)

    # Ticks sit at cell centres (hence the +0.5)
    ax.set_xticks(np.arange(n_cols) + 0.5)
    ax.set_yticks(np.arange(n_rows) + 0.5)
    ax.set_xticklabels([str(x) for x in pivot.columns], rotation=45, ha='right')
    ax.set_yticklabels([str(y) for y in pivot.index])
    # The x axis runs over the pivot's columns and the y axis over its index,
    # so the axis labels must name column_col and index_col respectively.
    ax.set_xlabel(column_col)
    ax.set_ylabel(index_col)

    fig.colorbar(mesh, ax=ax, label=value_col)
    ax.set_title(f"Mean {value_col} by {index_col} and {column_col}")
    fig.tight_layout()

    if output_file:
        # Operate on this figure explicitly rather than on pyplot's "current" figure.
        fig.savefig(output_file, dpi=300)
        plt.close(fig)
        print(f"Pivot pcolor plot saved to: {output_file}")
    else:
        plt.show()

# Demo for Colab: build a tiny synthetic table, persist it as CSV, and plot it.
dummy_csv_path = 'dummy_data.csv'
dummy_data = dict(
    GPU=['A100', 'A100', 'V100', 'V100', 'T4', 'T4', 'A100', 'V100'],
    CPU_core=[8, 16, 8, 16, 4, 8, 8, 16],
    Price=[1000, 1500, 800, 1200, 300, 500, 1100, 1300],
)
df_dummy = pd.DataFrame(dummy_data)
df_dummy.to_csv(dummy_csv_path, index=False)
print(f"Created dummy CSV: {dummy_csv_path}")

# Plot from the in-memory frame; passing output_file writes a PNG instead of
# opening an interactive window.
plot_pivot_pcolor(df_dummy, index_col='GPU', column_col='CPU_core', value_col='Price',
                  output_file='pivot_pcolor_dummy.png')

Created dummy CSV: dummy_data.csv
Pivot pcolor plot saved to: pivot_pcolor_dummy.png


In [None]:
# Install extra packages with the %pip magic (one call per line below).
# NOTE(review): versions are not pinned, so results may vary between runs — consider pinning.
%pip install seaborn
%pip install nbformat plotly



In [None]:
# Remote location of the CSV file (laptop_pricing_dataset_mod2.csv) used by the cells below.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"

In [None]:
import urllib.request

# Fetch the remote CSV once and keep a local copy named dataset.csv.
path, file_name = URL, "dataset.csv"

urllib.request.urlretrieve(path, file_name)
print(f"Downloaded {path} to {file_name}")

Downloaded https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv to dataset.csv


In [None]:
import pandas as pd

In [None]:
# Read the CSV at URL into a DataFrame; header=0 means the first row supplies the column names.
df = pd.read_csv(URL, header=0)

In [None]:
# Reads a CSV into a DataFrame, selects one feature as X and one as y, trains a simple linear regression, and computes MSE and R^2 on the training data.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "CPU_frequency", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 284583.4405868629
R^2: 0.1344436321024326


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "RAM_GB", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 229582.71927482335
R^2: 0.3017275206956078


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "Storage_GB_SSD", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 309304.90367538505
R^2: 0.059253664070951784


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "CPU_core", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 259397.494681478
R^2: 0.2110463178208084


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "OS", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 312622.2168179772
R^2: 0.0491641047172533


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "GPU", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 301459.33330554154
R^2: 0.0831158514821474


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "Category", "Price"

# Data: a one-column frame for X (2-D input for the estimator) and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Fit: fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

# Metrics are computed on the same rows the model was fitted on (in-sample).
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 301847.6094560483
R^2: 0.08193491525521568


In [None]:
#The code trains polynomial regression models of degrees 2, 3, and 5 on a single feature to predict a target. It computes MSE and R^2 for each model and reports the best degree by R^2.

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Settings: remote CSV (first row is the header) and the predictor/response pair.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
feature_col, target_col = "CPU_frequency", "Price"

# Data: a one-column frame for X and a series for y.
df = pd.read_csv(URL, header=0)
X, y = df[[feature_col]], df[target_col]

# Polynomial orders to compare; one result record is collected per order.
degrees = [2, 3, 5]
results = []

for degree in degrees:
    # Expand the single feature into its powers up to `degree` (no constant
    # column: LinearRegression fits its own intercept).
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Ordinary least squares on the expanded features.
    model = LinearRegression().fit(X_poly, y)

    # In-sample metrics: predictions are made on the rows used for fitting.
    y_pred = model.predict(X_poly)
    mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

    results.append(dict(degree=degree, mse=mse, r2=r2))

# Pick the record with the highest R^2.
# NOTE(review): because R^2 here is in-sample, this comparison tends to favour
# the largest degree — consider a held-out or cross-validated score.
best = max(results, key=lambda entry: entry["r2"])

# Report every degree, then the winner.
print("Results by degree:")
for r in results:
    print(f"Degree {r['degree']}: MSE={r['mse']:.6f}, R2={r['r2']:.6f}")
print("Best model: degree", best["degree"], "R2=", best["r2"])

Results by degree:
Degree 2: MSE=249022.665968, R2=0.242601
Degree 3: MSE=241024.863038, R2=0.266926
Degree 5: MSE=229137.295481, R2=0.303082
Best model: degree 5 R2= 0.30308227064438076


In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Configuration: adjust as needed
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"  # CSV with header row
target_col = "Price"            # name of the target column
feature_col = "CPU_frequency"   # not used below: this cell uses all remaining columns as features

# Load data
df = pd.read_csv(URL, header=0)

# Validate columns
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' not found in data. Columns: {list(df.columns)}")

# Select all columns except the target column as potential features
all_features = df.columns.drop(target_col)

# Columns that must not be used as predictors:
#   - 'Unnamed: 0.1', 'Unnamed: 0': leftover index columns from earlier CSV round-trips.
#   - 'Price-binned': presumably a binned copy of the target in this dataset
#     (TODO confirm against df.columns). Keeping a column derived from the target
#     leaks the answer into the features and inflates R^2.
# errors='ignore' makes the drop a no-op for any name that is absent.
cols_to_drop_from_features = ['Unnamed: 0.1', 'Unnamed: 0', 'Price-binned']
features_df = df[all_features].drop(columns=cols_to_drop_from_features, errors='ignore')

# Identify categorical columns among the remaining features
categorical_features = features_df.select_dtypes(include='object').columns

# Apply one-hot encoding to the identified categorical features
# drop_first=True helps to avoid multicollinearity
X = pd.get_dummies(features_df, columns=categorical_features, drop_first=True)
y = df[target_col]

# Create a pipeline: scale features, generate polynomial features, then fit linear regression
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("model", LinearRegression())
])

# Train model
pipeline.fit(X, y)

# Evaluate on training data (in-sample; not an estimate of generalisation error)
y_pred = pipeline.predict(X)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)

MSE: 15166.741996629014
R^2: 0.9538705762767853


In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod2.csv"
target_col = "Price"

df = pd.read_csv(URL, header=0)

y = df[target_col]

# Identify all potential features (excluding the target column)
all_features = df.columns.drop(target_col)

# Columns that must not be used as predictors:
#   - 'Unnamed: 0.1', 'Unnamed: 0': leftover index columns.
#   - 'Price-binned': presumably a binned copy of the target in this dataset
#     (TODO confirm against df.columns). Using a column derived from the target
#     leaks the answer into the features and inflates both training and CV scores.
# errors='ignore' makes the drop a no-op for any name that is absent.
cols_to_drop_from_X = ['Unnamed: 0.1', 'Unnamed: 0', 'Price-binned']
X_df = df[all_features].drop(columns=cols_to_drop_from_X, errors='ignore')

# Separate numerical and categorical columns for preprocessing
numerical_features = X_df.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_df.select_dtypes(include='object').columns.tolist()

# Create the ColumnTransformer
# - 'num_poly' pipeline scales numerical features and then applies polynomial features
# - 'cat' pipeline applies one-hot encoding to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num_poly', Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)) # Degree will be tuned by GridSearchCV
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ],
    remainder='passthrough' # Pass through any other columns not explicitly handled
)

# Define the full pipeline: preprocessing + Ridge regression
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    "preprocessor__num_poly__poly__degree": [2, 3], # Targeting the 'poly' step inside 'num_poly' transformer
    "model__alpha": [0.0001,0.001,0.01, 0.1, 1, 10]
}

# 5-fold cross-validation; the scorer is negated MSE, so larger (closer to 0) is better.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid.fit(X_df, y)

# best_estimator_ is the winning configuration refitted on all rows, so the
# metrics below are in-sample; the cross-validated MSE is reported separately.
best = grid.best_estimator_
y_pred = best.predict(X_df)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print("MSE:", mse)
print("R^2:", r2)
print("Best params:", grid.best_params_)
print("CV MSE (mean over folds):", -grid.best_score_)

MSE: 25774.467434433373
R^2: 0.9216073346676935
Best params: {'model__alpha': 0.1, 'preprocessor__num_poly__poly__degree': 2}
