In [4]:
from google.colab import drive
drive.mount("/content/drive")

# !source /content/drive/MyDrive/colab_env/bin/activate; pip install Pypdf

Mounted at /content/drive


In [5]:
import sys
sys.path.append("/content/drive/MyDrive/colab_env/lib/python3.11/site-packages")

In [6]:
import math
import os
import pprint

import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print('TF: {}'.format(tf.__version__))

import apache_beam as beam
print('Beam: {}'.format(beam.__version__))

import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.keras_lib import tf_keras
print('Transform: {}'.format(tft.__version__))

from tfx_bsl.public import tfxio
from tfx_bsl.coders.example_coder import RecordBatchToExamplesEncoder

tf.compat.v1.disable_eager_execution()

TF: 2.18.0
Beam: 2.65.0
Transform: 1.16.0


In [11]:
# !chmod +x /content/drive/MyDrive/colab_env/bin/python
# !chmod +x /content/drive/MyDrive/colab_env/bin/pip

# !source /content/drive/MyDrive/colab_env/bin/activate; pip install 'tensorflow_data_validation[visualization]<2'

In [80]:
# import tensorflow_data_validation as tfdv

# RAW_SCHEMA = tfdv.load_schema_text("./schema.pbtxt")
# print("\nSchema loaded successfully using tfdv.load_schema_text:")

# # Convert the tfdv.Schema object to a tf.Transform.DatasetMetadata object ---
# # This is the crucial step to bridge tfdv's schema with tf.Transform's API.
# # tf.Transform's DatasetMetadata can be initialized directly with a tfdv.Schema object.
# RAW_DATA_METADATA = tft.DatasetMetadata(schema=RAW_SCHEMA)
# print("\nConverted to tf.Transform DatasetMetadata object:")

RAW_DATA_FEATURE_SPEC = {
    'Unname': tf.io.FixedLenFeature([], tf.int64),
    'Company': tf.io.FixedLenFeature([], tf.string),
    'TypeName': tf.io.FixedLenFeature([], tf.string),
    'Inches': tf.io.FixedLenFeature([], tf.float32),
    'ScreenResolution': tf.io.FixedLenFeature([], tf.string),
    'Cpu': tf.io.FixedLenFeature([], tf.string),
    'Ram': tf.io.FixedLenFeature([], tf.string),
    'Memory': tf.io.FixedLenFeature([], tf.string),
    'Gpu': tf.io.FixedLenFeature([], tf.string),
    'OpSys': tf.io.FixedLenFeature([], tf.string),
    'Weight': tf.io.FixedLenFeature([], tf.string),
    'Price': tf.io.FixedLenFeature([], tf.float32),
}
RAW_DATA_METADATA = tft.tf_metadata.dataset_metadata.DatasetMetadata(
    tft.tf_metadata.schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
)


In [25]:
import tensorflow as tf
import tensorflow_transform as tft

def parse_storage_string(storage_string):
    storage_string = tf.strings.lower(storage_string)
    parts = tf.strings.split(storage_string, sep=' + ')

    # Extract sizes and units
    numeric_part = tf.strings.regex_replace(parts, r'[^0-9\.]', ' ')
    numeric_part = tf.strings.strip(numeric_part)
    size_values = tf.strings.to_number(tf.strings.split(numeric_part, sep=' ').values, tf.float32)

    # Detect units (TB or GB)
    is_tb = tf.strings.regex_full_match(parts, '.*tb.*')
    multiplier = tf.where(is_tb, 1024.0, 1.0)
    size_gb = size_values * multiplier

    # Type masks
    is_ssd = tf.strings.regex_full_match(parts, '.*ssd.*')
    is_hdd = tf.strings.regex_full_match(parts, '.*hdd.*')
    is_flash = tf.strings.regex_full_match(parts, '.*flash.*')

    # Sum sizes by type using boolean masks
    ssd_gb = tf.reduce_sum(tf.where(is_ssd, size_gb, 0.0))
    hdd_gb = tf.reduce_sum(tf.where(is_hdd, size_gb, 0.0))
    flash_gb = tf.reduce_sum(tf.where(is_flash, size_gb, 0.0))
    total_gb = ssd_gb + hdd_gb + flash_gb

    return {
        'ssd_gb': ssd_gb,
        'hdd_gb': hdd_gb,
        'flash_gb': flash_gb,
        'total_storage_gb': total_gb
    }


In [103]:
def preprocessing_fn(inputs):
    """
    tf.Transform preprocessing function for laptop data.

    Args:
      inputs: A dictionary where keys are feature names and values are raw Tensors.

    Returns:
      A dictionary of transformed Tensors.
    """
    outputs = {}

    # --- 1. 'Company' (Categorical) ---
    # Ensure Company is not empty, replace with 'UNKNOWN_COMPANY' if it is
    company = tf.strings.lower(inputs['Company'])
    company = tf.strings.strip(company)
    company = tf.where(
        tf.strings.length(company) > 0,
        company,
        tf.constant("UNKNOWN_COMPANY", dtype=tf.string)
    )
    outputs['company_xf'] = tft.compute_and_apply_vocabulary(company)

    # --- 2. 'Memory' (Categorical) ---
    memory_str = tf.strings.strip(tf.squeeze(inputs['Memory'], axis=1) if len(inputs['Memory'].shape) == 2 else inputs['Memory'])

    storage_features = tf.map_fn(
        parse_storage_string,
        memory_str,
        fn_output_signature={
            'ssd_gb': tf.TensorSpec([], tf.float32),
            'hdd_gb': tf.TensorSpec([], tf.float32),
            'flash_gb': tf.TensorSpec([], tf.float32),
            'total_storage_gb': tf.TensorSpec([], tf.float32),
        },
        parallel_iterations=32,
        name="storage_features"
    )
    outputs['scaled_ssd'] = tft.scale_to_z_score(storage_features['ssd_gb'])
    outputs['scaled_hdd'] = tft.scale_to_z_score(storage_features['hdd_gb'])
    outputs['scaled_flash'] = tft.scale_to_z_score(storage_features['flash_gb'])
    outputs['scaled_total'] = tft.scale_to_z_score(storage_features['total_storage_gb'])

    # --- 3 RAM (Numerical) ---
    ram_str = inputs['Ram']
    ram = tf.strings.regex_replace(ram_str, 'GB', '')
    ram = tf.strings.to_number(ram, tf.float32)
    outputs['ram_scaled'] = tft.scale_to_z_score(ram)

    # --- 4. OpSys (categorical) ---
    def normalize_os_single(os_str):
        os_str = tf.strings.lower(tf.strings.strip(os_str))
        is_windows = tf.strings.regex_full_match(os_str, '.*windows.*')
        is_mac = tf.strings.regex_full_match(os_str, '.*mac.*')

        return tf.case([
            (is_windows, lambda: tf.constant("windows")),
            (is_mac, lambda: tf.constant("macos")),
        ], default=lambda: os_str)

    normalized_opsys = tf.map_fn(
        normalize_os_single,
        inputs["OpSys"],
        fn_output_signature=tf.TensorSpec([], tf.string)
    )
    outputs['opsys_xf'] = tft.compute_and_apply_vocabulary(normalized_opsys)

    # --- 5. TypeName (categorical) ---
    type_name = tf.strings.lower(inputs["TypeName"])
    type_name = tf.strings.strip(type_name)
    outputs['typename_xf'] = tft.compute_and_apply_vocabulary(type_name)

    # --- 6. ScreenResolution (categorical/numerical) ---
              # 1. Normalize string
    screen_res_str = tf.strings.lower(inputs["ScreenResolution"])
    screen_res_str = tf.strings.strip(screen_res_str)

              # 2. Extract resolution part (e.g. 1920x1080)
    resolution_match = tf.strings.regex_replace(screen_res_str, r".*(\d{3,5}x\d{3,5})", r"\1")
    resolution_parts = tf.strings.split(resolution_match, "x").to_tensor(default_value="0")
    width = tf.strings.to_number(resolution_parts[:, 0], tf.float32)
    height = tf.strings.to_number(resolution_parts[:, 1], tf.float32)

              # 3. Scale width and height
    outputs["scaled_width"] = tft.scale_to_z_score(width)
    outputs["scaled_height"] = tft.scale_to_z_score(height)

              # 4. flag based extraction
    def contains(keyword):
         is_present = tf.strings.regex_full_match(screen_res_str, f".*{keyword}.*")
         return tf.cast(is_present, tf.float32)

    outputs["has_ips_panel"] = contains("ips panel")
    outputs["has_touchscreen"] = contains("touchscreen")
    outputs["is_4k"] = contains("4k ultra hd")
    outputs["is_retina"] = contains("retina display")
    outputs["is_full_hd"] = contains("full hd")
    outputs["is_quad_hd"] = contains("quad hd\\+")

    # --- 7. CPU (categorical/numerical) ---
              # Clean and lowercase CPU strings
    cpu = tf.strings.strip(inputs['Cpu'])
    cpu = tf.strings.lower(cpu)

              # 1. Extract CPU brand (intel / amd)
    cpu_brand = tf.where(
        tf.strings.regex_full_match(cpu, ".*intel.*"),
        tf.constant("intel"),
        tf.constant("amd")
    )

              # 2. Extract CPU family (e.g. core i5, a9-series, ryzen)
    cpu_family = tf.strings.regex_replace(cpu, r"^(intel|amd)\s+", "")
    cpu_family = tf.strings.regex_replace(cpu_family, r"\s+\d.*", "")  # remove model numbers and GHz

              # 3. Extract CPU clock speed in GHz (e.g., 2.5 from 2.5GHz)
    clock_speed_str = tf.strings.regex_replace(cpu, r".*?([\d\.]+)\s*ghz.*", r"\1")
    clock_speed = tf.strings.to_number(clock_speed_str, out_type=tf.float32)

    outputs['cpu_brand_xf'] = tft.compute_and_apply_vocabulary(cpu_brand)
    outputs['cpu_family_xf'] = tft.compute_and_apply_vocabulary(cpu_family)
    outputs['scaled_cpu_clock_speed_ghz'] = tft.scale_to_z_score(clock_speed)

    # --- 8. GPU (categorical) ---
    gpu = tf.strings.lower(inputs['Gpu'])
    gpu = tf.strings.strip(gpu)
    gpu_arr = tf.strings.split(gpu, sep=" ").to_tensor(default_value='')
    outputs['gpu_brand_xf'] = tft.compute_and_apply_vocabulary(gpu_arr[:, 0])
    outputs['gpu_model_xf'] = tft.compute_and_apply_vocabulary(gpu_arr[:, 1])

    # --- 9. Inches (numerical) --
    inches = tf.cast(inputs["Inches"], tf.float32)
    # Optional: Bucket into size groups (e.g. small <14", medium <15.6", large >=15.6")
    inches_bucket = tf.where(
        inches < 14.0, 0,
        tf.where(inches < 15.6, 1, 2)
    )
    outputs['inches_bucket'] = tf.cast(inches_bucket, tf.float32)

    # --- 10. Weight (numerical) --
    weight = tf.strings.lower(inputs['Weight'])
    weight = tf.strings.regex_replace(weight, 'kg', '')
    weight = tf.strings.to_number(weight, tf.float32)
    outputs['scaled_weight'] = tft.scale_to_z_score(weight)

    # --- 11. Price (numerical) --
    outputs['scaled_price'] = tft.scale_to_z_score(inputs['Price'])

    return outputs

In [107]:
import apache_beam as beam
import tensorflow_transform.beam as tft_beam
import tempfile
import csv
import numpy as np
import random

def split_data(element, train_ratio=0.7, eval_ratio=0.15):
    rnd = random.random()
    if rnd < train_ratio:
        tag = 'train'
    elif rnd < train_ratio + eval_ratio:
        tag = 'eval'
    else:
        tag = 'test'
    return tag, element

# Step 2: Define Split class that uses the fieldnames
class Split(beam.DoFn):
    def process(self, element):
        import numpy as np
        Unname, Company, TypeName, Inches, ScreenResolution, Cpu, Ram, Memory, Gpu, OpSys, Weight, Price = element.split(",")
        yield {
            "Unname": int(Unname.strip()),
            "Company": Company.strip(),
            "TypeName": TypeName.strip(),
            "Inches": float(Inches.strip()),
            "ScreenResolution": ScreenResolution.strip(),
            "Cpu": Cpu.strip(),
            "Ram": Ram.strip(),
            "Memory": Memory.strip(),
            "Gpu": Gpu.strip(),
            "OpSys": OpSys.strip(),
            "Weight": Weight.strip(),
            "Price": float(Price.strip()),
        }

# Step 3: Your Beam + TFT pipeline
with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):

        raw_data = (
            pipeline
            | "Read Laptop Data" >> beam.io.ReadFromText("laptop_data.csv", skip_header_lines=1)
            | "Parse CSV Rows" >> beam.ParDo(Split())
        )

        # Split the dataset into train/eval/test using tags
        split_data_dict = (
            raw_data
            | "Split Data" >> beam.Map(split_data)
            | "Partition into Train/Eval/Test" >> beam.Partition(
                lambda elem, _: {'train': 0, 'eval': 1, 'test': 2}[elem[0]], 3
            )
        )

        train_data = split_data_dict[0] | "Drop Train Tag" >> beam.Map(lambda x: x[1])
        eval_data = split_data_dict[1] | "Drop Eval Tag" >> beam.Map(lambda x: x[1])
        test_data = split_data_dict[2] | "Drop Test Tag" >> beam.Map(lambda x: x[1])

        # raw_dataset = (raw_data, RAW_DATA_METADATA)

        # Only analyze and transform training data
        raw_train_dataset = (train_data, RAW_DATA_METADATA)

        transformed_train_dataset, transform_fn = (
            raw_train_dataset
            | "Analyze & Transform Train" >> tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
        )
        transformed_train_data, transformed_metadata = transformed_train_dataset

        # Apply transform_fn to eval and test data
        transformed_eval_data, _ = (
            ((eval_data, RAW_DATA_METADATA), transform_fn)
            | "Transform Eval Data" >> tft_beam.TransformDataset()
        )

        transformed_test_data, _ = (
            ((test_data, RAW_DATA_METADATA), transform_fn)
            | "Transform Test Data" >> tft_beam.TransformDataset()
        )

        # (Optional) Print to check
        _ = transformed_train_data | "Print Train" >> beam.Map(lambda x: print("Train:", x) or x)
        _ = transformed_eval_data | "Print Eval" >> beam.Map(lambda x: print("Eval:", x) or x)
        _ = transformed_test_data | "Print Test" >> beam.Map(lambda x: print("Test:", x) or x)

        _ = (
            transformed_train_data
            | 'Write Transformed Trained TFRecord' >> beam.io.WriteToTFRecord(
                file_path_prefix="train.tfrecord",
                file_name_suffix='.gz',
                coder=tft.coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
            )
        )

        _ = (
            transformed_train_data
            | 'Write Transformed Eval TFRecord' >> beam.io.WriteToTFRecord(
                file_path_prefix="eval.tfrecord",
                file_name_suffix='.gz',
                coder=tft.coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
            )
        )

        _ = (
            transformed_train_data
            | 'Write Transformed Test TFRecord' >> beam.io.WriteToTFRecord(
                file_path_prefix="test.tfrecord",
                file_name_suffix='.gz',
                coder=tft.coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
            )
        )


'Counter' object has no attribute 'name'
'tuple' object has no attribute 'name'
'Counter' object has no attribute 'name'
'tuple' object has no attribute 'name'
value: "\n\013\n\tConst_4:0\022-vocab_compute_and_apply_vocabulary_vocabulary"

value: "\n\013\n\tConst_7:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"

value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_2_vocabulary"

value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_3_vocabulary"

value: "\n\014\n\nConst_16:0\022/vocab_compute_and_apply_vocabulary_4_vocabulary"

value: "\n\014\n\nConst_19:0\022/vocab_compute_and_apply_vocabulary_5_vocabulary"

value: "\n\014\n\nConst_22:0\022/vocab_compute_and_apply_vocabulary_6_vocabulary"

value: "\n\013\n\tConst_4:0\022-vocab_compute_and_apply_vocabulary_vocabulary"

value: "\n\013\n\tConst_7:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"

value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_2_vocabulary"

value: "\n\014\n\n

Train: {'company_xf': 7, 'cpu_brand_xf': 0, 'cpu_family_xf': 1, 'gpu_brand_xf': 0, 'gpu_model_xf': 5, 'has_ips_panel': 1.0, 'has_touchscreen': 0.0, 'inches_bucket': 0.0, 'is_4k': 0.0, 'is_full_hd': 0.0, 'is_quad_hd': 0.0, 'is_retina': 1.0, 'opsys_xf': 4, 'ram_scaled': -0.08530744910240173, 'scaled_cpu_clock_speed_ghz': 0.008417012169957161, 'scaled_flash': -0.1601697951555252, 'scaled_hdd': -0.8294079899787903, 'scaled_height': 1.8773751258850098, 'scaled_price': 0.30803734064102173, 'scaled_ssd': -0.2837832570075989, 'scaled_total': -1.0305922031402588, 'scaled_weight': -1.0145057439804077, 'scaled_width': -0.7559967637062073, 'typename_xf': 2}
Train: {'company_xf': 7, 'cpu_brand_xf': 0, 'cpu_family_xf': 1, 'gpu_brand_xf': 0, 'gpu_model_xf': 0, 'has_ips_panel': 0.0, 'has_touchscreen': 0.0, 'inches_bucket': 0.0, 'is_4k': 0.0, 'is_full_hd': 0.0, 'is_quad_hd': 0.0, 'is_retina': 0.0, 'opsys_xf': 4, 'ram_scaled': -0.08530744910240173, 'scaled_cpu_clock_speed_ghz': -0.9932714700698853, 'sca

value: "\n\013\n\tConst_4:0\022-vocab_compute_and_apply_vocabulary_vocabulary"

value: "\n\013\n\tConst_7:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"

value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_2_vocabulary"

value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_3_vocabulary"

value: "\n\014\n\nConst_16:0\022/vocab_compute_and_apply_vocabulary_4_vocabulary"

value: "\n\014\n\nConst_19:0\022/vocab_compute_and_apply_vocabulary_5_vocabulary"

value: "\n\014\n\nConst_22:0\022/vocab_compute_and_apply_vocabulary_6_vocabulary"



Eval: {'company_xf': 7, 'cpu_brand_xf': 0, 'cpu_family_xf': 0, 'gpu_brand_xf': 2, 'gpu_model_xf': 2, 'has_ips_panel': 1.0, 'has_touchscreen': 0.0, 'inches_bucket': 1.0, 'is_4k': 0.0, 'is_full_hd': 0.0, 'is_quad_hd': 0.0, 'is_retina': 1.0, 'opsys_xf': 4, 'ram_scaled': 1.5226078033447266, 'scaled_cpu_clock_speed_ghz': 1.0101054906845093, 'scaled_flash': -0.1601697951555252, 'scaled_hdd': -0.8294079899787903, 'scaled_height': 2.5844602584838867, 'scaled_price': 1.8963260650634766, 'scaled_ssd': 0.40718334913253784, 'scaled_total': -0.7638698220252991, 'scaled_weight': -0.3268383741378784, 'scaled_width': 0.510871946811676, 'typename_xf': 2}
Eval: {'company_xf': 0, 'cpu_brand_xf': 0, 'cpu_family_xf': 1, 'gpu_brand_xf': 0, 'gpu_model_xf': 3, 'has_ips_panel': 1.0, 'has_touchscreen': 1.0, 'inches_bucket': 0.0, 'is_4k': 0.0, 'is_full_hd': 1.0, 'is_quad_hd': 0.0, 'is_retina': 0.0, 'opsys_xf': 0, 'ram_scaled': -0.08530744910240173, 'scaled_cpu_clock_speed_ghz': -1.3939467668533325, 'scaled_flash

value: "\n\013\n\tConst_4:0\022-vocab_compute_and_apply_vocabulary_vocabulary"

value: "\n\013\n\tConst_7:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"

value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_2_vocabulary"

value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_3_vocabulary"

value: "\n\014\n\nConst_16:0\022/vocab_compute_and_apply_vocabulary_4_vocabulary"

value: "\n\014\n\nConst_19:0\022/vocab_compute_and_apply_vocabulary_5_vocabulary"

value: "\n\014\n\nConst_22:0\022/vocab_compute_and_apply_vocabulary_6_vocabulary"



Test: {'company_xf': 7, 'cpu_brand_xf': 0, 'cpu_family_xf': 0, 'gpu_brand_xf': 2, 'gpu_model_xf': 2, 'has_ips_panel': 1.0, 'has_touchscreen': 0.0, 'inches_bucket': 1.0, 'is_4k': 0.0, 'is_full_hd': 0.0, 'is_quad_hd': 0.0, 'is_retina': 1.0, 'opsys_xf': 4, 'ram_scaled': 1.5226078033447266, 'scaled_cpu_clock_speed_ghz': 0.8097679615020752, 'scaled_flash': -0.1601697951555252, 'scaled_hdd': -0.8294079899787903, 'scaled_height': 2.5844602584838867, 'scaled_price': 2.037041664123535, 'scaled_ssd': 1.789116621017456, 'scaled_total': -0.23042507469654083, 'scaled_weight': -0.3268383741378784, 'scaled_width': 0.510871946811676, 'typename_xf': 2}
Test: {'company_xf': 0, 'cpu_brand_xf': 0, 'cpu_family_xf': 2, 'gpu_brand_xf': 2, 'gpu_model_xf': 2, 'has_ips_panel': 0.0, 'has_touchscreen': 0.0, 'inches_bucket': 2.0, 'is_4k': 0.0, 'is_full_hd': 1.0, 'is_quad_hd': 0.0, 'is_retina': 0.0, 'opsys_xf': 0, 'ram_scaled': -0.8892650604248047, 'scaled_cpu_clock_speed_ghz': -0.5925959944725037, 'scaled_flash': 