## Sample notebook for testing the preprocessing logic

In [None]:
!pip install -U awscli boto3 sagemaker scikit-learn xgboost pandas --quiet

In [None]:
!pip install -U watermark rich --quiet
%load_ext watermark
%load_ext rich

In [None]:
# Report the installed versions of the listed packages so runs can be compared
# later (scikit-learn is listed under its import name, `sklearn`).
%watermark -p awscli,boto3,sagemaker,sklearn,xgboost,pandas

In [None]:
import numpy as np
import pandas as pd
import io
from io import StringIO
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# import flask
# from flask import Flask
import os
import sys

# np.set_printoptions(threshold=sys.maxsize)
# np.set_printoptions(precision=6)

# The input CSV carries no header row, so the schema is declared by hand.
# Feature columns, in the order they appear in the file:
feature_columns_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
]
# Name of the target column.
label_column = "rings"

# Column name -> dtype, in the same order as ``feature_columns_names``:
# "sex" is read as a string, every other feature as a 64-bit float.
feature_columns_dtype = {
    name: (str if name == "sex" else np.float64) for name in feature_columns_names
}
# The label is read as a 64-bit float as well.
label_column_dtype = {label_column: np.float64}


def merge_two_dicts(x, y):
    """Return a shallow copy of ``x`` updated with the entries of ``y``.

    Neither argument is modified. When a key is present in both mappings,
    the value from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


# Full schema for the raw file: every feature column followed by the label.
column_names = [*feature_columns_names, label_column]
column_dtypes = merge_two_dicts(feature_columns_dtype, label_column_dtype)

# Load the headerless training CSV, applying names and dtypes at read time.
df1 = pd.read_csv(
    "./data//abalone_train_raw.csv",
    names=column_names,
    dtype=column_dtypes,
    header=None,
)

# Preview the first two rows.
df1.head(2)

In [None]:
# Pull one row out of the full dataset so the preprocessor can be exercised
# on a single-record input. The position is arbitrary.
SAMPLE_ROW_POSITION = 10

# Indexing with a list (`iloc[[i]]`) returns a one-row DataFrame that keeps
# each column's dtype. Going through `iloc[i].to_frame().T` instead builds a
# mixed-type Series first, which turns every column into `object`.
df = df1.iloc[[SAMPLE_ROW_POSITION]]

df

In [None]:
print("Received DF", flush=True)

# `df1` already carries the column names assigned when the CSV was read, so
# no renaming is needed before building the pipeline.

# --- Column groups ---------------------------------------------------------
categorical_features = ["sex"]
numeric_features = [name for name in feature_columns_names if name != "sex"]

# --- Per-group transformers ------------------------------------------------
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical column: fill gaps with the literal "missing", then one-hot
# encode; categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# --- Combined preprocessor ---------------------------------------------------
# Output columns follow the order given here: numeric block, then one-hot block.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit on the full frame and show the transformed matrix.
print("Running transform on entire dataset df1")
data = preprocess.fit_transform(df1)
print(data)

# Re-use the fitted preprocessor on the one-row frame `df`.
single_row = preprocess.transform(df)
print("--" * 25)
print(single_row)

In [None]:
import numpy as np
import csv
import io

# One record of ten float values; wrapping it in an outer list makes the
# resulting array 2-D (1 row x 10 columns).
sample_row = [
    -1.3317586042173168,
    -1.1425409076053987,
    -1.0579488602777858,
    -1.177706547272754,
    -1.130662184748842,
    -1.1493955859050584,
    -1.139968767909096,
    0.0,
    1.0,
    0.0,
]
array = np.array([sample_row])

# In-memory text buffer that will receive the CSV rendering of the array.
csv_buffer = io.StringIO()
csv_writer = csv.writer(csv_buffer)

# Echo each row for inspection ...
for row in array:
    print(row)

# ... then serialise all rows, one CSV line per array row.
csv_writer.writerows(array)

# Rewind so that a subsequent read() starts at the first character;
# `csv_buffer` can now be handed around as a file-like CSV source.
csv_buffer.seek(0)

In [None]:
# Show the CSV text held in the buffer. getvalue() returns the entire
# contents regardless of the current stream position.
print(csv_buffer.getvalue())

In [None]:
import xgboost as xgb

In [None]:
# One record of ten float values, shaped as a 1 x 10 array.
# NOTE(review): looks like a row of the preprocessor's output — confirm.
feature_row = [
    -1.3317586042173168,
    -1.1425409076053987,
    -1.0579488602777858,
    -1.177706547272754,
    -1.130662184748842,
    -1.1493955859050584,
    -1.139968767909096,
    0.0,
    1.0,
    0.0,
]
array = np.array([feature_row])

# Wrap the array in an XGBoost DMatrix and display the resulting object.
data = xgb.DMatrix(data=array)
data

In [None]:
import numpy as np
import csv
import io

# A single comma-separated record of ten float values (no header line).
csv_string = "0.5057643044112818,0.5739700038488806,0.12587805776591277,0.3165182107971498,0.32207651250285446,0.3086720815446205,0.32454880073267034,1.0,0.0,0.0"

# Expose the string through a file-like object so csv.reader can consume it.
csv_buffer = io.StringIO(csv_string)
csv_reader = csv.reader(csv_buffer)

# Parse every field of every CSV line as a float: one inner list per line.
data = [[float(field) for field in row] for row in csv_reader]

# Stack the parsed rows into a 2-D NumPy array and show it.
array = np.array(data)
print(array)