In [44]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

folder = 'data'
file_data = folder+'/breast-cancer.csv'

# Check if foder exists
if not os.path.exists(folder):
    os.makedirs(folder)
    
# Check if the file exists
if not os.path.exists(file_data):
    # If the file doesn't exist, download it
    ! wget -P ./data https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/pipeline/data/breast-cancer.csv
else:
    print(f"{file_data} already exists.")


data/breast-cancer.csv already exists.


In [45]:
# Load the breast-cancer.csv file
df = pd.read_csv(file_data)

df.head()

Unnamed: 0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
0,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
1,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
2,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
3,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
4,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,no-recurrence-events


In [46]:
# Define column names
column_names = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 
                'deg-malig', 'breast', 'breast-quad', 'irradiat', 'Class']

# Reload the dataset with column names
df = pd.read_csv(file_data, names=column_names)

In [47]:
# Drop the "Class" column
df = df.drop(columns=['Class'])
# Drop NaN values
df = df.dropna()
# Split the data into training and test sets
X_train, X_test = train_test_split(df, test_size=0.2, random_state=43)

In [48]:
unique_values_per_feature = X_train.nunique()

print("Question 1")
unique_values_per_feature

Question 1


age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64

In [49]:
# Define the ordinal, nominal, and target variables based on the assumptions
ordinal_features = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'deg-malig']
nominal_features = ['node-caps', 'breast', 'breast-quad', 'irradiat']
target_variable = 'your_target_variable_name'  # Replace with the actual target variable name
# Select the nominal features for OHE
ohe_cols = ['node-caps', 'breast', 'breast-quad', 'irradiat']

In [50]:
# Sort the test set by index to ensure consistent ordering
X_test_sorted = X_test.sort_index()
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
# Fit the OneHotEncoder on the training data
ohe.fit(X_train[ohe_cols])
# Transform the sorted test set using the One Hot Encoder fitted earlier on the training set
ohe_test_data_sorted = ohe.transform(X_test_sorted[ohe_cols])
# Transform the test set
ohe_test_transformed = ohe.transform(X_test[ohe_cols])
encoded_feature_names = ohe.get_feature_names_out(ohe_cols)

# Print the transformed test set and feature names
print("input: ohe.transform(X_test[ohe_cols])[:10]")
print("output:\n", ohe_test_data_sorted[:10])
print("\ninput: ohe.get_feature_names(ohe_cols)")
print("output:\n", encoded_feature_names)

input: ohe.transform(X_test[ohe_cols])[:10]
output:
 [[1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]]

input: ohe.get_feature_names(ohe_cols)
output:
 ['node-caps_no' 'node-caps_yes' 'breast_left' 'breast_right'
 'breast-quad_central' 'breast-quad_left_low' 'breast-quad_left_up'
 'breast-quad_right_low' 'breast-quad_right_up' 'irradiat_no'
 'irradiat_yes']


In [51]:
print("Question 2")
print(ohe_test_data_sorted[:10])

Question 2
[[1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]]


In [None]:
# Define columns for ordinal encoding and their orderings
ordinal_cols = ["menopause", "age", "tumor-size", "inv-nodes", "deg-malig"]
ordinal_categories = [
    ['lt40', 'premeno', 'ge40'],
    ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99'],
    ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59'],
    ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23', '24-26', '27-29', '30-32', '33-35', '36-39'],
    [1, 2, 3]
]

# Initialize OrdinalEncoder with the defined categories
oe = OrdinalEncoder(categories=ordinal_categories)

# Fit the OrdinalEncoder on the training data
oe.fit(X_train[ordinal_cols])

# Transform the test set
oe_test_transformed = oe.transform(X_test[ordinal_cols])

In [None]:
print("Question 3")
# Display the first 10 rows of the transformed test set
oe_test_transformed[:10]

In [None]:
# Create a column transformer that combines the One Hot and Ordinal encoders
column_transformer = make_column_transformer(
    (ohe, ohe_cols),
    (oe, ordinal_cols),
    remainder='passthrough'
)

# Fit the column transformer on the training data
column_transformer.fit(X_train)

# Transform the test data
transformed_test_data = column_transformer.transform(X_test)

In [None]:
print("Question 4")
# Show the first 2 rows of the transformed test data
transformed_test_data[:2]