In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [7]:
from sklearn.impute import SimpleImputer
imputer_full = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
x = imputer_full.fit_transform(x)

# Determine which columns in the original DataFrame are numeric and present in x,
# then check missing values only on those numeric columns to avoid converting strings.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
numeric_indices = [dataset.columns.get_loc(col) for col in numeric_cols if dataset.columns.get_loc(col) < x.shape[1]]

if numeric_indices:
	numeric_array = x[:, numeric_indices].astype(float)
	print("Missing values after full imputation (numeric columns):", np.sum(np.isnan(numeric_array)))
else:
	print("No numeric columns found in x to check for NaNs.")

print(x)

Missing values after full imputation (numeric columns): 0
[[8 'Albania' 'Africa' ... 0 0 'High Cost']
 [8 'Albania' 'Africa' ... 0 0 'High Cost']
 [8 'Albania' 'Africa' ... 0 0 'High Cost']
 ...
 [894 'Zambia' 'Europe' ... 0 0 'High Cost']
 [894 'Zambia' 'Europe' ... 0 0 'High Cost']
 [716 'Zimbabwe' 'Europe' ... 0 0 0]]


In [8]:
print(y)

['Estimated value' 'Estimated value' 'Estimated value' ...
 'Estimated value' 'Estimated value' 'Estimated value']


In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])

In [11]:
print(x)

[[8 'Albania' 'Africa' ... 0 0 'High Cost']
 [8 'Albania' 'Africa' ... 0 0 'High Cost']
 [8 'Albania' 'Africa' ... 0 0 'High Cost']
 ...
 [894 'Zambia' 'Europe' ... 0 0 'High Cost']
 [894 'Zambia' 'Europe' ... 0 0 'High Cost']
 [716 'Zimbabwe' 'Europe' ... 0 0 0]]


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify all categorical columns to be encoded.
# Based on the dataset, only column 0 (Country) is categorical in the current 'x' array.
categorical_features = [0]

# Create the ColumnTransformer to apply OneHotEncoder to all categorical features
# and pass through other numerical features.
# Request dense output from OneHotEncoder to avoid sparse output errors when passthrough includes non-numeric columns.
# Use `sparse_output=False` (scikit-learn >=1.2) instead of deprecated `sparse=False`.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

x = ct.fit_transform(x)

# Ensure result is a numpy array (OneHotEncoder with sparse_output=False already returns dense)
x = x if isinstance(x, np.ndarray) else (x.toarray() if hasattr(x, 'toarray') else np.array(x))

In [17]:
print(x)

[[1.0 0.0 0.0 ... 0 0 'High Cost']
 [1.0 0.0 0.0 ... 0 0 'High Cost']
 [1.0 0.0 0.0 ... 0 0 'High Cost']
 ...
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 0]]


In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [19]:
print(x_train)

[[0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 ...
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']]


In [20]:
print(x_test)

[[0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'Medium Cost']
 [0.0 0.0 0.0 ... 0.47 1.56 'High Cost']
 ...
 [0.0 0.0 0.0 ... 0 0 'Medium Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']]


In [21]:
print(y_train)

['Estimated value' 'Estimated value' 'Estimated value' ...
 'Estimated value' 'Estimated value' 'Estimated value']


In [22]:
print(y_test)

['Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated value' 'Estimated value'
 'Estimated value' 'Estimated value' 'Estimated val

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# detect numeric columns from column index 3 onward and scale only those
cols_to_scale = []
for j in range(3, x_train.shape[1]):
	try:
		_ = x_train[:, j].astype(float)
		cols_to_scale.append(j)
	except Exception:
		continue

if cols_to_scale:
	x_train[:, cols_to_scale] = sc.fit_transform(x_train[:, cols_to_scale].astype(float))
	x_test[:, cols_to_scale] = sc.transform(x_test[:, cols_to_scale].astype(float))
else:
	print("No numeric columns found to scale from column 3 onward.")

In [25]:
print(x)

[[1.0 0.0 0.0 ... 0 0 'High Cost']
 [1.0 0.0 0.0 ... 0 0 'High Cost']
 [1.0 0.0 0.0 ... 0 0 'High Cost']
 ...
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 'High Cost']
 [0.0 0.0 0.0 ... 0 0 0]]


In [26]:
print(y)

['Estimated value' 'Estimated value' 'Estimated value' ...
 'Estimated value' 'Estimated value' 'Estimated value']


In [27]:
print(pd.isnull(x).sum())

0
