In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Load the dataset straight from the remote CSV.
df = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv"
)

# Partition the columns by dtype. Columns whose dtype is neither int64/float64
# nor object (e.g. bool) land in neither list.
numeric_features = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorial_feature = list(df.select_dtypes(include=['object']).columns)

# Numeric columns: fill gaps with the column median, then standardise.
numerical_transfomers = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorial_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline. The categorical block is
# declared first, followed by the numeric block.
preprocessing = ColumnTransformer([
    ('cat', categorial_transformer, categorial_feature),
    ('num', numerical_transfomers, numeric_features),
])
# Fit the preprocessing transformer on the full frame and transform it.
X_transformed = preprocessing.fit_transform(df)

# Recover the expanded one-hot column names from the fitted encoder.
cat_feature_names = preprocessing.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorial_feature)

# ColumnTransformer stacks its outputs in the order the transformers were
# declared: 'cat' first, then 'num'. The labels must follow that same order,
# otherwise every column of X_df is mislabelled.
all_feature_names = list(cat_feature_names) + numeric_features

# Convert to DataFrame.
# The transformer returns a sparse matrix only when the stacked output is sparse
# enough; otherwise it is already a dense ndarray, which has no .toarray().
X_dense = X_transformed.toarray() if hasattr(X_transformed, "toarray") else X_transformed
X_df = pd.DataFrame(X_dense, columns=all_feature_names)

# Sanity check: total number of missing values left after imputation.
print(X_df.isnull().sum().sum())





0
