In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the raw table; the 'id' column is not used as a feature, so discard it.
df = pd.read_csv("categorical.csv").drop(columns=['id'])

# Imputers consumed by the preprocessing pipeline defined further down:
# most frequent value for categorical columns, median for numeric ones.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')

# Pull the label out as y and keep every remaining column as the feature matrix X.
y = df['target']
X = df.drop(columns=['target'])

# Partition the feature columns by dtype.
# NOTE(review): only int64/float64 and object columns are picked up; a column of
# any other dtype lands in neither list and is dropped by ColumnTransformer's
# default remainder='drop'.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Encoders for categorical columns. The ordinal encoder maps categories that
# were not seen during fit to -1 instead of raising.
# one_hot_enc is constructed here but is not wired into the preprocessor below.
one_hot_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Standardiser applied to the numeric columns after imputation.
scaler = StandardScaler()

# Per-type sub-pipelines: impute first, then scale (numeric) or encode (categorical).
numeric_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', scaler),
])
categorical_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', ord_enc),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('ord', categorical_pipeline, cat_cols),
])

# Train-test split (80% train / 20% test, fixed seed for reproducibility).
# stratify=y keeps the class proportions of the target identical in both
# partitions. The target is imbalanced (roughly 4:1 in the recorded
# classification report), so an unstratified split lets the minority-class
# share drift between the training and evaluation sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline model: the shared preprocessing step followed by a random forest
# with default hyperparameters (seeded so repeated runs give the same forest).
baseline_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit on the training partition only; all preprocessing statistics (medians,
# modes, scaling, category codes) are therefore learned from training rows.
baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)

# Accuracy on its own is dominated by the majority class, so the per-class
# precision/recall report is printed alongside it.
print("Baseline Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Hyperparameter search space for the random forest step of the pipeline
# (3 x 3 x 3 x 3 = 81 combinations). Keys use the '<step>__<param>' convention
# so the search can reach into the 'classifier' step.
# NOTE(review): the output recorded under this cell reports max_depth=25 and
# n_estimators=250, neither of which is in this grid -- the stored output looks
# stale and should be regenerated.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Settings shared by both searches: 3-fold CV, accuracy scoring, all cores.
search_settings = dict(cv=3, scoring='accuracy', n_jobs=-1)

# Exhaustive search: every combination in the grid is cross-validated.
grid_search = GridSearchCV(baseline_model, param_grid, **search_settings)
grid_search.fit(X_train, y_train)
print("Best Parameters (Grid Search):", grid_search.best_params_)
print("Best Accuracy (Grid Search):", grid_search.best_score_)

# Randomized search: 10 combinations sampled from the same grid (seeded).
random_search = RandomizedSearchCV(
    baseline_model,
    param_grid,
    n_iter=10,
    random_state=42,
    **search_settings,
)
random_search.fit(X_train, y_train)
print("Best Parameters (Random Search):", random_search.best_params_)
print("Best Accuracy (Random Search):", random_search.best_score_)

Baseline Accuracy: 0.8168


              precision    recall  f1-score   support

           0       0.82      0.99      0.90     97640
           1       0.60      0.05      0.09     22360

    accuracy                           0.82    120000
   macro avg       0.71      0.52      0.50    120000
weighted avg       0.78      0.82      0.75    120000

Best Parameters (Grid Search): {'classifier__max_depth': 25, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 300}
Best Accuracy (Grid Search): 0.8294
Best Parameters (Random Search): {'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 250}
Best Accuracy (Random Search): 0.8241


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv("diamonds.csv")

# Drop ID column (not a feature)
df = df.drop(columns=['id'])

# Imputers used later by the preprocessing pipeline:
# median for numeric columns, most frequent value for categorical ones.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fix invalid values in x, y, z: a dimension of exactly 0 is replaced with the
# median of that column's strictly positive values.
for col in ['x', 'y', 'z']:
    median_value = df.loc[df[col] > 0, col].median()  # Median of non-zero values
    df[col] = df[col].replace(0, median_value)

# Derived columns. 'volume' is built from features only and is safe to model on.
# 'price_per_carat' is computed FROM THE TARGET; it is kept on df for
# inspection but must never be fed to the model (see the split below).
df['volume'] = df['x'] * df['y'] * df['z']
df['price_per_carat'] = df['price'] / df['carat']

# Drop the raw dimensions now that 'volume' summarises them
# (avoids near-duplicate, highly correlated inputs).
df = df.drop(columns=['x', 'y', 'z'])

# Separate features and target.
# Target-leakage fix: price_per_carat * carat reproduces price exactly, and
# carat is a feature, so leaving price_per_carat in X lets the model recover
# the target directly and makes every reported error meaninglessly small.
# Exclude every target-derived column from the feature matrix.
target_derived_cols = ['price_per_carat']
X = df.drop(columns=['price'] + target_derived_cols)
y = df['price']

# Partition the feature columns by dtype.
# NOTE(review): only int64/float64 and object columns are picked up; a column of
# any other dtype lands in neither list and is dropped by ColumnTransformer's
# default remainder='drop'.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Encoders for categorical columns. The ordinal encoder maps categories that
# were not seen during fit to -1 instead of raising.
# one_hot_enc is constructed here but is not wired into the preprocessor below.
one_hot_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Standardiser applied to the numeric columns after imputation.
scaler = StandardScaler()

# Per-type sub-pipelines: impute first, then scale (numeric) or encode (categorical).
numeric_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', scaler),
])
categorical_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', ord_enc),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('ord', categorical_pipeline, cat_cols),
])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: the shared preprocessing step followed by a random forest regressor
# with default hyperparameters (seeded so repeated runs give the same forest).
baseline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Fit on the training partition, then predict the held-out rows.
baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)

# Error metrics on the held-out rows; RMSE is the square root of MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ("Baseline MAE:", mae),
    ("Baseline MSE:", mse),
    ("Baseline RMSE:", rmse),
):
    print(label, value)

# Hyperparameter search space for the random forest step of the pipeline
# (3 x 3 x 3 x 3 = 81 combinations). Keys use the '<step>__<param>' convention
# so the search can reach into the 'regressor' step.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Settings shared by both searches: 3-fold CV, negated-MAE scoring, all cores.
search_settings = dict(cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

# Exhaustive search: every combination in the grid is cross-validated.
grid_search = GridSearchCV(baseline_model, param_grid, **search_settings)
grid_search.fit(X_train, y_train)
print("Best Parameters (Grid Search):", grid_search.best_params_)
# best_score_ is the negated MAE (the scorer is 'neg_mean_absolute_error'),
# so flip the sign to report a positive error.
print("Best MAE (Grid Search):", -grid_search.best_score_)

# Randomized search: 10 combinations sampled from the same grid (seeded).
random_search = RandomizedSearchCV(
    baseline_model,
    param_grid,
    n_iter=10,
    random_state=42,
    **search_settings,
)
random_search.fit(X_train, y_train)
print("Best Parameters (Random Search):", random_search.best_params_)
print("Best MAE (Random Search):", -random_search.best_score_)

Baseline MAE: 5.176612682422838
Baseline MSE: 930.8549032106416
Baseline RMSE: 30.50991483453603

Best Parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Best MAE (Grid Search): 3.892

Best Parameters (Random Search): {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 300}
Best MAE (Random Search): 3.745
