# In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedGroupKFold,
    train_test_split,
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

# 1. Data Loading (using subsets)
# All three files are read with the same row limit. Keeping the limit in one
# place stops the frames from ending up with different lengths if the subset
# size is changed later.
# NOTE(review): the files are assumed to be row-aligned with each other
# (row i of each file describes the same record) - confirm.
N_ROWS = 10000
numerical = pd.read_csv('numerical.csv', nrows=N_ROWS)
categorical = pd.read_csv('categorical (1).csv', nrows=N_ROWS)
target = pd.read_csv('target (1).csv', nrows=N_ROWS)

# Encode the categorical variables using one-hot encoding.
# drop='first' omits one indicator column per feature; sparse_output=False
# returns a dense array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = encoder.fit_transform(categorical)
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical.columns),
    # Reuse the source frame's index so the column-wise concat below aligns
    # on the same row labels as the frame the encoding was computed from.
    index=categorical.index,
)

# Merging data: numeric columns and indicator columns side by side.
data = pd.concat([numerical, encoded_categorical_df], axis=1)
X = data
y = target['TARGET_B']

# 2. Addressing Class Imbalance using Upsampling
# Stratify on the target so the train and test partitions keep the same class
# proportions as the full data; an unstratified split of an imbalanced target
# can leave one partition with noticeably fewer positives than the other.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Upsample minority class (training partition only; the test partition is
# left untouched).
X_train_majority = X_train[y_train == 0]
X_train_minority = X_train[y_train == 1]

# Draw minority rows with replacement until there are as many of them as
# there are majority rows.
X_train_minority_upsampled = resample(
    X_train_minority,
    replace=True,
    n_samples=len(X_train_majority),
    random_state=42,
)

# The original row labels are kept on purpose (no reset_index): the matching
# targets are looked up by label on the next line, and repeated labels yield
# repeated targets.
X_train_upsampled = pd.concat([X_train_majority, X_train_minority_upsampled])
y_train_upsampled = y_train.loc[X_train_upsampled.index]

# 3. Model Training with GridSearchCV using F1 Score as the metric
# Fixed seed so repeated runs build the same forests and select the same
# parameters.
clf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
f1_scorer = make_scorer(f1_score)

# The upsampled training frame contains repeated copies of minority rows
# (they were drawn with replacement). With plain k-fold, copies of the same
# row can land in both the fitting and the validation part of a fold, which
# inflates the cross-validated F1. Grouping by the original row label keeps
# every copy of a row inside a single fold.
cv_groups = X_train_upsampled.index.to_numpy()
cv_splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(clf, param_grid, scoring=f1_scorer, cv=cv_splitter)
# `groups` is forwarded to the CV splitter's split() call.
grid_search.fit(X_train_upsampled, y_train_upsampled, groups=cv_groups)

# 4. Model Evaluation
# best_estimator_ is the model refit by GridSearchCV with the best-scoring
# parameter combination.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Score the predictions on the held-out test split with the same metric the
# grid search optimised, and report it next to the selected parameters.
test_f1 = f1_score(y_test, y_pred)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated F1: {grid_search.best_score_:.4f}")
print(f"Test F1: {test_f1:.4f}")


