In [1363]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import zscore

In [1364]:
# Read the training frame (it carries the target column `y`) and the test
# frame that is scored at the end of the notebook. Both paths are relative to
# the notebook's working directory.
train_csv_path, test_csv_path = './final_proj_data.csv', './final_proj_test.csv'
data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [1365]:
# Structural overview of the training frame.
# `DataFrame.info()` writes its report directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
# Summary statistics from describe() with its default arguments.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 231 entries, Var1 to y
dtypes: float64(191), int64(2), object(38)
memory usage: 17.6+ MB
None
             Var1   Var2          Var3        Var4          Var5  \
count  133.000000  266.0    266.000000  280.000000  2.410000e+02   
mean    14.977444    0.0    341.052632    0.096429  2.338101e+05   
std     66.456008    0.0   2810.606975    0.928243  5.532305e+05   
min      0.000000    0.0      0.000000    0.000000  0.000000e+00   
25%      0.000000    0.0      0.000000    0.000000  0.000000e+00   
50%      0.000000    0.0      0.000000    0.000000  0.000000e+00   
75%     16.000000    0.0      0.000000    0.000000  1.172350e+05   
max    680.000000    0.0  42588.000000    9.000000  3.024000e+06   

               Var6         Var7  Var8         Var9         Var10  ...  \
count   8980.000000  8995.000000   0.0   133.000000  2.410000e+02  ...   
mean    1340.916258     6.860700   NaN    61.383459  3.672943

In [1366]:
# Count the missing entries in every column of the training frame.
missing_per_column = data.isna().sum()
print(missing_per_column)

Var1       9867
Var2       9734
Var3       9734
Var4       9720
Var5       9759
          ...  
Var227        0
Var228        0
Var229     5561
Var230    10000
y             0
Length: 231, dtype: int64


In [1367]:
# Discard every column that is missing in more than 30% of the training rows.
# `cols_to_drop` stays available because the test frame has to lose the same
# columns further down.
n_rows = data.shape[0]
col_threshold = int(n_rows * 0.3)
null_counts = data.isnull().sum()
cols_to_drop = data.columns[null_counts > col_threshold]
data = data.drop(columns=cols_to_drop)

In [1368]:
# Remove object/category columns whose number of distinct values exceeds 30%
# of the row count.
unique_threshold = int(data.shape[0] * 0.3)
categorical_cardinality = data.select_dtypes(include=['object', 'category']).nunique()
cols_with_many_uniques = categorical_cardinality[
    categorical_cardinality > unique_threshold
].index.tolist()
print(f"Dropping columns with too many unique values: {cols_with_many_uniques}")
data = data.drop(columns=cols_with_many_uniques)

Dropping columns with too many unique values: ['Var202', 'Var217']


In [1369]:
# Mirror on the test frame every column removal applied to the training frame:
# the sparse columns in `cols_to_drop` and the high-cardinality categorical
# columns in `cols_with_many_uniques`. Dropping only the former left X_test
# with columns that the training frame no longer has.
X_test = test_data.drop(columns=list(cols_to_drop) + list(cols_with_many_uniques))

In [1370]:
# Fill gaps in the float64/int64 predictors (target `y` excluded) with the
# per-column median. The medians are learned on the training frame only and
# then reused unchanged for the test frame.
num_cols = data.drop(columns='y').select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='median')

num_imputer.fit(data[num_cols])
data[num_cols] = num_imputer.transform(data[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

In [1371]:
# Fill gaps in the object/category predictors (target `y` excluded) with each
# column's most frequent value. The fill values are learned on the training
# frame only and then reused unchanged for the test frame.
cat_cols = data.drop(columns='y').select_dtypes(include=['object', 'category']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')

cat_imputer.fit(data[cat_cols])
data[cat_cols] = cat_imputer.transform(data[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

In [1372]:
# Split the remaining columns of the training frame into categorical and
# numerical feature-name lists, selected by dtype.
object_like = data.select_dtypes(include=['object', 'category'])
numeric_like = data.select_dtypes(include=['float64', 'int64'])

categorical_features = list(object_like.columns)
numerical_features = list(numeric_like.columns)
# The target is selected together with the numeric columns; take it back out
# so the list only names predictors.
numerical_features.remove('y')

In [1373]:
# Replace each categorical column with a target-based numeric encoding.
# NOTE(review): the encoder is fitted on every row of `data`, i.e. before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the encoding. Consider fitting it inside the pipeline instead.
target_encoder = TargetEncoder(cols=categorical_features)
data[categorical_features] = target_encoder.fit_transform(data[categorical_features], data['y'])
# Encode X_test, the cleaned and imputed copy of the test set that is later
# passed to `pipeline.predict`. The original code encoded `test_data`, which
# still holds the raw, un-imputed values and is never used as model input, so
# the frame that is actually scored was left with raw categorical strings.
X_test[categorical_features] = target_encoder.transform(X_test[categorical_features])

In [1374]:
# Drop rows in which any numerical feature lies 3 or more standard deviations
# from its column mean.
z_scores = data[numerical_features].apply(zscore)
# A zero-variance column yields NaN z-scores. With the previous test
# `np.abs(z_scores) < 3`, NaN compared as False and therefore flagged *every*
# row as an outlier, silently emptying the frame. Negating the ">= 3" test
# keeps NaN entries (NaN >= 3 is False) and gives the same result as before
# for all finite z-scores.
within_limits = ~(np.abs(z_scores) >= 3)
data = data[within_limits.all(axis=1)]
print(f"Dataset size after outlier removal: {data.shape}")

Dataset size after outlier removal: (7303, 66)


In [1375]:
# Separate predictors from the target, then hold out 20% of the rows for
# validation. The split is stratified on the target and seeded.
target_column = 'y'
y = data[target_column]
X = data.drop(columns=[target_column])

split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

In [1376]:
# Rank the numerical features with a random forest fitted on the training
# split and keep those whose importance is above 0.01.
model_for_importance = RandomForestClassifier(random_state=42).fit(
    X_train[numerical_features], y_train
)
feature_importances = pd.Series(
    model_for_importance.feature_importances_,
    index=numerical_features,
)
important_features = [
    feature for feature, importance in feature_importances.items() if importance > 0.01
]
print(f"Selected important features: {important_features}")
# From here on `numerical_features` names only the selected subset.
numerical_features = important_features

Selected important features: ['Var6', 'Var13', 'Var21', 'Var22', 'Var24', 'Var25', 'Var28', 'Var38', 'Var57', 'Var73', 'Var74', 'Var76', 'Var81', 'Var83', 'Var85', 'Var109', 'Var112', 'Var113', 'Var119', 'Var123', 'Var125', 'Var126', 'Var133', 'Var134', 'Var140', 'Var144', 'Var149', 'Var153', 'Var160', 'Var163']


In [1377]:
# Standardise the selected numerical features. No `remainder` argument is
# given, so ColumnTransformer's default applies to every other column
# (scikit-learn documents that default as 'drop').
numeric_step = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step])

In [1378]:
# Seeded SMOTE resampler; it becomes the 'oversampler' step of the pipeline
# assembled below.
oversampler = SMOTE(random_state=42)

In [1379]:
# Base learners shared by both ensembles below.
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
base_estimators = [('rf', rf_model), ('gb', gb_model)]

# Stacked ensemble: a random forest combines the base learners' outputs.
# Each ensemble receives its own copy of the list, as in the original code.
stacking_model = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=RandomForestClassifier(random_state=42),
)

# Soft-voting ensemble over the same two base learners.
voting_model = VotingClassifier(estimators=list(base_estimators), voting='soft')

In [1380]:
# End-to-end model: scale the selected features, oversample, then classify
# with the soft-voting ensemble.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('classifier', voting_model),
]
pipeline = ImbPipeline(steps=pipeline_steps)

In [1381]:
# Hyper-parameter candidates for the random forest inside the voting
# classifier. The keys are prefixed so they address the 'rf' estimator of the
# pipeline's 'classifier' step.
rf_search_space = {
    'n_estimators': [100, 200],
    'max_depth': [10, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', None],
}
param_grid = {
    f'classifier__rf__{name}': candidates
    for name, candidates in rf_search_space.items()
}

In [1382]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by
# balanced accuracy and run on all available cores.
search_settings = dict(
    param_grid=param_grid,
    cv=3,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search = GridSearchCV(pipeline, **search_settings)
grid_search.fit(X_train, y_train)

In [1383]:
# Report the winning hyper-parameters. The search object is a GridSearchCV;
# the message previously named RandomizedSearchCV, which is not used here.
print(f"Best parameters from GridSearchCV: {grid_search.best_params_}")
# Copy the winning hyper-parameters onto the standalone pipeline so the fits
# that follow use them.
pipeline.set_params(**grid_search.best_params_)

Best parameters from RandomizedSearchCV: {'classifier__rf__max_depth': 10, 'classifier__rf__max_features': None, 'classifier__rf__min_samples_leaf': 1, 'classifier__rf__min_samples_split': 2, 'classifier__rf__n_estimators': 200}


In [1384]:
# Fit the tuned pipeline on the training split and predict the validation rows.
fitted_pipeline = pipeline.fit(X_train, y_train)
y_pred = fitted_pipeline.predict(X_val)

In [1385]:
# Validation metrics: balanced accuracy plus the per-class report.
val_balanced_accuracy = balanced_accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
print("Balanced Accuracy:", val_balanced_accuracy)
print(val_report)

Balanced Accuracy: 0.8409392964713469
              precision    recall  f1-score   support

           0       0.97      0.82      0.89      1259
           1       0.43      0.86      0.58       202

    accuracy                           0.83      1461
   macro avg       0.70      0.84      0.73      1461
weighted avg       0.90      0.83      0.85      1461



In [1386]:
# Refit the tuned pipeline on all remaining rows (training + validation
# splits) before scoring the test set.
pipeline.fit(X, y)

In [1387]:
# Score the cleaned test frame and store the predicted labels on `test_data`.
test_predictions = pipeline.predict(X_test)
test_data['y'] = test_predictions

In [1388]:
# Write only the predicted label column; the frame's index is exported under
# the header 'index'.
submission = test_data[['y']]
submission.to_csv('./submission.csv', index_label='index')

print("Predictions saved to 'submission.csv'")

Predictions saved to 'submission.csv'
