In [461]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import zscore

In [462]:
data = pd.read_csv('./final_proj_data.csv')
test_data = pd.read_csv('./final_proj_test.csv')

In [463]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 231 entries, Var1 to y
dtypes: float64(191), int64(2), object(38)
memory usage: 17.6+ MB
None
             Var1   Var2          Var3        Var4          Var5  \
count  133.000000  266.0    266.000000  280.000000  2.410000e+02   
mean    14.977444    0.0    341.052632    0.096429  2.338101e+05   
std     66.456008    0.0   2810.606975    0.928243  5.532305e+05   
min      0.000000    0.0      0.000000    0.000000  0.000000e+00   
25%      0.000000    0.0      0.000000    0.000000  0.000000e+00   
50%      0.000000    0.0      0.000000    0.000000  0.000000e+00   
75%     16.000000    0.0      0.000000    0.000000  1.172350e+05   
max    680.000000    0.0  42588.000000    9.000000  3.024000e+06   

               Var6         Var7  Var8         Var9         Var10  ...  \
count   8980.000000  8995.000000   0.0   133.000000  2.410000e+02  ...   
mean    1340.916258     6.860700   NaN    61.383459  3.672943

In [464]:
print(data.isnull().sum())

Var1       9867
Var2       9734
Var3       9734
Var4       9720
Var5       9759
          ...  
Var227        0
Var228        0
Var229     5561
Var230    10000
y             0
Length: 231, dtype: int64


In [465]:
col_threshold = int(data.shape[0] * 0.3)
cols_to_drop = data.columns[data.isnull().sum() > col_threshold]
data.drop(columns=cols_to_drop, inplace=True)

In [466]:
# row_threshold = int(data.shape[1] * 0.8)
# row_threshold
# data.dropna(thresh=row_threshold, inplace=True)


In [467]:
num_imputer = SimpleImputer(strategy='median')
for col in data.select_dtypes(include=['float64', 'int64']):
    data[col] = num_imputer.fit_transform(data[[col]])

In [468]:
cat_imputer = SimpleImputer(strategy='most_frequent')
for col in data.select_dtypes(include=['object', 'category']):
    data[col] = cat_imputer.fit_transform(data[[col]]).ravel()

In [469]:
categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_features.remove('y')

In [470]:
target_encoder = TargetEncoder(cols=categorical_features)
data[categorical_features] = target_encoder.fit_transform(data[categorical_features], data['y'])

In [471]:
z_scores = data[numerical_features].apply(zscore)
data = data[(np.abs(z_scores) < 3).all(axis=1)]
print(f"Dataset size after outlier removal: {data.shape}")

Dataset size after outlier removal: (7303, 68)


In [472]:
model_for_importance = RandomForestClassifier(random_state=42)
model_for_importance.fit(data[numerical_features], data['y'])
feature_importances = pd.Series(model_for_importance.feature_importances_, index=numerical_features)
important_features = feature_importances[feature_importances > 0.01].index.tolist()
print(f"Selected important features: {important_features}")
numerical_features = important_features

Selected important features: ['Var6', 'Var13', 'Var21', 'Var22', 'Var24', 'Var25', 'Var28', 'Var38', 'Var57', 'Var73', 'Var74', 'Var76', 'Var81', 'Var83', 'Var85', 'Var109', 'Var112', 'Var113', 'Var119', 'Var123', 'Var125', 'Var126', 'Var133', 'Var134', 'Var140', 'Var144', 'Var149', 'Var153', 'Var160', 'Var163']


In [473]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

In [474]:
oversampler = SMOTE(random_state=42)

In [475]:
model = RandomForestClassifier(random_state=42)

In [476]:
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('classifier', model)
])

In [477]:
X = data.drop(columns=['y'])
y = data['y']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [478]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

In [479]:
print("Balanced Accuracy:", balanced_accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Balanced Accuracy: 0.7452441431593517
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93      1259
         1.0       0.55      0.56      0.56       202

    accuracy                           0.88      1461
   macro avg       0.74      0.75      0.74      1461
weighted avg       0.88      0.88      0.88      1461



In [480]:
X_test = test_data
test_data['y'] = pipeline.predict(X_test)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
test_data[['index', 'y']].to_csv('/mnt/data/final_predictions.csv', index=False)

print("Predictions saved to 'final_predictions.csv'")