In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from util import *

# Obtain the seed from the star-imported `util` helper and echo it, so the
# value used by the split, the resampler and the model in later cells is
# recorded in the notebook output.
rand_state = rand_seed()
print(f"Using random state: {rand_state}")

Using random state: 435


### Import data

In [2]:
# --- Configuration -----------------------------------------------------
# Fraction of rows held out for evaluation in the train/test split.
TEST_SIZE = 0.2
# Target column and the feature columns used to predict it.
y_col = "mdm2"
x_cols = ["age", "gender", "location", "size"]

# --- Load --------------------------------------------------------------
# Request the feature columns plus the target from the project loader.
df = load_with_columns([*x_cols, y_col])

Datapoints: 2706 --> 1657 (1049 removed)


### Transform data

In [3]:
# Keep only rows whose "size" exceeds 10; the filtered frame replaces `df`.
df = df[df["size"] > 10]
print(df.shape)

# Take an explicit copy of the feature columns: `x` is modified below, and
# assigning into a slice of the filtered frame is the pattern that triggers
# pandas' SettingWithCopyWarning.
x = df.loc[:, x_cols].copy()
# Target labels as integers.
y = df[y_col].astype(int)

# Encode gender numerically. Values other than "m"/"f" become NaN under
# Series.map. NOTE(review): confirm the data contains only these two codes.
x["gender"] = x["gender"].map({"m": 0, "f": 1})
# One-hot encode the remaining non-numeric feature columns. Doing this before
# the train/test split keeps both splits on an identical column set.
x = pd.get_dummies(x)

(1501, 5)


### Prepare for fit

In [4]:
# Stratified hold-out split so both parts keep the class proportions of `y`.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=TEST_SIZE,
    random_state=rand_state,
    stratify=y,
)

# Oversample the minority class in the training split only; the test split
# is left untouched. With a float, `sampling_strategy` is the desired
# minority/majority ratio after resampling.
x_train, y_train = SMOTE(
    sampling_strategy=0.5,
    random_state=rand_state,
).fit_resample(x_train, y_train)

# `x` was one-hot encoded before the split, so encoding each split again is
# redundant, and encoding train and test independently is how their columns
# drift apart. Verify the invariant instead of re-encoding.
assert list(x_train.columns) == list(x_test.columns), (
    "train/test feature columns diverged"
)

# "balanced" class weights, keyed by the actual class labels rather than by
# array position, so the mapping stays correct for labels other than 0..k-1.
# NOTE(review): these are computed on the full, pre-resampling `y`; confirm
# that is intended given that the training split has been oversampled.
classes = np.unique(y)
class_weights = compute_class_weight("balanced", classes=classes, y=y)
class_weight_dict = dict(zip(classes.tolist(), class_weights))

### Fit

In [5]:
# Negative/positive ratio used as the weight of class 1.
# `len(y)` is used rather than `y.shape`: the latter is a tuple, and
# subtracting from it only works through NumPy coercion, which yields a
# one-element array instead of a scalar weight.
# NOTE(review): the ratio is taken from the full, pre-resampling `y`, while
# the model is fitted on the SMOTE-resampled training split; confirm that
# combining both imbalance corrections is intended.
n_pos = y.sum()
n_neg = len(y) - n_pos

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=5,
    class_weight={0: 1, 1: n_neg / n_pos},
    random_state=rand_state
)
model.fit(x_train, y_train)

### Assess fit

In [6]:
# Predict on the held-out split and tabulate true vs. predicted labels.
y_pred = model.predict(x_test)
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Hand the matrix to the project's summary helper.
summarise_cnf(cnf)

Accuracy: 0.6245847176079734
Precision: 0.2302158273381295
Sensitivity: 0.8421052631578947
Specificity: 0.5931558935361216
[[156 107]
 [  6  32]]
