In [8]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [9]:
df = pd.read_csv("../data/trialPromoResults.csv")

In [10]:
sex_map = {"M": 0, "F": 1}
mstatus_map = {"single":0, "married":1, "widowed":2, "divorced":3}
occupation_map = {'legal':0, 'IT':1, 'government':2, 'manuf':3, 'retired':4, 
                  'finance':5,'construct':6, 'education':7, 'medicine':8}
education_map = {'postgrad':3, 'secondary':0, 'tertiary':1, 'professional':2}
df["sex"] = df["sex"].map(sex_map)
df["mstatus"] = df["mstatus"].map(mstatus_map)
df["occupation"] = df["occupation"].map(occupation_map)
df["education"] = df["education"].map(education_map)

In [11]:
X = df[list(set(df.columns) - set(["index", "decision"]))]
y = df["decision"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

In [12]:
y.value_counts()

None    828
A       133
B        39
Name: decision, dtype: int64

In [13]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
X_resampled, y_resampled = RandomOverSampler(ratio={"A":150, "B":100}).fit_sample(X_train, y_train)
X_resampled, y_resampled = RandomUnderSampler(ratio={"None":250}).fit_sample(X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[('A', 150), ('B', 100), ('None', 250)]


In [14]:
model = RandomForestClassifier(n_estimators=1000, max_features=None)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

[[ 15   0  11]
 [  0   4   4]
 [ 35   8 123]]
             precision    recall  f1-score   support

          A       0.30      0.58      0.39        26
          B       0.33      0.50      0.40         8
       None       0.89      0.74      0.81       166

avg / total       0.79      0.71      0.74       200

0.71
