In [291]:
import joblib

In [292]:
import numpy as np

# Load data

In [293]:
# Training split: feature matrix and label vector, each parsed from a comma-separated file.
X_train, y_train = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_train.csv", "Data/y_train.csv")
)

In [294]:
# Cross-validation split: feature matrix and label vector, each parsed from a comma-separated file.
X_cv, y_cv = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_cv.csv", "Data/y_cv.csv")
)

In [295]:
# Test split: feature matrix and label vector, each parsed from a comma-separated file.
X_test, y_test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_test.csv", "Data/y_test.csv")
)

# Logistic Regression

In [296]:
from sklearn.linear_model import LogisticRegression

In [297]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

In [298]:
# Candidate values for LogisticRegression's C (inverse regularization strength:
# smaller C means stronger regularization), swept in the loop below.
C_list = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 3.0, 5.0, 10.0, 100]

In [299]:
# Sweep the regularization strength: fit one class-balanced logistic regression per
# value in C_list and record its F1 on the training and cross-validation splits.
# The three lists stay index-aligned with C_list.
models, train_f1_score, cv_f1_score = [], [], []

for C in C_list:
    log_reg = LogisticRegression(
        C=C,
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)  # fit() returns the estimator itself
    models.append(log_reg)

    y_train_pred, y_cv_pred = log_reg.predict(X_train), log_reg.predict(X_cv)
    train_f1_score.append(f1_score(y_train, y_train_pred))
    cv_f1_score.append(f1_score(y_cv, y_cv_pred))

In [300]:
# Training-set F1 for each candidate, in the same order as C_list.
print(train_f1_score)

[0.6060179977502813, 0.6328426468486494, 0.6323885486397949, 0.6320741268709906, 0.6323445521962351, 0.6323445521962351, 0.6323445521962351, 0.6323445521962351, 0.6323445521962351]


In [301]:
# Cross-validation F1 for each candidate, in the same order as C_list.
print(cv_f1_score)

[0.5688967656180771, 0.5926589077887198, 0.5948121645796064, 0.5929558626839054, 0.5920641997325011, 0.5911725367810967, 0.5911725367810967, 0.5911725367810967, 0.5911725367810967]


In [302]:
# Best cross-validation F1 reached anywhere in the sweep.
cv_f1_score[np.argmax(cv_f1_score)]

0.5948121645796064

In [303]:
# Test-set F1 of every fitted model, printed in the order the models were fitted.
# NOTE(review): informational only — the model is picked from the CV scores; choosing
# by these numbers would leak the test set into model selection.
for model in models:
    y_test_pred = model.predict(X_test)
    print(f1_score(y_test, y_test_pred))

0.5938037865748709
0.6155172413793103
0.6101254868022501
0.6104626026805015
0.6104626026805015
0.6104626026805015
0.6104626026805015
0.6104626026805015
0.6104626026805015


In [304]:
# Keep the model whose cross-validation F1 is highest (np.argmax picks the first on ties).
best_model = models[np.argmax(cv_f1_score)]

In [305]:
# Show the hyperparameters of the selected model.
best_model

LogisticRegression(C=0.01, class_weight='balanced', max_iter=1000,
                   random_state=42)

In [306]:
# Fitted weights of the selected model: a single row with one coefficient per input feature.
best_model.coef_

array([[-3.52990989e-01, -6.70458535e-02, -7.99212582e-02,
         1.57633218e-01,  3.50524215e-03, -1.50274944e-01,
        -4.72939663e-03,  1.28751622e-01, -1.96234491e-04,
         2.61651573e-02,  3.45196861e-02,  3.20104431e-01,
        -3.56346549e-02,  1.80178731e-02, -2.18860096e-02,
         1.34066492e-02, -5.08135459e-03,  1.89840182e-02,
         1.83356748e-02, -5.72910332e-03, -4.79713611e-02,
         6.27608039e-03, -3.51056234e-02, -6.70423391e-02,
         1.76926305e-02,  5.09171766e-02,  7.68452591e-02,
        -6.46848254e-03, -9.75514991e-03,  9.75514991e-03,
         1.70743348e-01, -1.70743348e-01, -1.87934492e-01,
         1.87934492e-01, -9.46685999e-02, -3.80063104e-02,
         1.48879919e-02,  4.28735565e-02, -8.39846854e-02,
         6.93374645e-02, -6.12777441e-02, -7.23834552e-02,
        -2.80841720e-02, -1.74041842e-03, -1.98861696e-02,
         1.37275099e-01, -5.29573148e-02, -1.13619159e-01,
         3.78092937e-02, -3.09922900e-02,  2.33951677e-0

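`city_development_index` appears to be the most important factor, judging by the size of the fitted coefficients. Note, however, that the largest absolute coefficient (≈0.44) belongs to feature index 95, while feature index 0 carries ≈-0.35 — confirm which index corresponds to `city_development_index` before relying on this conclusion.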

In [326]:
# Feature indices sorted by ascending |coefficient|; the last index has the largest magnitude.
print(np.argsort(np.abs(best_model.coef_)))

[[ 62  65   8 118  43  52  50 135  60  83 150 132 103   4 146 113   6  16
   82  19  54  85 140  21  27 138 137 143  78  92  88  84 115  72 112  66
   28  29 114 129  51 153 149  98 109  15 117  97 148  36 144  77 121  69
   24  68  13  18  99  17  89  55  71  44  91 100 152 130  14  56 119  87
  110 111   9 106 105 145  42 102  75 101  74 108  49  57  73  10  22  12
  142  48  35 122  67 147  58  93  90 104  86  37 131 120  20  25  96  76
  128  46  59 116  61  70  53  79 124 125 123  94  40 139 141 151  23   1
   81  39 133  41 134 126  26 107   2  38 127  34  63  64  47 136   7  45
   80   5   3  31  30  32  33  11   0  95]]


In [324]:
# Index of the largest-magnitude coefficient in each row of coef_.
np.abs(best_model.coef_).argmax(axis=1)

array([95], dtype=int32)

In [308]:
# Largest coefficient magnitude in each row of coef_.
np.abs(best_model.coef_).max(axis=1)

array([0.43924828])

In [309]:
# Test-set F1 of the selected logistic-regression model.
y_test_pred = best_model.predict(X_test)
print(f1_score(y_test, y_test_pred))

0.6101254868022501


In [310]:
# Serialize the selected logistic-regression model to disk with joblib.
# NOTE(review): presumably the Models/ directory must already exist — confirm.
joblib.dump(best_model, 'Models/best_model.pkl')

['Models/best_model.pkl']

# SGDClassifier

In [311]:
from sklearn.linear_model import SGDClassifier

In [312]:
# Hyperparameter grid for the SGDClassifier search below:
#   eta0  - learning rate (held constant during training, see learning_rate='constant')
#   alpha - multiplier of the regularization term
eta0_list = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 3.0, 5.0, 10.0]
alpha_list = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 3.0, 5.0, 10.0]

In [313]:
# Grid search over (eta0, alpha) for a class-balanced, logistic-loss SGDClassifier.
# One model is fitted per combination; its F1 on the training and cross-validation
# splits is recorded. All three lists share the same order: eta0 is the outer loop,
# so entry k corresponds to eta0_list[k // len(alpha_list)], alpha_list[k % len(alpha_list)].
sgd_models = []
sgd_train_f1_score = []
sgd_cv_f1_score = []

for eta0 in eta0_list:
    for alpha in alpha_list:
        # 'log_loss' is the current name of the logistic loss. The former spelling
        # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
        # raises an error at fit time. (Requires scikit-learn >= 1.1.)
        clf = SGDClassifier(loss='log_loss', alpha=alpha, eta0=eta0, random_state=42,
                            learning_rate='constant', class_weight='balanced')
        clf.fit(X_train, y_train)
        sgd_models.append(clf)

        y_train_pred = clf.predict(X_train)
        sgd_train_f1_score.append(f1_score(y_train, y_train_pred))

        y_cv_pred = clf.predict(X_cv)
        sgd_cv_f1_score.append(f1_score(y_cv, y_cv_pred))

In [314]:
# Training-set F1 per (eta0, alpha) combination; eta0 is the outer loop, alpha varies fastest.
print(sgd_train_f1_score)

[0.6322838011112695, 0.6322838011112695, 0.6327635327635329, 0.6314888762122076, 0.6029825548677546, 0.5753908905506459, 0.5676004872107187, 0.5822492049725354, 0.6293364042032531, 0.6293364042032531, 0.6296829971181556, 0.6282219309742246, 0.5842017822081257, 0.5686635944700461, 0.5536881419234361, 0.5188876386433049, 0.5715044482804408, 0.5709728867623605, 0.5678945953213229, 0.5482491312483294, 0.497638275179034, 0.41552279274699205, 0.37778968041421174, 0.3316919440614536, 0.4692410768668534, 0.4790404652289799, 0.49434656849855385, 0.45394091355063776, 0.481746234363033, 0.40115084258117545, 0.3597135512302608, 0.18467731790162287, 0.43480305702527927, 0.4979570990806945, 0.5080794090489381, 0.2025087288245183, 0.21714285714285717, 0.21714285714285717, 0.21714285714285717, 0.21714285714285717, 0.5044201768070722, 0.4684524565189793, 0.3847874720357942, 0.40369567266899226, 0.32380952380952377, 0.32380952380952377, 0.32380952380952377, 0.32380952380952377, 0.5025100401606425, 0.328

In [315]:
# Cross-validation F1 per (eta0, alpha) combination; eta0 is the outer loop, alpha varies fastest.
print(sgd_cv_f1_score)

[0.5924276169265034, 0.5926916221033868, 0.5931219294327825, 0.5924932975871313, 0.568888888888889, 0.5450680272108843, 0.5372250423011845, 0.5505819158460161, 0.5883408071748879, 0.588077095472882, 0.5872302158273381, 0.5822323462414578, 0.5443338074917022, 0.523444976076555, 0.5107421875, 0.47225025227043393, 0.5287739783152626, 0.5280334728033473, 0.525559780312632, 0.5107278081615482, 0.47024673439767783, 0.3921775898520085, 0.3590604026845638, 0.3096129837702872, 0.44302369311771345, 0.4558383233532934, 0.4611549646863316, 0.4268986283037805, 0.45181927229108354, 0.3940824677368587, 0.35373009220452645, 0.1942472917444901, 0.38960113960113957, 0.4778761061946903, 0.48790746582544686, 0.2100673801030519, 0.23297137216189534, 0.23297137216189534, 0.23297137216189534, 0.23297137216189534, 0.47432762836185827, 0.4374250299880048, 0.37666126418152346, 0.393073593073593, 0.3232804232804233, 0.3232804232804233, 0.3232804232804233, 0.3232804232804233, 0.471690745802421, 0.3407079646017699

In [316]:
# Keep the SGD model whose cross-validation F1 is highest (np.argmax picks the first on ties).
best_sgd_model = sgd_models[np.argmax(sgd_cv_f1_score)]

In [317]:
# Training F1 of the combination that scored best on cross-validation.
sgd_train_f1_score[np.argmax(sgd_cv_f1_score)]

0.6327635327635329

In [318]:
# Best cross-validation F1 reached anywhere in the grid.
sgd_cv_f1_score[np.argmax(sgd_cv_f1_score)]

0.5931219294327825

In [319]:
# Show the hyperparameters of the selected SGD model.
best_sgd_model

SGDClassifier(alpha=0.01, class_weight='balanced', eta0=0.0001,
              learning_rate='constant', loss='log', random_state=42)

In [320]:
# Test-set F1 of the selected SGD model.
# y_test_pred is reused by the confusion-matrix and accuracy cells that follow.
y_test_pred = best_sgd_model.predict(X_test)
print(f1_score(y_test, y_test_pred))

0.6098615916955017


In [321]:
# Confusion matrix on the test set for the current y_test_pred
# (rows are true classes, columns are predicted classes).
print(confusion_matrix(y_test, y_test_pred))

[[2225  655]
 [ 247  705]]


In [322]:
# Plain accuracy on the test set for the current y_test_pred.
accuracy_score(y_test, y_test_pred)

0.7646137787056367

In [323]:
# Serialize the selected SGD model to disk with joblib.
# NOTE(review): presumably the Models/ directory must already exist — confirm.
joblib.dump(best_sgd_model, "Models/best_sgd_model.pkl")

['Models/best_sgd_model.pkl']