In [1]:
import pandas as pd, numpy as np
# Read the training and testing CSVs from the shared input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/training.csv', '../input/testing.csv')
)

In [2]:
# Tag the test rows with a string sentinel in the target column so train and test
# can go through one shared feature-engineering pass and be separated again later.
test['Made Donation in March 2007'] = 'NaN'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
data = pd.concat([train, test])

# feature engineering


def _safe_ratio(numerator, denominator, inf_fill=999):
    """Divide two Series element-wise, replacing +inf (division by zero) with `inf_fill`."""
    return (numerator / denominator).replace(np.inf, inf_fill)


data['Months Donating'] = data['Months since First Donation'] - data['Months since Last Donation']

# Volume-based rates.
data['Donations per Months Donating'] = _safe_ratio(
    data['Total Volume Donated (c.c.)'], data['Months Donating'])
# The original left +inf unreplaced for this one ratio only; treat it like the others.
data['Donations per Months since First Donation'] = _safe_ratio(
    data['Total Volume Donated (c.c.)'], data['Months since First Donation'])

# Count-based rates.
data['Donation Counts per Months Donating'] = _safe_ratio(
    data['Number of Donations'], data['Months Donating'])
data['Donation Counts per Months since First Donating'] = _safe_ratio(
    data['Number of Donations'], data['Months since First Donation'])

# Per-donation ratios.
data['Donation Volume per Donation'] = _safe_ratio(
    data['Total Volume Donated (c.c.)'], data['Number of Donations'])
# NOTE(review): "Unnamed: 0" looks like a row identifier carried over from the CSV — confirm it is a meaningful feature.
data['Unknown per Donation'] = _safe_ratio(
    data['Unnamed: 0'], data['Number of Donations'])

In [3]:
# Split the combined frame back apart using the string sentinel that marks test rows
# in the target column.
is_test_row = data['Made Donation in March 2007'] == 'NaN'
# DataFrame.drop returns a new frame; the result must be assigned, otherwise the
# sentinel target column stays in `test`.
test = data[is_test_row].drop(columns=['Made Donation in March 2007'])
train = data[~is_test_row]

In [4]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features, then hold out 20% of the rows
# using a fixed seed so the split is repeatable.
y = train['Made Donation in March 2007']
X = train.drop(columns=['Made Donation in March 2007'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)



In [5]:
from catboost import Pool, CatBoostClassifier

# Wrap both splits as CatBoost pools; the empty list declares no categorical columns.
train_pool = Pool(data=X_train, label=y_train, cat_features=[])
test_pool = Pool(data=X_test, label=y_test, cat_features=[])

model = CatBoostClassifier(
    iterations=1000,
    depth=4,
    loss_function='Logloss',
    eval_metric='AUC',
    class_weights=[1, 3],
    random_seed=42,
    verbose=True,
)

# NOTE(review): the eval_set that drives use_best_model is built from the same
# X_test/y_test that the evaluation cell scores, so the reported metrics are
# selected on the hold-out and are likely optimistic.
model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True,
    cat_features=None,
    verbose=100,
)

Learning rate set to 0.071919
0:	test: 0.7401902	best: 0.7401902 (0)	total: 73.4ms	remaining: 1m 13s
100:	test: 0.7923107	best: 0.8030123 (43)	total: 909ms	remaining: 8.09s
200:	test: 0.7695204	best: 0.8030123 (43)	total: 1.53s	remaining: 6.07s
300:	test: 0.7643678	best: 0.8030123 (43)	total: 2.22s	remaining: 5.16s
400:	test: 0.7623860	best: 0.8030123 (43)	total: 2.9s	remaining: 4.33s
500:	test: 0.7639715	best: 0.8030123 (43)	total: 3.59s	remaining: 3.58s
600:	test: 0.7604043	best: 0.8030123 (43)	total: 4.27s	remaining: 2.84s
700:	test: 0.7643678	best: 0.8030123 (43)	total: 4.95s	remaining: 2.11s
800:	test: 0.7683314	best: 0.8030123 (43)	total: 5.61s	remaining: 1.39s
900:	test: 0.7671423	best: 0.8030123 (43)	total: 6.29s	remaining: 691ms
999:	test: 0.7631788	best: 0.8030123 (43)	total: 6.97s	remaining: 0us

bestTest = 0.803012287
bestIteration = 43

Shrink model to first 44 iterations.


<catboost.core.CatBoostClassifier at 0x7f544c8535c0>

In [6]:
# Disabled XGBoost alternative, kept as a bare string literal rather than as comments.
# Because the string is the cell's last expression, the notebook echoes it as the cell output.
'''
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)
'''

'\nfrom xgboost import XGBClassifier\nmodel = XGBClassifier()\nmodel.fit(X_train, y_train)\n'

In [7]:
# predictions
# Score the hold-out rows: class probabilities first, then hard labels cast to int
# so they compare cleanly against the int-cast ground truth.
predictions_probs = model.predict_proba(X_test)
predictions = model.predict(X_test).astype(int)
y_test = y_test.astype(int)

In [8]:
# MODEL EVALUATION

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc

# Label-based scores are computed from the hard predictions.
for metric_label, metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(metric_label, str(metric_fn(y_test, predictions)))

# Ranking-based scores use the second column of the predict_proba output.
second_column_probs = predictions_probs[:, 1]
print('Area under ROC Curve: ', str(roc_auc_score(y_test, second_column_probs)))
print('GINI: ', str(-1 + 2*roc_auc_score(y_test, second_column_probs)))

# Flattened confusion matrix, unpacked in the same order as the original cell.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

for count_label, count_value in (
    ('True Negatives: ', tn),
    ('True Positives: ', tp),
    ('False Negatives: ', fn),
    ('False Positives: ', fp),
):
    print(count_label, str(count_value))

Accuracy:  0.7155172413793104
Precision:  0.45454545454545453
Recall:  0.6896551724137931
F1:  0.547945205479452
Area under ROC Curve:  0.8030122869599683
GINI:  0.6060245739199366
True Negatives:  63
True Positives:  20
False Negatives:  9
False Positives:  24


In [9]:
# Rank the features by the importance CatBoost reports for the training pool.
feature_importance = model.get_feature_importance(train_pool)
feature_names = X_train.columns
# Two-row frame (names, importances), transposed to one row per feature.
# Column labels stay 0 (name) and 1 (importance).
feature_imp = pd.DataFrame([feature_names, feature_importance])
# Assign the sorted frame instead of sorting in place so re-running the cell is idempotent.
final = feature_imp.transpose().sort_values(by=1, ascending=False)
# None disables column-width truncation; the former -1 spelling was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
final.head(500)

Unnamed: 0,0,1
7,Donations per Months since First Donation,19.2149
9,Donation Counts per Months since First Donating,17.0881
1,Months since Last Donation,16.8029
6,Donations per Months Donating,8.45441
0,Unnamed: 0,8.01891
11,Unknown per Donation,6.70678
5,Months Donating,6.20775
2,Number of Donations,4.97183
4,Months since First Donation,4.88837
3,Total Volume Donated (c.c.),4.60293
