In [28]:
# Import refill data
import numpy as np
import pandas as pd
# The CSV is looked up relative to the notebook's working directory.
REFILL_CSV_PATH = 'refill_data.csv'
refill_data = pd.read_csv(REFILL_CSV_PATH)

# Last expression of the cell: summary statistics, rendered as the cell output.
refill_data.describe()

Unnamed: 0,RefillNumber,RefillAmount,AdvancedNotified,KitPurchasePrice,CustomerIsCorporate,PreviousRefillsAmount,DaysSinceLastRefill,DaysSinceKitPurchase
count,47755.0,47755.0,47755.0,35350.0,47755.0,47755.0,44667.0,47755.0
mean,8.749702,223.675246,0.461397,376.732818,0.133808,1684.394852,120.098731,1914.410931
std,5.658008,164.792918,0.498513,315.807762,0.340449,1131.699327,106.832913,1807.472788
min,1.0,29.45,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,75.5,0.0,120.8,0.0,703.44,63.0,943.0
50%,8.0,141.7,0.0,333.35,0.0,1686.75,99.0,1412.0
75%,12.0,322.61,1.0,533.49,0.0,2351.93,154.0,1947.0
max,31.0,1194.06,1.0,3144.95,1.0,7272.91,2872.0,42464.0


In [29]:
# Separate data into features (X) and target (y).
# The target is taken to be the LAST column of the frame; every other column
# is treated as a feature. Positional slicing makes that rule explicit.
X_raw = refill_data.iloc[:, :-1]
y_all = refill_data.iloc[:, -1]

# Sanity check: feature matrix and target should have the same number of rows.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(X_raw.shape)
print(y_all.shape)

(47755, 10)
(47755,)


In [30]:
# Investigate problem severity: how imbalanced are the target classes?
from IPython.display import display

# count() only counts non-missing labels; computed once and reused below.
total_refills = y_all.count()

# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("Total refills: {}".format(total_refills))
print("Service level changes within 45 days of refill:")

# Share of each class as a percentage of the non-missing labels.
display(100 * y_all.value_counts() / total_refills)

Total refills: 47755
Service level changes within 45 days of refill:


No change      97.742645
Downgraded      1.972568
Deactivated     0.284787
Name: ServiceLevelChange, dtype: float64

In [31]:
# Preprocess features: one-hot encode categorical columns, then mean-impute
# every remaining missing value.

# `Imputer` became `sklearn.impute.SimpleImputer` in scikit-learn 0.20 and the
# old name was removed in 0.22. Bind whichever one exists to the name used below
# so the cell runs on both old and current installs.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Split categorical features into multiple binary columns.
# Columns are visited by name rather than through `iteritems()` (removed in
# pandas 2.0), and the pieces are concatenated once at the end instead of
# joining onto a growing frame inside the loop. Original column order is kept.
encoded_parts = []
for col in X_raw.columns:
    col_data = X_raw[col]
    if col_data.dtype == object:
        # NOTE: with get_dummies' default dummy_na=False, a missing category is
        # encoded as zeros in every dummy column generated for this feature.
        col_data = pd.get_dummies(col_data, prefix=col)
    encoded_parts.append(col_data)
X_encoded = pd.concat(encoded_parts, axis=1)

# TODO
# Split by class to impute values
# Impute numerical with mean, categorical with mode
# Transform DaysSinceLastRefill to reciprocal, fill NULL with 0

# The imputer's default strategy replaces NaN with the per-column mean.
# NOTE(review): it is fitted on ALL rows, before the train/test split performed
# in a later cell, so held-out rows contribute to the fill values.
X_all = pd.DataFrame(
    Imputer().fit_transform(X_encoded),
    index=X_encoded.index,
    columns=X_encoded.columns
)
X_all.head()

Unnamed: 0,RefillNumber,RefillAmount,AdvancedNotified,KitType_A,KitType_B,KitType_C,KitType_D,KitType_E,KitType_F,KitType_G,...,KitType_S,KitType_T,KitPurchasePrice,KitPurchaseSalesPerson_DEALER,KitPurchaseSalesPerson_DIRECT,KitPurchaseSalesPerson_WEB,CustomerIsCorporate,PreviousRefillsAmount,DaysSinceLastRefill,DaysSinceKitPurchase
0,18,255.5,1,0,0,0,0,0,0,1,...,0,0,376.732818,0,0,0,0,3789.89,250,3217
1,23,323.4,1,0,0,0,0,0,0,0,...,0,0,376.732818,0,0,0,0,5302.89,94,3163
2,18,336.8,0,0,0,0,0,0,0,1,...,0,0,376.732818,0,0,0,0,3477.8,188,3121
3,19,75.33,1,0,0,0,0,0,0,1,...,0,0,376.732818,0,0,0,0,3814.6,71,3192
4,4,49.45,0,0,0,0,0,0,0,0,...,0,0,376.732818,0,0,0,1,819.58,123,3154


In [32]:
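# Encode target labels (disabled). NOTE: `y_raw` is not defined in the cells above (the target is `y_all`), and the F1 scorer below selects classes by their string names, so enabling this requires updating both.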
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_all = le.fit_transform(y_raw)
# class_names = le.classes_
# print class_names
# print le.transform(class_names)

In [36]:
# Train classifier: compare candidate models by cross-validated scores on the
# training split. The held-out split (X_test / y_test) is not touched here.
from sklearn.metrics import cohen_kappa_score, f1_score, make_scorer
# `sklearn.cross_validation` was superseded by `sklearn.model_selection`
# (added in 0.18) and removed in 0.20; prefer the new module and fall back to
# the old one only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Fixed seed so the split and the randomised estimators give the same scores
# on every Restart & Run All.
RANDOM_SEED = 42

# Cohen's kappa over all classes.
kappa_scorer = make_scorer(cohen_kappa_score)
# Micro-averaged F1 restricted to the two minority classes, so the dominant
# 'No change' class does not contribute to the score.
f1_scorer = make_scorer(f1_score, labels=['Deactivated', 'Downgraded'], average='micro')

classifiers = [
    LinearSVC(dual=False),
    DecisionTreeClassifier(random_state=RANDOM_SEED),
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=RANDOM_SEED),
    RandomForestClassifier(random_state=RANDOM_SEED)
]

# Stratify so the class proportions are the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, stratify=y_all, random_state=RANDOM_SEED
)

# One row per classifier: [name, mean CV kappa, mean CV F1].
score_rows = []
for clf in classifiers:
    classifier = type(clf).__name__
    # Each metric triggers its own cross-validation run over X_train.
    # NOTE(review): `cross_validate` (scikit-learn >= 0.19) could score both
    # metrics in a single pass if older installs need not be supported.
    train_kappa = cross_val_score(clf, X_train, y_train, scoring=kappa_scorer)
    train_f1 = cross_val_score(clf, X_train, y_train, scoring=f1_scorer)
    score_rows.append([classifier, train_kappa.mean(), train_f1.mean()])

# The 'train_*' columns hold the mean of the per-fold cross-validation scores.
results = pd.DataFrame(
    data=score_rows,
    columns=[
        'classifier',
        'train_kappa',
        'train_f1'
    ]
)

display(results)

Unnamed: 0,classifier,train_kappa,train_f1
0,LinearSVC,0.0,0.0
1,DecisionTreeClassifier,0.058186,0.070559
2,KNeighborsClassifier,0.006311,0.006689
3,AdaBoostClassifier,0.0,0.0
4,RandomForestClassifier,0.058803,0.066232
