In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import metrics, preprocessing
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

# Load dataset for training purposes from CSV

In [2]:
train_df = pd.read_csv('./data/orders_train.txt', sep=';', low_memory=False, 
                       parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'], 
                       infer_datetime_format=True)

# Load dataset for classification from CSV

In [3]:
test_df = pd.read_csv('./data/orders_class.txt', sep=';', low_memory=False, 
                      parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'], 
                      infer_datetime_format=True)

# Load benchmark dataset

In [4]:
test_labels = pd.read_csv('./data/DMC_2014_realclass.txt', sep=';', low_memory=False)

# Add missed column in test dataset (for data type compatibility)

In [5]:
test_df['df_type'] = 'test'
train_df['df_type'] = 'train'
test_df['returnShipment'] = 0

# Join two datasets two datasets together for pre-processing

In [6]:
joined_df = pd.concat([train_df, test_df])

In [7]:
joined_df

Unnamed: 0,color,creationDate,customerID,dateOfBirth,deliveryDate,df_type,itemID,manufacturerID,orderDate,orderItemID,price,returnShipment,salutation,size,state
0,denim,2011-04-25,794,1965-01-06,2012-04-03,train,186,25,2012-04-01,1,69.90,0,Mrs,m,Baden-Wuerttemberg
1,ocher,2011-04-25,794,1965-01-06,2012-04-03,train,71,21,2012-04-01,2,69.95,1,Mrs,9+,Baden-Wuerttemberg
2,curry,2011-04-25,794,1965-01-06,2012-04-03,train,71,21,2012-04-01,3,69.95,1,Mrs,9+,Baden-Wuerttemberg
3,green,2012-01-04,808,1959-11-09,?,train,22,14,2012-04-02,4,39.90,0,Mrs,m,Saxony
4,black,2011-02-16,825,1964-07-11,1990-12-31,train,151,53,2012-04-02,5,29.90,0,Mrs,39,Rhineland-Palatinate
5,brown,2011-02-16,825,1964-07-11,1990-12-31,train,598,87,2012-04-02,6,89.90,0,Mrs,xxl,Rhineland-Palatinate
6,black,2011-02-16,825,1964-07-11,1990-12-31,train,15,1,2012-04-02,7,129.90,0,Mrs,39,Rhineland-Palatinate
7,brown,2011-02-16,850,1948-04-08,2012-04-03,train,32,3,2012-04-02,8,21.90,1,Mrs,xxl,North Rhine-Westphalia
8,red,2011-02-16,850,1948-04-08,2012-04-03,train,32,3,2012-04-02,9,21.90,1,Mrs,xxl,North Rhine-Westphalia
9,green,2011-02-16,850,1948-04-08,2012-04-03,train,57,3,2012-04-02,10,39.90,1,Mrs,xxl,North Rhine-Westphalia


# Replace unknown dates with NaN

In [8]:
joined_df.dateOfBirth = pd.to_datetime(joined_df.dateOfBirth, format='%Y-%m-%d', errors='coerce')
joined_df.deliveryDate = pd.to_datetime(joined_df.deliveryDate, format='%Y-%m-%d', errors='coerce')
joined_df.creationDate = pd.to_datetime(joined_df.creationDate, format='%Y-%m-%d', errors='coerce')

# Rename column names - size is Python/Pandas built-in method

In [9]:
joined_df.rename(columns={'size': 'item_size'}, inplace=True)

# Encoding categorical variables

## - Salutation / Gender

In [10]:
salutation_encoder = preprocessing.LabelEncoder()
joined_df.salutation = salutation_encoder.fit_transform(joined_df.salutation)

## - Color

In [11]:
color_encoder = preprocessing.LabelEncoder()
joined_df.color = color_encoder.fit_transform(joined_df.color)

## - Size

In [12]:
size_encoder = preprocessing.LabelEncoder()
joined_df.item_size = size_encoder.fit_transform(joined_df.item_size)

## - State

In [13]:
state_encoder = preprocessing.LabelEncoder()
joined_df.state = state_encoder.fit_transform(joined_df.state)

# Adding age column

In [14]:
joined_df['age_years'] = (joined_df.orderDate - joined_df.dateOfBirth).astype('timedelta64[Y]')
joined_df.age_years.fillna(-1, inplace=True)
joined_df.age_years = joined_df.age_years.astype(int)
joined_df.age_years = joined_df.age_years.apply(lambda x: -1 if x < 0 else x)

# Adding month number column

In [15]:
joined_df['order_month'] = joined_df.orderDate.dt.month

# Adding day of week column

In [16]:
joined_df['order_weekday'] = joined_df.orderDate.dt.dayofweek

# Add price bin column

In [17]:
joined_df['price_bin'] = joined_df.price.apply(lambda x: int(round(x/100))*100)

# Add column with days between order and delivery

In [18]:
joined_df['delivery_duration'] = (joined_df.deliveryDate - joined_df.orderDate).astype('timedelta64[D]')
joined_df.delivery_duration.fillna(-1, inplace=True)
joined_df.delivery_duration = joined_df.delivery_duration.astype(int)
joined_df.delivery_duration = joined_df.delivery_duration.apply(lambda x: -1 if x < 0 else x)

# Add column with days between order and registration

In [19]:
joined_df['registration_duration'] = (joined_df.orderDate - joined_df.creationDate).astype('timedelta64[D]')
joined_df.registration_duration.fillna(-1, inplace=True)
joined_df.registration_duration = joined_df.registration_duration.astype(int)
joined_df.registration_duration = joined_df.registration_duration.apply(lambda x: -1 if x < 0 else x)

# Add column with number of items in entire order

In [20]:
#orders_count_df = joined_df.groupby(['customerID', 'orderDate']).agg({'orderItemID': 'count'})
#orders_count_df.columns = ['orders_count']
#joined_df = joined_df.set_index(['customerID', 'orderDate'])
#joined_df = joined_df.merge(orders_count_df, how='left', left_index=True, right_index=True).reset_index()

# Add column with shipment return result to test dataframe part

In [21]:
joined_df.loc[joined_df['df_type'] == 'test', 'returnShipment'] = test_labels['returnShipment']

# Fill all N/As

In [22]:
joined_df.fillna(0, inplace=True)

In [23]:
joined_df

Unnamed: 0,color,creationDate,customerID,dateOfBirth,deliveryDate,df_type,itemID,manufacturerID,orderDate,orderItemID,...,returnShipment,salutation,item_size,state,age_years,order_month,order_weekday,price_bin,delivery_duration,registration_duration
0,43,2011-04-25,794,1965-01-06,2012-04-03,train,186,25,2012-04-01,1,...,0,3,115,0,47,4,6,100,2,342
1,69,2011-04-25,794,1965-01-06,2012-04-03,train,71,21,2012-04-01,2,...,1,3,102,0,47,4,6,100,2,342
2,36,2011-04-25,794,1965-01-06,2012-04-03,train,71,21,2012-04-01,3,...,1,3,102,0,47,4,6,100,2,342
3,50,2012-01-04,808,1959-11-09,1970-01-01,train,22,14,2012-04-02,4,...,0,3,115,12,52,4,0,0,-1,89
4,18,2011-02-16,825,1964-07-11,1990-12-31,train,151,53,2012-04-02,5,...,0,3,59,10,47,4,0,0,-1,411
5,23,2011-02-16,825,1964-07-11,1990-12-31,train,598,87,2012-04-02,6,...,0,3,120,10,47,4,0,100,-1,411
6,18,2011-02-16,825,1964-07-11,1990-12-31,train,15,1,2012-04-02,7,...,0,3,59,10,47,4,0,100,-1,411
7,23,2011-02-16,850,1948-04-08,2012-04-03,train,32,3,2012-04-02,8,...,1,3,120,9,63,4,0,0,1,411
8,79,2011-02-16,850,1948-04-08,2012-04-03,train,32,3,2012-04-02,9,...,1,3,120,9,63,4,0,0,1,411
9,50,2011-02-16,850,1948-04-08,2012-04-03,train,57,3,2012-04-02,10,...,1,3,120,9,63,4,0,0,1,411


# Scaling, training, classifying

In [24]:
X = joined_df.drop(['creationDate', 'dateOfBirth', 'orderItemID', 'orderDate', 
                  'deliveryDate', 'customerID'], axis=1)

classifier_names = ['Logistic Regression', 'Random Forest', 'Decision Tree', 'Linear SVC', 'AdaBoost Classifier', 
               '5 Nearest Neighbors', 'GradientBoosting Classifier']
classifier_objects = [LogisticRegression(), RandomForestClassifier(n_estimators=20, max_depth=5), 
                      DecisionTreeClassifier(max_depth=5), 
                      svm.LinearSVC(), AdaBoostClassifier(), KNeighborsClassifier(5), 
                      GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=1, random_state=0)]

scaler_names = ['Without Scaling', 'Standard Scaler', 'MinMax Scaler', 'Robust Scaler']
scaler_objects = [False, StandardScaler(), MinMaxScaler(), RobustScaler()]

features = list(X.columns)
for drop_column in ['df_type', 'returnShipment']:
    features.remove(drop_column)
print("Features: {}\n".format(', '.join(features)))

for scaler_name, scaler in zip(scaler_names, scaler_objects):
    print("Dataset preprocessed: {}".format(scaler_name))
    if scaler:
        X[features] = scaler.fit_transform(X[features])

    for clf_name, clf in zip(classifier_names, classifier_objects):
        X_train = X[X['df_type'] == "train"][features]
        y_train = X[X['df_type'] == "train"]['returnShipment']
        X_test = X[X['df_type'] == "test"][features]
        y_test = X[X['df_type'] == "test"]['returnShipment']

        clf.fit(X_train, y_train)
        clf_score = clf.score(X_test, y_test) * 100
        print("\tAccuracy of {0} classifier: {1:.3f}%".format(clf_name, clf_score))
    print("\n")

Features: color, itemID, manufacturerID, price, salutation, item_size, state, age_years, order_month, order_weekday, price_bin, delivery_duration, registration_duration

Dataset preprocessed: Without Scaling
	Accuracy of Logistic Regression classifier: 56.775%
	Accuracy of Random Forest classifier: 64.320%
	Accuracy of Decision Tree classifier: 64.220%
	Accuracy of Linear SVC classifier: 50.701%
	Accuracy of AdaBoost Classifier classifier: 64.412%
	Accuracy of 5 Nearest Neighbors classifier: 56.560%
	Accuracy of GradientBoosting Classifier classifier: 64.070%


Dataset preprocessed: Standard Scaler
	Accuracy of Logistic Regression classifier: 56.781%
	Accuracy of Random Forest classifier: 64.431%
	Accuracy of Decision Tree classifier: 64.220%
	Accuracy of Linear SVC classifier: 56.757%
	Accuracy of AdaBoost Classifier classifier: 64.412%
	Accuracy of 5 Nearest Neighbors classifier: 54.795%
	Accuracy of GradientBoosting Classifier classifier: 64.070%


Dataset preprocessed: MinMax Scale