In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import metrics, preprocessing
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

# Load dataset for training purposes from CSV

In [2]:
train_df = pd.read_csv('./data/orders_train.txt', sep=';', low_memory=False, 
                       parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'], 
                       infer_datetime_format=True)

# Load dataset for classification from CSV

In [3]:
test_df = pd.read_csv('./data/orders_class.txt', sep=';', low_memory=False, 
                      parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'], 
                      infer_datetime_format=True)

# Load benchmark dataset

In [4]:
test_labels = pd.read_csv('./data/DMC_2014_realclass.txt', sep=';', low_memory=False)

# Add missed column in test dataset (for data type compatibility)

In [5]:
test_df['df_type'] = 'test'
train_df['df_type'] = 'train'
test_df['returnShipment'] = 0

# Join two datasets two datasets together for pre-processing

In [6]:
joined_df = pd.concat([train_df, test_df])

In [None]:
joined_df

# Replace unknown dates with NaN

In [7]:
joined_df.dateOfBirth = pd.to_datetime(joined_df.dateOfBirth, format='%Y-%m-%d', errors='coerce')
joined_df.deliveryDate = pd.to_datetime(joined_df.deliveryDate, format='%Y-%m-%d', errors='coerce')
joined_df.creationDate = pd.to_datetime(joined_df.creationDate, format='%Y-%m-%d', errors='coerce')

# Rename column names - size is Python/Pandas built-in method

In [8]:
joined_df.rename(columns={'size': 'item_size'}, inplace=True)

# Encoding categorical variables

## - Salutation / Gender

In [9]:
salutation_encoder = preprocessing.LabelEncoder()
joined_df.salutation = salutation_encoder.fit_transform(joined_df.salutation)

## - Color

In [10]:
color_encoder = preprocessing.LabelEncoder()
joined_df.color = color_encoder.fit_transform(joined_df.color)

## - Size

In [11]:
size_encoder = preprocessing.LabelEncoder()
joined_df.item_size = size_encoder.fit_transform(joined_df.item_size)

## - State

In [12]:
state_encoder = preprocessing.LabelEncoder()
joined_df.state = state_encoder.fit_transform(joined_df.state)

# Adding age column

In [13]:
joined_df['age_years'] = (joined_df.orderDate - joined_df.dateOfBirth).astype('<m8[Y]')

# Adding month number column

In [14]:
joined_df['order_month'] = joined_df.orderDate.dt.month

# Adding day of week column

In [15]:
joined_df['order_weekday'] = joined_df.orderDate.dt.dayofweek

# Add price bin column

In [16]:
joined_df['price_bin'] = joined_df.price.apply(lambda x: int(round(x/100))*100)

# Add column with days between order and delivery

In [None]:
joined_df['delivery_duration'] = (joined_df.orderDate - joined_df.deliveryDate).astype('timedelta64[D]').astype(int)

# Add column with shipment return result to test dataframe part

In [17]:
joined_df.loc[joined_df['df_type'] == 'test', 'returnShipment'] = test_labels['returnShipment']

# Fill all N/As

In [18]:
joined_df.fillna(0, inplace=True)

In [None]:
joined_df

# Scaling, training, classifying

In [19]:
X = joined_df.drop(['creationDate', 'dateOfBirth', 'orderItemID', 'orderDate', 
                  'deliveryDate', 'customerID'], axis=1)

classifier_names = ['Logistic Regression', 'Random Forest', 'Decision Tree', 'Linear SVC', 'AdaBoost Classifier', 
               '5 Nearest Neighbors', 'GradientBoosting Classifier']
classifier_objects = [LogisticRegression(), RandomForestClassifier(n_estimators=100), 
                      DecisionTreeClassifier(max_depth=5), 
                      svm.LinearSVC(), AdaBoostClassifier(), KNeighborsClassifier(5), 
                      GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)]

scaler_names = ['Without Scaling', 'Standard Scaler', 'MinMax Scaler']
scaler_objects = [False, StandardScaler(), MinMaxScaler()]

features = list(X.columns)
for drop_column in ['df_type', 'returnShipment']:
    features.remove(drop_column)
print("Features: {}\n".format(', '.join(features)))

for scaler_name, scaler in zip(scaler_names, scaler_objects):
    print("Dataset preprocessed by: {}".format(scaler_name))
    if scaler:
        X[features] = scaler.fit_transform(X[features])

    for clf_name, clf in zip(classifier_names, classifier_objects):
        X_train = X[X['df_type'] == "train"][features]
        y_train = X[X['df_type'] == "train"]['returnShipment']
        X_test = X[X['df_type'] == "test"][features]
        y_test = X[X['df_type'] == "test"]['returnShipment']

        clf.fit(X_train, y_train)
        clf_score = clf.score(X_test, y_test) * 100
        print("\tAccuracy of {0} classifier: {1:.3f}%".format(clf_name, clf_score))
    print("\n")

Features: color, itemID, manufacturerID, price, salutation, item_size, state, years_age, order_month, order_weekday, price_bin

Dataset preprocessed by: No Scaling
	Accuracy of Logistic Regression classifier: 55.683%
	Accuracy of Random Forest classifier: 57.406%
	Accuracy of Decision Tree classifier: 56.869%
	Accuracy of Linear SVC classifier: 51.831%
	Accuracy of AdaBoost Classifier classifier: 56.398%
	Accuracy of 5 Nearest Neighbors classifier: 55.965%
	Accuracy of GradientBoosting Classifier classifier: 56.783%


Dataset preprocessed by: Standard Scaler
	Accuracy of Logistic Regression classifier: 55.753%
	Accuracy of Random Forest classifier: 57.382%
	Accuracy of Decision Tree classifier: 56.869%
	Accuracy of Linear SVC classifier: 55.699%
	Accuracy of AdaBoost Classifier classifier: 56.398%
	Accuracy of 5 Nearest Neighbors classifier: 54.014%
	Accuracy of GradientBoosting Classifier classifier: 56.783%


Dataset preprocessed by: MinMax Scaler
	Accuracy of Logistic Regression cla