In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import metrics, preprocessing
from sklearn.cross_validation import cross_val_score

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Load dataset for training purposes from CSV

In [2]:
# Read the semicolon-separated training orders; the four date columns are
# handed to the CSV parser so they are converted to datetimes where possible.
train_df = pd.read_csv(
    './data/orders_train.txt',
    sep=';',
    low_memory=False,
    parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'],
    infer_datetime_format=True
)

# Load dataset for classification from CSV

In [3]:
# Read the semicolon-separated orders that have to be classified, parsing the
# same four date columns as for the training file.
test_df = pd.read_csv(
    './data/orders_class.txt',
    sep=';',
    low_memory=False,
    parse_dates=['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate'],
    infer_datetime_format=True
)

# Load benchmark dataset

In [4]:
# Read the semicolon-separated benchmark file; its 'returnShipment' column is
# used further down as the ground truth for the test rows.
test_labels = pd.read_csv(
    './data/DMC_2014_realclass.txt',
    sep=';',
    low_memory=False
)

# Tag each dataset with its origin and add the missing `returnShipment` column to the test dataset (for data type compatibility)

In [5]:
# Mark every row with the frame it came from, so the two parts can be told
# apart again after they have been stacked into a single frame.
train_df['df_type'] = 'train'
test_df['df_type'] = 'test'

# Placeholder target for the test rows; the real labels are written later.
test_df['returnShipment'] = 0

# Join the two datasets together for pre-processing

In [6]:
# Stack the training rows on top of the test rows. ignore_index is not set,
# so each frame keeps its own row labels in the combined frame.
frames_to_stack = [train_df, test_df]
joined_df = pd.concat(frames_to_stack)

# Replace unknown dates with NaT (pandas' missing-value marker for datetimes)

In [7]:
# Re-parse each date column as YYYY-MM-DD; errors='coerce' turns every value
# that does not match into NaT instead of raising.
for date_column in ('dateOfBirth', 'deliveryDate', 'creationDate'):
    joined_df[date_column] = pd.to_datetime(
        joined_df[date_column], format='%Y-%m-%d', errors='coerce'
    )

# Rename the `size` column - `size` is a built-in pandas attribute (`DataFrame.size`), so it cannot be accessed as `joined_df.size`

In [8]:
# Give the 'size' column a name that can be used with attribute access.
joined_df = joined_df.rename(columns={'size': 'item_size'})

# Encoding categorical variables

## Salutation

In [9]:
# Map every distinct salutation to an integer code; the fitted encoder is kept
# so the codes can be translated back later.
salutation_encoder = preprocessing.LabelEncoder()
salutation_codes = salutation_encoder.fit_transform(joined_df['salutation'])
joined_df['salutation'] = salutation_codes

## Color

In [10]:
# Map every distinct color to an integer code; the fitted encoder is kept so
# the codes can be translated back later.
color_encoder = preprocessing.LabelEncoder()
color_codes = color_encoder.fit_transform(joined_df['color'])
joined_df['color'] = color_codes

## Size

In [11]:
# Map every distinct item size to an integer code; the fitted encoder is kept
# so the codes can be translated back later.
size_encoder = preprocessing.LabelEncoder()
size_codes = size_encoder.fit_transform(joined_df['item_size'])
joined_df['item_size'] = size_codes

## State

In [12]:
# Map every distinct state to an integer code; the fitted encoder is kept so
# the codes can be translated back later.
state_encoder = preprocessing.LabelEncoder()
state_codes = state_encoder.fit_transform(joined_df['state'])
joined_df['state'] = state_codes

# Adding age column

In [13]:
# Customer age at order time, in whole years.
#
# The previous implementation cast the timedelta to a year-resolution dtype
# ('<m8[Y]'). pandas 2.0 and later only accept 's', 'ms', 'us' and 'ns' as
# timedelta resolutions and raise on that cast, so the age is derived from the
# day count instead. 365.2425 is the mean Gregorian year length in days, the
# same factor numpy uses for its year unit.
DAYS_PER_YEAR = 365.2425
age_in_days = (joined_df['orderDate'] - joined_df['dateOfBirth']).dt.days

# Floor division by a float always yields floats, so rows with an unknown
# birth date (NaT) stay NaN rather than breaking an integer column.
joined_df['years_age'] = age_in_days // DAYS_PER_YEAR

# Adding month number column

In [14]:
# Calendar month of the order, taken from the order date.
order_dates = joined_df['orderDate']
joined_df['order_month'] = order_dates.dt.month

# Adding day of week column

In [15]:
# Day of the week of the order as pandas numbers it (Monday = 0 ... Sunday = 6).
order_dates = joined_df['orderDate']
joined_df['order_weekday'] = order_dates.dt.dayofweek

# Add price bin column

In [16]:
# Bucket each price to the nearest multiple of 10.
#
# Done with vectorised column arithmetic instead of a Python lambda applied
# row by row. Series.round() rounds exact halves to the nearest even integer,
# the same rule Python 3's built-in round() uses, so the bins are unchanged.
# As before, a missing price makes the integer cast raise.
price_in_tens = (joined_df['price'] / 10).round()
joined_df['price_bin'] = price_in_tens.astype(int) * 10

# Fill in the actual shipment return results for the test part of the dataframe

In [17]:
# Overwrite the placeholder target of the test rows with the benchmark labels.
#
# The labels are written positionally (as a plain array) rather than as a
# Series. Assigning a Series aligns on row labels, and any label that fails to
# align silently becomes NaN, which the subsequent fill with 0 would turn into
# a "not returned" label. With an array, a length mismatch raises immediately.
# NOTE(review): assumes test_labels rows are in the same order as the rows of
# the classification file - confirm against the data description.
is_test_row = joined_df['df_type'] == 'test'
joined_df.loc[is_test_row, 'returnShipment'] = test_labels['returnShipment'].values

# Fill all missing values (N/A) with 0

In [23]:
# Replace every missing value, in every column, with 0.
joined_df = joined_df.fillna(0)

# Split the preprocessed dataset into training and testing parts and their labels

In [34]:
# Columns left out of the feature matrix (this list includes the target and
# the train/test marker).
non_feature_columns = ['creationDate', 'dateOfBirth', 'orderItemID', 'orderDate',
                       'deliveryDate', 'itemID', 'price', 'customerID', 'returnShipment', 'df_type']

# Training part: rows tagged 'train', split into features and target.
train_rows = joined_df[joined_df['df_type'] == 'train']
X_train = train_rows.drop(non_feature_columns, axis=1)
y_train = train_rows['returnShipment']

In [35]:
# Columns left out of the feature matrix (this list includes the target and
# the train/test marker).
non_feature_columns = ['creationDate', 'dateOfBirth', 'orderItemID', 'orderDate',
                       'deliveryDate', 'itemID', 'price', 'customerID', 'returnShipment', 'df_type']

# Testing part: rows tagged 'test', split into features and target.
test_rows = joined_df[joined_df['df_type'] == 'test']
X_test = test_rows.drop(non_feature_columns, axis=1)
y_test = test_rows['returnShipment']

# Train Logistic Regression model

In [39]:
# Fit a logistic regression with default settings on the training part.
# fit() returns the estimator itself, so the call can be chained.
regression_model = LogisticRegression().fit(X_train, y_train)

# Check the accuracy of Logistic Regression

In [40]:
# Accuracy on the test part: the share of test rows whose predicted label
# equals the benchmark label (what a classifier's score() reports).
metrics.accuracy_score(y_test, regression_model.predict(X_test))

0.56110467670434128

# Train Random Forest model

In [49]:
# Fit a random forest of 20 trees on the training part.
#
# A random forest draws bootstrap samples and random feature subsets, so
# without a fixed seed every run yields a different model and a different
# accuracy. random_state pins the seed to make the reported score reproducible.
forest_model = RandomForestClassifier(n_estimators=20, random_state=42)
forest_model = forest_model.fit(X_train, y_train)

# Check the accuracy of Random Forest classification

In [50]:
# Accuracy on the test part: the share of test rows whose predicted label
# equals the benchmark label (what a classifier's score() reports).
metrics.accuracy_score(y_test, forest_model.predict(X_test))

0.5454091617077359

In [None]:
# Fit a linear support vector classifier on the training part.
#
# The underlying solver uses a pseudo-random number generator while fitting,
# so random_state is fixed to make repeated runs give the same model.
svm_model = svm.LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

In [None]:
# Accuracy on the test part: the share of test rows whose predicted label
# equals the benchmark label (what a classifier's score() reports).
metrics.accuracy_score(y_test, svm_model.predict(X_test))