In [2]:
# Load packages
# Imports the numerical, plotting and scikit-learn names listed below, sets the
# seaborn plot style, and prints a confirmation line.
import numpy as np  
import pandas as pd
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import re
import seaborn as sns
# Apply seaborn's 'whitegrid' style to plots drawn after this point.
sns.set_style('whitegrid')

# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` were
# deprecated in scikit-learn 0.18 and removed in 0.20 in favour of
# `sklearn.model_selection` -- confirm the scikit-learn version this notebook
# is pinned to before re-running.
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Python 2 print statement: as written, this cell needs a Python 2 kernel.
print "Read in packages from numpy, pandas, sklearn, seaborn & matplotlib"

Read in packages from numpy, pandas, sklearn, seaborn & matplotlib


In [131]:
# Load training data
# Reads the season_1 training files (orders, POI, traffic, weather) into
# DataFrames and prints a first-pass summary of each one.
# Every print below is a single-argument print(...) call so the cell emits the
# same text under a Python 2 or a Python 3 kernel.
dates_ = range(1, 22)
dates = ["{:02d}".format(item) for item in dates_]

# Read in order data (one file per day, concatenated)
order_train = pd.concat([
    pd.read_table('../../data/season_1/training_data/order_data/order_data_2016-01-%s' % i,
                  header=None,
                  names=['order_id', 'driver_id', 'passenger_id', 'start_district_hash', 'dest_district_hash', 'Price', 'Time'])
    for i in dates])

# Read in poi data: each line is read as one field, then split once on the
# first run of non-word characters into district hash and the remaining text.
poi_train = pd.read_table('../../data/season_1/training_data/poi_data/poi_data', sep=' ', header=None, names=['district_hash'])
poi_train = pd.DataFrame(poi_train.district_hash.str.split(r'\W+', n=1).tolist(), columns=['district_hash', 'poi_class'])

# Read in traffic data (one file per day, concatenated).
# ignore_index=True gives the combined frame a unique 0..N-1 index.  Without
# it every daily file contributes its own 0..k labels, and the column
# assignments below (which align on index labels) would copy values from the
# wrong rows into every day after the first.
traffic_train = pd.concat([
    pd.read_table('../../data/season_1/training_data/traffic_data/traffic_data_2016-01-%s' % i,
                  sep=' ', header=None, names=['district_hash_orig', 'time'])
    for i in dates], ignore_index=True)

# First split: text before the first non-word character is the district hash,
# everything after it is kept temporarily in 'tj'.
# expand=True returns a frame that shares traffic_train's index, so the
# assignments line up row for row.
split1 = traffic_train.district_hash_orig.str.split(r'[\W+]', n=1, expand=True)
traffic_train['district_hash'] = split1[0]
traffic_train['tj'] = split1[1]

# Second split: the capturing group keeps the trailing YYYY-MM-DD date as its
# own piece; the text in front of it is stored as 'tj_level'.
split2 = traffic_train.tj.str.split(r'(\d{4}\-\d{2}\-\d{2}$)', n=1, expand=True)
traffic_train['tj_level'] = split2[0]
traffic_train['date'] = split2[1]

# Rebuild the full timestamp string as "<date> <time>" (vectorised instead of
# a row-by-row apply), then drop the intermediate columns.
traffic_train['tj_time'] = traffic_train['date'].astype(str) + ' ' + traffic_train['time'].astype(str)
traffic_train = traffic_train.drop(['district_hash_orig', 'time', 'tj', 'date'], axis=1)

# Read in weather data (one file per day, concatenated)
weather_train = pd.concat([
    pd.read_table('../../data/season_1/training_data/weather_data/weather_data_2016-01-%s' % i,
                  header=None, names=['Time', 'Weather', 'temperature', 'PM2.5'])
    for i in dates])


# Review input features (ORDER, POI, TRAFFIC, WEATHER) for training set - Part 1
names = ['ORDER', 'POI', 'TRAFFIC', 'WEATHER']
features = [order_train, poi_train, traffic_train, weather_train]

for name, feature in zip(names, features):
    print("\n\n-----------------------")
    print("{} TRAIN INFORMATION".format(name.upper()))
    print("-----------------------")
    print("Shape of training set: {} \n\n".format(feature.shape))
    print("Column Headers: {} \n\n".format(list(feature.columns.values)))
    print("{} \n\n".format(feature.dtypes))
    print("{} \n\n".format(feature.head(5)))
    print("{} \n\n".format(feature.describe()))



-----------------------
ORDER TRAIN INFORMATION
-----------------------
Shape of training set: (8540614, 7) 


Column Headers: ['order_id', 'driver_id', 'passenger_id', 'start_district_hash', 'dest_district_hash', 'Price', 'Time'] 


order_id                object
driver_id               object
passenger_id            object
start_district_hash     object
dest_district_hash      object
Price                  float64
Time                    object
dtype: object 


                           order_id                         driver_id  \
0  97ebd0c6680f7c0535dbfdead6e51b4b  dd65fa250fca2833a3a8c16d2cf0457c   
1  92c3ac9251cc9b5aab90b114a1e363be  c077e0297639edcb1df6189e8cda2c3d   
2  abeefc3e2aec952468e2fd42a1649640  86dbc1b68de435957c61b5a523854b69   
3  cb31d0be64cda3cc66b46617bf49a05c  4fadfa6eeaa694742de036dddf02b0c4   
4  139d492189ae5a933122c098f63252b3                               NaN   

                       passenger_id               start_district_hash  \
0  ed180d7daf639d9

In [130]:
# Load test data
# Reads the season_1 test_set_1 files (orders, POI, traffic, weather) into
# DataFrames and prints a first-pass summary of each one.
# Every print below is a single-argument print(...) call so the cell emits the
# same text under a Python 2 or a Python 3 kernel.
dates = ('22_test', '24_test', '26_test', '28_test', '30_test')

# Read in order data (one file per test day, concatenated)
order_test = pd.concat([
    pd.read_table('../../data/season_1/test_set_1/order_data/order_data_2016-01-%s' % i,
                  header=None,
                  names=['order_id', 'driver_id', 'passenger_id', 'start_district_hash', 'dest_district_hash', 'Price', 'Time'])
    for i in dates])

# Read in poi data: each line is read as one field, then split once on the
# first run of non-word characters into district hash and the remaining text.
poi_test = pd.read_table('../../data/season_1/test_set_1/poi_data/poi_data', sep=' ', header=None, names=['district_hash'])
poi_test = pd.DataFrame(poi_test.district_hash.str.split(r'\W+', n=1).tolist(), columns=['district_hash', 'poi_class'])

# Read in traffic data (one file per test day, concatenated).
# ignore_index=True gives the combined frame a unique 0..N-1 index.  Without
# it every daily file contributes its own 0..k labels, and the column
# assignments below (which align on index labels) would copy values from the
# wrong rows into every day after the first.
traffic_test = pd.concat([
    pd.read_table('../../data/season_1/test_set_1/traffic_data/traffic_data_2016-01-%s' % i,
                  sep=' ', header=None, names=['district_hash_orig', 'time'])
    for i in dates], ignore_index=True)

# First split: text before the first non-word character is the district hash,
# everything after it is kept temporarily in 'tj'.
# expand=True returns a frame that shares traffic_test's index, so the
# assignments line up row for row.
split1 = traffic_test.district_hash_orig.str.split(r'[\W+]', n=1, expand=True)
traffic_test['district_hash'] = split1[0]
traffic_test['tj'] = split1[1]

# Second split: the capturing group keeps the trailing YYYY-MM-DD date as its
# own piece; the text in front of it is stored as 'tj_level'.
split2 = traffic_test.tj.str.split(r'(\d{4}\-\d{2}\-\d{2}$)', n=1, expand=True)
traffic_test['tj_level'] = split2[0]
traffic_test['date'] = split2[1]

# Rebuild the full timestamp string as "<date> <time>" (vectorised instead of
# a row-by-row apply), then drop the intermediate columns.
traffic_test['tj_time'] = traffic_test['date'].astype(str) + ' ' + traffic_test['time'].astype(str)
traffic_test = traffic_test.drop(['district_hash_orig', 'time', 'tj', 'date'], axis=1)

# Read in weather data (one file per test day, concatenated)
weather_test = pd.concat([
    pd.read_table('../../data/season_1/test_set_1/weather_data/weather_data_2016-01-%s' % i,
                  header=None, names=['Time', 'Weather', 'temperature', 'PM2.5'])
    for i in dates])

# Review input features (ORDER, POI, TRAFFIC, WEATHER) for test set - Part 1
names = ['ORDER', 'POI', 'TRAFFIC', 'WEATHER']
features = [order_test, poi_test, traffic_test, weather_test]

for name, feature in zip(names, features):
    print("\n\n-----------------------")
    print("{} TEST INFORMATION".format(name))
    print("-----------------------")
    print("Shape of test set: {} \n\n".format(feature.shape))
    print("Column Headers: {} \n\n".format(list(feature.columns.values)))
    print("{} \n\n".format(feature.dtypes))
    print("{} \n\n".format(feature.head(5)))
    print("{} \n\n".format(feature.describe()))



-----------------------
ORDER TEST INFORMATION
-----------------------
Shape of test set: (557985, 7) 


Column Headers: ['order_id', 'driver_id', 'passenger_id', 'start_district_hash', 'dest_district_hash', 'Price', 'Time'] 


order_id                object
driver_id               object
passenger_id            object
start_district_hash     object
dest_district_hash      object
Price                  float64
Time                    object
dtype: object 


                           order_id                         driver_id  \
0  e37f842c2a37de68e16466a3c56b916b                               NaN   
1  62588e55ff8892ba38a0bbe5678be272  53919c82b12bd39b12d77d4d8db1dda1   
2  693194e4d57cdd500e793c0c1e4f7a93  19f5ba02d33855688b727e39c98b2939   
3  3a0c651d2558d9083a66fc179e3ba81c                               NaN   
4  68b7cec210c7f875b79ce32dab7195ea                               NaN   

                       passenger_id               start_district_hash  \
0  5903295e07afb893e1a0f

In [133]:
# Review input features for train set - Part 2
# For every column of every training frame this cell:
#   * prints the number of unique values and a sample of up to 25 of them,
#   * records columns that contain missing values (with the missing count),
#   * records columns that contain at least one non-numeric value.
# A per-frame summary of the last two lists is printed after each frame.
missing_values = []
nonumeric_values = []

names = ['ORDER', 'POI', 'TRAFFIC', 'WEATHER']
features = [order_train, poi_train, traffic_train, weather_train]

# Text form of a number: optional sign, digits with an optional decimal point,
# optional exponent.  Sign and exponent are accepted so that negative floats
# and values printed in scientific notation are not reported as non-numeric.
numeric_pattern = re.compile(r'^[+-]?((\d+\.?\d*)|(\d*\.?\d+))([eE][+-]?\d+)?$')

print("TRAIN SET INFORMATION")
print("========================\n")

for name, feature in zip(names, features):

    print("\n-----------------------")
    print("{} TRAIN INFORMATION".format(name))
    print("-----------------------\n")
    # Start each frame with empty result lists so the summaries do not
    # accumulate across frames.
    missing_values = []
    nonumeric_values = []

    for column in feature:

        # Find all the unique feature values
        uniq = feature[column].unique()
        print("'{}' has {} unique values".format(column, uniq.size))
        if uniq.size > 25:
            print("~~Listing up to 25 unique values~~")
        # Slice end is exclusive: [0:25] shows the 25 values the message
        # promises.
        print(uniq[0:25])
        print("\n-----------------------------------------------------------------------\n")

        # Find features with missing values
        missing_count = pd.isnull(feature[column]).sum()
        if missing_count > 0:
            missing_values.append("{} has {} missing".format(column, missing_count))

        # Find features with non-numeric values.  Every unique value is
        # inspected, starting with the first one; missing entries are skipped
        # rather than ending the scan, and the first value that does not look
        # like a number marks the column.
        for value in uniq:
            if pd.isnull(value):
                continue
            if not numeric_pattern.search(str(value)):
                nonumeric_values.append(column)
                break

    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    print("{} Features with missing values:\n{}\n\n".format(name, missing_values))
    print("{} Features with non-numeric values:\n{}".format(name, nonumeric_values))
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAIN SET INFORMATION


-----------------------
ORDER TRAIN INFORMATION
-----------------------

'order_id' has 8518049 unique values
~~Listing up to 25 unique values~~
['97ebd0c6680f7c0535dbfdead6e51b4b' '92c3ac9251cc9b5aab90b114a1e363be'
 'abeefc3e2aec952468e2fd42a1649640' 'cb31d0be64cda3cc66b46617bf49a05c'
 '139d492189ae5a933122c098f63252b3' 'b0b59fd0fe98bf603972da2f62e6522d'
 '17c1c85144ab532947c7ea724fdcc945' 'd682c1c004024f8937d21cd43498d1bb'
 '6fcae38baf2eb52e17273df41bf6fc6f' '70afc52be8a6d35137f2277a6ca88017'
 '29251671fcc9d9078b760ce3f6f7994b' 'ef10162b38fadcd75a5751bebe450f13'
 '23d8b4bd3443c32d5c08821943b9e779' '1b3b4b633578f88325973e084450af6f'
 '367da7c50e897ef8c11e5f103ca857b8' '1ec11c9499014c583995adfc1198da8d'
 '180470cf2f166e7251daaf8502fee301' 'c9a59b617813c106ea7c8809f5127811'
 'd0341ac9950c029e2ef6212628a94900' 'bf9fcd57aa98208c11de945513e26cce'
 '9a13aebfdf99847fd502815271e45a01' '0dd9ca15cbc2168e07ef61df4f9b6dda'
 '85bcc0e17b9a54454625a4ca1082f412' '7601a5eaee305

In [132]:
# Review input features for test set - Part 2
missing_values = []
nonumeric_values = []

names = ['ORDER', 'POI', 'TRAFFIC', 'WEATHER']
features = [order_test, poi_test, traffic_test, weather_test]

print ("TEST SET INFORMATION")
print ("========================\n")

for name,feature in zip(names,features):
    
    print "\n-----------------------"
    print "{} TEST INFORMATION" .format(name)
    print "-----------------------\n"
    missing_values = []
    nonumeric_values = []
    
    for column in feature:
        
        # Find all the unique feature values
        uniq = feature[column].unique()
        print ("'{}' has {} unique values" .format(column,uniq.size))
        if (uniq.size > 25):
            print("~~Listing up to 25 unique values~~")
        print (uniq[0:24])
        print ("\n-----------------------------------------------------------------------\n")
            
        # Find features with missing values
        if (True in pd.isnull(uniq)):
            s = "{} has {} missing" .format(column, pd.isnull(feature[column]).sum())
            missing_values.append(s)
    
        # Find features with non-numeric values
        for i in range (1, np.prod(uniq.shape)):
            
            if (re.match('nan', str(uniq[i]))):
                break
            
            if not (re.search('(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i]))):
                nonumeric_values.append(column)
                break
  
    print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    print ("{} Features with missing values:\n{}\n\n" .format(name, missing_values))
    print ("{} Features with non-numeric values:\n{}" .format(name, nonumeric_values))
    print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TEST SET INFORMATION


-----------------------
ORDER TEST INFORMATION
-----------------------

'order_id' has 556729 unique values
~~Listing up to 25 unique values~~
['e37f842c2a37de68e16466a3c56b916b' '62588e55ff8892ba38a0bbe5678be272'
 '693194e4d57cdd500e793c0c1e4f7a93' '3a0c651d2558d9083a66fc179e3ba81c'
 '68b7cec210c7f875b79ce32dab7195ea' '6f34f29e68d317e3ca6dbd9ebeb24d5b'
 'e502c5addc7ccae08e6d8c2b5146cdb4' 'e56eeb16aaa407faab11732a5feda7fd'
 '6bd4f5447add2c3002656897f95c96d6' '9aefc579a9372ee293d584912b847cc5'
 '340a55eb95558104b496f562d865121b' 'dcddad7a414b0db2896d3571c9435dc8'
 '93f8022a1076295847597b1d1bcbbd78' 'fd52e9101d14c221ebc75aa79d39b673'
 '0593f172ee33c172142da384b567887b' 'feff7411b44fea2a6bd8da2ad4932803'
 '7feed561a986c5f579248cf818e96767' '4e6517efdcdad1ab5c6b016ca856639d'
 '4dd56b45a6f68af822da1be742704e90' 'ea6655f094e34868681747eb2bc1ce19'
 '0e9c8771c221ad21704551348ed2b1ef' '4c1cce63533b275e83b4e5d2b10e5899'
 'd3e0884b1d0eec7d702ea509c4f108ec' '2ebd0a61b632c919