In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import os
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
orig_train = pd.read_csv('train.csv')
orig_test = pd.read_csv('test.csv')

In [9]:
orig_train.columns

Index(['RowId', 'IntersectionId', 'Latitude', 'Longitude', 'EntryStreetName',
       'ExitStreetName', 'EntryHeading', 'ExitHeading', 'Hour', 'Weekend',
       'Month', 'Path', 'TotalTimeStopped_p20', 'TotalTimeStopped_p40',
       'TotalTimeStopped_p50', 'TotalTimeStopped_p60', 'TotalTimeStopped_p80',
       'TimeFromFirstStop_p20', 'TimeFromFirstStop_p40',
       'TimeFromFirstStop_p50', 'TimeFromFirstStop_p60',
       'TimeFromFirstStop_p80', 'DistanceToFirstStop_p20',
       'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50',
       'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80', 'City'],
      dtype='object')

In [11]:
orig_train[['IntersectionId', 'Latitude', 'Longitude', 'EntryHeading', 'ExitHeading', 'Hour', 'EntryStreetName', 'ExitStreetName', 'DistanceToFirstStop_p80']].drop_duplicates(['IntersectionId'])

Unnamed: 0,IntersectionId,Latitude,Longitude,EntryHeading,ExitHeading,Hour,EntryStreetName,ExitStreetName,DistanceToFirstStop_p80
0,0,33.79166,-84.43003,NW,NW,0,Marietta Boulevard Northwest,Marietta Boulevard Northwest,0.0
654,1,33.75094,-84.39303,NE,NE,0,Peachtree Street Southwest,Peachtree Street Southwest,58.2
1011,2,33.74014,-84.31389,E,E,1,Glenwood Avenue Southeast,Glenwood Avenue Southeast,62.2
1337,4,33.81767,-84.36696,N,N,0,Piedmont Road Northeast,Piedmont Road Northeast,0.0
2882,5,33.77124,-84.38884,E,E,1,North Avenue Northwest,North Avenue Northwest,139.3
3651,6,33.75712,-84.38424,S,S,0,Courtland Street Northeast,Courtland Street Northeast,0.0
3847,9,33.81100,-84.41280,N,NW,7,Howell Mill Road Northwest,Howell Mill Road Northwest,66.5
4029,10,33.65287,-84.39762,W,W,6,Porsche Avenue,Porsche Avenue,0.0
4187,11,33.74027,-84.34924,N,N,16,Moreland Avenue Southeast,Moreland Avenue Southeast,2842.3
4684,13,33.78177,-84.36868,E,S,2,10th Street Northeast,Monroe Drive Northeast,28.0


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

real_features = ['Longitude', 'Latitude', 'Hour', 'Month']
categorical_standard_features = ['Weekend', 'Month', 'City']
categorical_encoding_features = ['EntryStreetName', 'ExitStreetName', 'EntryHeading', 'ExitHeading', 'City']
all_features = categorical_standard_features + categorical_encoding_features + real_features

train = train.fillna(dict(zip(categorical_encoding_features, ['NULL'] * len(categorical_encoding_features))))
test = test.fillna(dict(zip(categorical_encoding_features, ['NULL'] * len(categorical_encoding_features))))

In [4]:
train_entry_street_names = train['EntryStreetName'].unique()
train_exit_street_names = train['ExitStreetName'].unique()

subtest = test[test['EntryStreetName'].isin(train_entry_street_names) & test['ExitStreetName'].isin(train['ExitStreetName'])]

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
 
for column_name in categorical_encoding_features:
    print(column_name)
    le.fit(train[column_name])
    feature_classes = list(le.classes_)
    train[column_name] = pd.DataFrame(le.transform(train[column_name]))
    subtest[column_name] = pd.DataFrame(le.transform(subtest[column_name].astype(str)))

EntryStreetName


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


ExitStreetName
EntryHeading
ExitHeading
City


In [6]:
from sklearn.model_selection import train_test_split

train_set, valid_set = train_test_split(train, test_size=0.1)

In [1]:
orig_train[1000:10000]

NameError: name 'orig_train' is not defined

In [20]:
y_train = train_set['TotalTimeStopped_p20'].values
y_valid = valid_set['TotalTimeStopped_p20'].values

x_train = train_set[all_features]
x_valid = valid_set[all_features]

lgb_train = lgb.Dataset(x_train, y_train,
                        free_raw_data=False,
 feature_name = all_features,
 categorical_feature = categorical_standard_features + categorical_encoding_features
)
lgb_valid = lgb.Dataset(x_valid, y_valid,
                        free_raw_data=False,
                        feature_name = all_features,
 categorical_feature = categorical_standard_features + categorical_encoding_features)

In [22]:
params = {
 'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': 'rmse',
 'min_data': 1,
 'verbose': -1,
    'learning_rate': 0.1,
    'num_leaves': 100,
    'num_iterations': 1000,
    'max_depth':10
}
 
gbm = lgb.train(params, lgb_train, 
                valid_sets = lgb_valid)



[1]	valid_0's rmse: 6.93064
[2]	valid_0's rmse: 6.85804
[3]	valid_0's rmse: 6.7944
[4]	valid_0's rmse: 6.73976
[5]	valid_0's rmse: 6.68742
[6]	valid_0's rmse: 6.64102
[7]	valid_0's rmse: 6.60209
[8]	valid_0's rmse: 6.56687
[9]	valid_0's rmse: 6.5398
[10]	valid_0's rmse: 6.51428
[11]	valid_0's rmse: 6.48567
[12]	valid_0's rmse: 6.4599
[13]	valid_0's rmse: 6.43587
[14]	valid_0's rmse: 6.41437
[15]	valid_0's rmse: 6.39436
[16]	valid_0's rmse: 6.37278
[17]	valid_0's rmse: 6.35748
[18]	valid_0's rmse: 6.34174
[19]	valid_0's rmse: 6.32615
[20]	valid_0's rmse: 6.31516
[21]	valid_0's rmse: 6.30046
[22]	valid_0's rmse: 6.28815
[23]	valid_0's rmse: 6.27313
[24]	valid_0's rmse: 6.26
[25]	valid_0's rmse: 6.25382
[26]	valid_0's rmse: 6.2432
[27]	valid_0's rmse: 6.23205
[28]	valid_0's rmse: 6.22771
[29]	valid_0's rmse: 6.21653
[30]	valid_0's rmse: 6.20437
[31]	valid_0's rmse: 6.19909
[32]	valid_0's rmse: 6.19294
[33]	valid_0's rmse: 6.1904
[34]	valid_0's rmse: 6.17733
[35]	valid_0's rmse: 6.17196
[3

[288]	valid_0's rmse: 5.74018
[289]	valid_0's rmse: 5.74019
[290]	valid_0's rmse: 5.73994
[291]	valid_0's rmse: 5.7398
[292]	valid_0's rmse: 5.73981
[293]	valid_0's rmse: 5.73944
[294]	valid_0's rmse: 5.73893
[295]	valid_0's rmse: 5.73865
[296]	valid_0's rmse: 5.7396
[297]	valid_0's rmse: 5.73918
[298]	valid_0's rmse: 5.73904
[299]	valid_0's rmse: 5.73839
[300]	valid_0's rmse: 5.73771
[301]	valid_0's rmse: 5.73747
[302]	valid_0's rmse: 5.7375
[303]	valid_0's rmse: 5.73697
[304]	valid_0's rmse: 5.73646
[305]	valid_0's rmse: 5.73595
[306]	valid_0's rmse: 5.73198
[307]	valid_0's rmse: 5.73131
[308]	valid_0's rmse: 5.73113
[309]	valid_0's rmse: 5.73115
[310]	valid_0's rmse: 5.73107
[311]	valid_0's rmse: 5.73077
[312]	valid_0's rmse: 5.73067
[313]	valid_0's rmse: 5.72976
[314]	valid_0's rmse: 5.72973
[315]	valid_0's rmse: 5.72947
[316]	valid_0's rmse: 5.72947
[317]	valid_0's rmse: 5.72973
[318]	valid_0's rmse: 5.72936
[319]	valid_0's rmse: 5.72938
[320]	valid_0's rmse: 5.72862
[321]	valid_0

[565]	valid_0's rmse: 5.68713
[566]	valid_0's rmse: 5.6866
[567]	valid_0's rmse: 5.68674
[568]	valid_0's rmse: 5.68669
[569]	valid_0's rmse: 5.68673
[570]	valid_0's rmse: 5.68667
[571]	valid_0's rmse: 5.68685
[572]	valid_0's rmse: 5.68618
[573]	valid_0's rmse: 5.6861
[574]	valid_0's rmse: 5.68606
[575]	valid_0's rmse: 5.68584
[576]	valid_0's rmse: 5.68555
[577]	valid_0's rmse: 5.68568
[578]	valid_0's rmse: 5.68563
[579]	valid_0's rmse: 5.68563
[580]	valid_0's rmse: 5.68562
[581]	valid_0's rmse: 5.68584
[582]	valid_0's rmse: 5.68599
[583]	valid_0's rmse: 5.68604
[584]	valid_0's rmse: 5.68597
[585]	valid_0's rmse: 5.68643
[586]	valid_0's rmse: 5.68663
[587]	valid_0's rmse: 5.68676
[588]	valid_0's rmse: 5.68687
[589]	valid_0's rmse: 5.68675
[590]	valid_0's rmse: 5.68684
[591]	valid_0's rmse: 5.68664
[592]	valid_0's rmse: 5.68681
[593]	valid_0's rmse: 5.6867
[594]	valid_0's rmse: 5.68663
[595]	valid_0's rmse: 5.68649
[596]	valid_0's rmse: 5.68618
[597]	valid_0's rmse: 5.68614
[598]	valid_0

[849]	valid_0's rmse: 5.65454
[850]	valid_0's rmse: 5.65469
[851]	valid_0's rmse: 5.65466
[852]	valid_0's rmse: 5.65496
[853]	valid_0's rmse: 5.65453
[854]	valid_0's rmse: 5.65454
[855]	valid_0's rmse: 5.65466
[856]	valid_0's rmse: 5.65457
[857]	valid_0's rmse: 5.65457
[858]	valid_0's rmse: 5.65301
[859]	valid_0's rmse: 5.65295
[860]	valid_0's rmse: 5.65291
[861]	valid_0's rmse: 5.65287
[862]	valid_0's rmse: 5.65315
[863]	valid_0's rmse: 5.65429
[864]	valid_0's rmse: 5.65348
[865]	valid_0's rmse: 5.65465
[866]	valid_0's rmse: 5.65474
[867]	valid_0's rmse: 5.65468
[868]	valid_0's rmse: 5.65469
[869]	valid_0's rmse: 5.65457
[870]	valid_0's rmse: 5.65459
[871]	valid_0's rmse: 5.65445
[872]	valid_0's rmse: 5.65422
[873]	valid_0's rmse: 5.65407
[874]	valid_0's rmse: 5.65426
[875]	valid_0's rmse: 5.65407
[876]	valid_0's rmse: 5.65378
[877]	valid_0's rmse: 5.65375
[878]	valid_0's rmse: 5.65307
[879]	valid_0's rmse: 5.65294
[880]	valid_0's rmse: 5.6524
[881]	valid_0's rmse: 5.65183
[882]	valid