In [126]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


#importing machine learning libraries
import tensorflow as tf
# from tensorflow.kears.models import In
from tensorflow.keras.layers import Dense, LSTM, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


#Secondary imports
import pandas_profiling as pp

## Reading Datasets

In [99]:
# Load the training file and the test file from the working directory into
# two separate DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train_file.csv', 'test_file.csv'))



In [100]:
# Show the training columns in alphabetical order to see which raw fields exist.
sorted(train.columns)

['ACCOUNT NUMBER',
 'ADDRESS',
 'APPLICATION CREATED DATE',
 'APPLICATION REQUIREMENTS COMPLETE',
 'APPLICATION TYPE',
 'CITY',
 'CONDITIONAL APPROVAL',
 'DATE ISSUED',
 'DOING BUSINESS AS NAME',
 'ID',
 'LATITUDE',
 'LEGAL NAME',
 'LICENSE APPROVED FOR ISSUANCE',
 'LICENSE CODE',
 'LICENSE DESCRIPTION',
 'LICENSE ID',
 'LICENSE NUMBER',
 'LICENSE STATUS',
 'LICENSE STATUS CHANGE DATE',
 'LICENSE TERM EXPIRATION DATE',
 'LICENSE TERM START DATE',
 'LOCATION',
 'LONGITUDE',
 'PAYMENT DATE',
 'POLICE DISTRICT',
 'PRECINCT',
 'SITE NUMBER',
 'SSA',
 'STATE',
 'WARD',
 'WARD PRECINCT',
 'ZIP CODE']

In [101]:
# Finding duration of license, as it's probably an important factor


def _license_duration_days(frame):
    """Return the licence term length in whole days for each row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the 'LICENSE TERM EXPIRATION DATE' and
        'LICENSE TERM START DATE' columns (anything ``pd.to_datetime`` parses).

    Returns
    -------
    pandas.Series
        Expiration date minus start date, in days, on ``frame``'s index.
        Rows where either date is missing come out as NaN.
    """
    term_end = pd.to_datetime(frame['LICENSE TERM EXPIRATION DATE'])
    term_start = pd.to_datetime(frame['LICENSE TERM START DATE'])
    # Vectorised subtraction yields a timedelta Series; `.dt.days` extracts
    # the day component without a Python-level loop.
    return (term_end - term_start).dt.days


# Same derivation for both frames so the feature is computed identically.
train['LICENSE_DURATION'] = _license_duration_days(train)
test['LICENSE_DURATION'] = _license_duration_days(test)

In [102]:
def _has_status_change(frame):
    """Flag rows that carry a licence status change date.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the 'LICENSE STATUS CHANGE DATE' column.

    Returns
    -------
    pandas.Series
        1 where the column parses to a date, 0 where it is missing (NaT),
        on ``frame``'s index.
    """
    change_dates = pd.to_datetime(frame['LICENSE STATUS CHANGE DATE'])
    # notna() is True exactly where the parsed value is not NaT.
    return change_dates.notna().astype('int64')


# Same derivation for both frames so the feature is computed identically.
train['LICENSE_CHANGE'] = _has_status_change(train)
test['LICENSE_CHANGE'] = _has_status_change(test)

In [103]:
# Feature set taken from the first analysis (described in the approach file).
# Entries are the exact DataFrame column names: raw columns use spaces, the two
# derived columns use underscores.
#   - LICENSE_DURATION: days between the licence term start and expiration dates.
#   - LICENSE_CHANGE:   1 if a licence status change date is present, else 0.
# WARD was on the original shortlist but is not used here; other variables may
# be added later.
features = [
    'SITE NUMBER',
    'CITY',
    'STATE',
    'LICENSE CODE',
    'LICENSE DESCRIPTION',
    'LICENSE_DURATION',
    'LICENSE_CHANGE',
]

In [104]:
# Number of rows in the training set (shape of a single column).
train['LICENSE DESCRIPTION'].shape

(85895,)

In [105]:
# Number of rows in the test set (shape of a single column).
test['LICENSE DESCRIPTION'].shape

(57239,)

In [106]:
# Combined row count of the train and test sets, computed from the frames
# instead of typed in by hand so it stays correct if the data changes.
len(train) + len(test)

143134

In [107]:
# One encoder per categorical column. All four stay available as lbl1..lbl4;
# lbl4 is needed later to map predicted class ids back to status labels.
lbl1, lbl2, lbl3, lbl4 = (LabelEncoder() for _ in range(4))

# Feature columns: fit each encoder on train and test stacked together so every
# category seen in either frame gets an id, then overwrite the column in place
# with its integer codes.
# NOTE(review): this overwrites the original string columns, so re-running the
# cell encodes already-encoded values — rerun from the data-loading cell instead.
for encoder, column in ((lbl1, 'LICENSE DESCRIPTION'), (lbl2, 'CITY'), (lbl3, 'STATE')):
    encoder.fit(pd.concat((train[column], test[column]), axis=0))
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])

# Target column: it is only encoded on the training frame.
lbl4.fit(train['LICENSE STATUS'])
train['LICENSE STATUS'] = lbl4.transform(train['LICENSE STATUS'])


In [113]:
# Hold out 20% of the labelled rows for validation. Note that testx/testy are
# this validation slice, not the separate `test` DataFrame.
# random_state is fixed so that Restart & Run All reproduces the same split.
# NOTE(review): the classes are heavily imbalanced; consider `stratify=` so the
# rare classes appear on both sides of the split.
trainx, testx, trainy, testy = train_test_split(
    train[features].values,
    train['LICENSE STATUS'].values,
    test_size=0.2,
    random_state=42,
)

### Starting construction of models

In [148]:
# Input tensor: one value per entry in `features`, so the width follows the
# feature list instead of a hard-coded number.
input_features = Input(shape=(len(features),))

# A layer instance is callable on a tensor and returns a tensor.
output_1 = Dense(10, activation='relu')(input_features)
output_2 = Dense(10, activation='relu')(output_1)
# One softmax unit per class known to the target encoder.
predictions = Dense(len(lbl4.classes_), activation='softmax')(output_2)

# This creates a model that includes
# the Input layer and three Dense layers.
# NOTE(review): the features are fed in unscaled (raw codes and day counts);
# consider standardising them before training.
model = Model(inputs=input_features, outputs=predictions)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_crossentropy'])

# Callbacks: stop once the validation cross-entropy has not improved for
# 7 epochs, and keep the best model seen so far on disk.
checkpoint_path = 'tmp/model_1.ckpt'
early_stopping = EarlyStopping(monitor='val_sparse_categorical_crossentropy', patience=7, verbose=1)
model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_sparse_categorical_crossentropy', save_best_only=True, verbose=1)

# summary() prints the table itself and returns None, so call it on its own
# instead of passing its return value to print().
model.summary()
print('Model Built')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 7)                 0         
_________________________________________________________________
dense_36 (Dense)             (None, 10)                80        
_________________________________________________________________
dense_37 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_38 (Dense)             (None, 5)                 55        
Total params: 245
Trainable params: 245
Non-trainable params: 0
_________________________________________________________________
Model Built None


In [149]:
# Make sure the checkpoint directory exists before training writes to it.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell is safe to re-run and has no check-then-create gap.
os.makedirs('tmp', exist_ok=True)

In [150]:
# Train with the held-out split as validation data. The epoch cap is set very
# high; the early-stopping callback is what is expected to end the run.
model.fit(
    trainx,
    trainy,
    validation_data=(testx, testy),
    epochs=10000,
    batch_size=5000,
    callbacks=[early_stopping, model_checkpoint],
)

Train on 68716 samples, validate on 17179 samples
Epoch 1/10000
 5000/68716 [=>............................] - ETA: 1s - loss: 4.2649 - sparse_categorical_crossentropy: 4.2649
Epoch 00001: val_sparse_categorical_crossentropy improved from inf to 1.59199, saving model to tmp/model_1.ckpt
Epoch 2/10000
 5000/68716 [=>............................] - ETA: 0s - loss: 1.5921 - sparse_categorical_crossentropy: 1.5921
Epoch 00002: val_sparse_categorical_crossentropy improved from 1.59199 to 1.57191, saving model to tmp/model_1.ckpt
Epoch 3/10000
 5000/68716 [=>............................] - ETA: 0s - loss: 1.5717 - sparse_categorical_crossentropy: 1.5717
Epoch 00003: val_sparse_categorical_crossentropy improved from 1.57191 to 1.55138, saving model to tmp/model_1.ckpt
Epoch 4/10000
 5000/68716 [=>............................] - ETA: 0s - loss: 1.5514 - sparse_categorical_crossentropy: 1.5514
Epoch 00004: val_sparse_categorical_crossentropy improved from 1.55138 to 1.53045, saving model to tmp

<tensorflow.python.keras.callbacks.History at 0x7f1ea8735b38>

In [151]:
# Predict class probabilities for the test set. Pass a plain NumPy array, the
# same input type the model was trained on (train[features].values).
# NOTE(review): this uses the in-memory model from the last epoch, not the best
# checkpoint saved to tmp/ — confirm that is intended.
result = model.predict(test[features].values)

In [152]:
# For each row, pick the index of the class with the highest predicted probability.
resultt = result.argmax(axis=1)

In [153]:
# Row and column count of the test frame, including the derived feature columns.
test.shape

(57239, 33)

In [154]:
# Two-column submission frame: the licence identifier and the predicted status,
# with predicted class ids mapped back to their original labels via lbl4.
# NOTE(review): 'ID' is filled from 'LICENSE ID', not from the data's own 'ID'
# column — confirm this is what the submission format expects.
submission = pd.DataFrame({
    'ID': test['LICENSE ID'],
    'LICENSE STATUS': lbl4.inverse_transform(resultt),
})

In [155]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [156]:
# Sanity check: how many test rows were assigned to each predicted class id.
# A single id here means the model predicts the same class for every row.
np.unique(resultt, return_counts=True)

(array([1]), array([57239]))

In [157]:
# Class distribution of the encoded training target, for comparison with the
# distribution of the predictions.
np.unique(train['LICENSE STATUS'].values, return_counts=True)

(array([0, 1, 2, 3, 4]), array([30200, 55400,     2,     3,   290]))