In [117]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb


#importing machine learning libraries
import tensorflow as tf
# from tensorflow.kears.models import In
from tensorflow.keras.layers import Dense, LSTM, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pickle



#Secondary imports
import pandas_profiling as pp

## Reading Datasets

In [3]:
train = pd.read_csv('train_file.csv')
test = pd.read_csv('test_file.csv')
sample = pd.read_csv('sample_submission.csv')



  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
sorted(train.columns)

['ACCOUNT NUMBER',
 'ADDRESS',
 'APPLICATION CREATED DATE',
 'APPLICATION REQUIREMENTS COMPLETE',
 'APPLICATION TYPE',
 'CITY',
 'CONDITIONAL APPROVAL',
 'DATE ISSUED',
 'DOING BUSINESS AS NAME',
 'ID',
 'LATITUDE',
 'LEGAL NAME',
 'LICENSE APPROVED FOR ISSUANCE',
 'LICENSE CODE',
 'LICENSE DESCRIPTION',
 'LICENSE ID',
 'LICENSE NUMBER',
 'LICENSE STATUS',
 'LICENSE STATUS CHANGE DATE',
 'LICENSE TERM EXPIRATION DATE',
 'LICENSE TERM START DATE',
 'LOCATION',
 'LONGITUDE',
 'PAYMENT DATE',
 'POLICE DISTRICT',
 'PRECINCT',
 'SITE NUMBER',
 'SSA',
 'STATE',
 'WARD',
 'WARD PRECINCT',
 'ZIP CODE']

## Performing feature processing as discussed in my_approach.pdf

In [5]:
'''So the set of features I'll be using from the first analysis (mentioned in the approach file) will be, ['SITE_NUMBER', 'CITY', 'STATE', 'WARD', 
 'LICENSE_CODE', 'LICENSE_DESCRIPTION', 'LICENSE_TERM_START_DATE' - 'LICENSE_TERM_EXPIRATION_DATE' in days], 
Later I might use other variables as well'''


features = ['SITE NUMBER', 'CITY', 'STATE', 'LICENSE CODE','LICENSE DESCRIPTION', 'LICENSE_DURATION', 'LICENSE_CHANGE']

In [6]:
#Finding duration of license, as it's probably an important factor

train['LICENSE_DURATION'] = list(map(lambda x, y: (x - y).days, pd.to_datetime(train['LICENSE TERM EXPIRATION DATE']), pd.to_datetime(train['LICENSE TERM START DATE'])))
test['LICENSE_DURATION'] = list(map(lambda x, y: (x - y).days, pd.to_datetime(test['LICENSE TERM EXPIRATION DATE']), pd.to_datetime(test['LICENSE TERM START DATE'])))

In [7]:
temp = []
base = list(map(lambda x: str(x),pd.to_datetime(train['LICENSE STATUS CHANGE DATE']).values ))


for i in range(len(base)):
    if base[i] !='NaT':
        temp.append(1)
    else:
        temp.append(0)
    
    
train['LICENSE_CHANGE'] = temp

temp = [] #If the license was changed, it's 1 else 0
base = list(map(lambda x: str(x),pd.to_datetime(test['LICENSE STATUS CHANGE DATE']).values ))


for i in range(len(base)):
    if base[i] !='NaT':
        temp.append(1)
    else:
        temp.append(0)
    
    
test['LICENSE_CHANGE'] = temp

In [8]:
lbl1 = LabelEncoder()
lbl2 = LabelEncoder()
lbl3 = LabelEncoder()
lbl4 = LabelEncoder()


lbl1.fit(pd.concat((train['LICENSE DESCRIPTION'], test['LICENSE DESCRIPTION']), axis=0))
train['LICENSE DESCRIPTION'] = lbl1.transform(train['LICENSE DESCRIPTION'])
test['LICENSE DESCRIPTION'] = lbl1.transform(test['LICENSE DESCRIPTION'])
                                              
lbl2.fit(pd.concat((train['CITY'], test['CITY']), axis=0))
train['CITY'] = lbl2.transform(train['CITY'])
test['CITY'] = lbl2.transform(test['CITY'])
                                              
                                              
lbl3.fit(pd.concat((train['STATE'], test['STATE']), axis=0))
train['STATE'] = lbl3.transform(train['STATE'])
test['STATE'] = lbl3.transform(test['STATE'])
                                              
lbl4.fit(train['LICENSE STATUS'])
train['LICENSE STATUS'] = lbl4.transform(train['LICENSE STATUS'])


## Checking if POLICE DISTRICT is important

In [9]:
print("This is when POLICE DISTRICT is not null",np.unique(train[~train['POLICE DISTRICT'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when POLICE DISTRICT is null", np.unique(train[train['POLICE DISTRICT'].isna()]['LICENSE STATUS'].values, return_counts=True))

This is when POLICE DISTRICT is not null (array([0, 1, 3, 4]), array([24830,  6768,     3,   282]))
This is when POLICE DISTRICT is null (array([0, 1, 2, 4]), array([ 5370, 48632,     2,     8]))


In [10]:
lbl4.inverse_transform([1])

array(['AAI'], dtype=object)

In [11]:
features.append('POLICE DISTRICT')

#### As we can see, the LICENSE STATUS of AAI was significantly higher when POLICE DISTRICT was null, next highest term is AAC which is higher when POLICE DISTRICT is not null

In [12]:
train['POLICE DISTRICT'].fillna(0, inplace=True)

## Checking if WARD and WARD PRECINCT is important

In [13]:
# Checking for WARD first
print("This is when WARD is not null",np.unique(train[~train['WARD'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when WARD is null", np.unique(train[train['WARD'].isna()]['LICENSE STATUS'].values, return_counts=True))
print()
# Checking for WARD  PRECINCT first
print("This is when WARD PRECINCT is not null",np.unique(train[~train['WARD PRECINCT'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when WARD PRECINCT is null", np.unique(train[train['WARD PRECINCT'].isna()]['LICENSE STATUS'].values, return_counts=True))

This is when WARD is not null (array([0, 1, 3, 4]), array([25808, 10094,     3,   289]))
This is when WARD is null (array([0, 1, 2, 4]), array([ 4392, 45306,     2,     1]))

This is when WARD PRECINCT is not null (array([0, 1, 3, 4]), array([25809, 10094,     3,   289]))
This is when WARD PRECINCT is null (array([0, 1, 2, 4]), array([ 4391, 45306,     2,     1]))


### They both show very similar effect on LICENSE STATUS, and the results are also very similar to POLICE DISTRICT (as observed in correlation matrix), so not taking these features

## Checking if LATITUDE, LONGITUDE and LOCATION is important

In [147]:
# Checking for LATITUDE first
print("This is when LATITUDE is not null",np.unique(train[~train['LATITUDE'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when LATITUDE is null", np.unique(train[train['LATITUDE'].isna()]['LICENSE STATUS'].values, return_counts=True))
print()
# Checking for LONGITUDE first
print("This is when LONGITUDE is not null",np.unique(train[~train['LONGITUDE'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when LONGITUDE is null", np.unique(train[train['LONGITUDE'].isna()]['LICENSE STATUS'].values, return_counts=True))

print()
# Checking for LOCATION first
print("This is when LOCATION is not null",np.unique(train[~train['LOCATION'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when LOCATION is null", np.unique(train[train['LOCATION'].isna()]['LICENSE STATUS'].values, return_counts=True))

This is when LATITUDE is not null (array([0, 1, 3, 4]), array([25968, 12390,     3,   288]))
This is when LATITUDE is null (array([0, 1, 2, 4]), array([ 4232, 43010,     2,     2]))

This is when LONGITUDE is not null (array([0, 1, 3, 4]), array([25968, 12390,     3,   288]))
This is when LONGITUDE is null (array([0, 1, 2, 4]), array([ 4232, 43010,     2,     2]))

This is when LOCATION is not null (array([0, 1, 3, 4]), array([25968, 12390,     3,   288]))
This is when LOCATION is null (array([0, 1, 2, 4]), array([ 4232, 43010,     2,     2]))


### These three features show very similar effect on LICENSE STATUS, and the results are also very similar to POLICE DISTRICT (as observed in correlation matrix), so not taking these features

## Checking if DOING BUSINESS AS NAME is important

In [14]:
# Checking for DOING BUSINESS AS NAME first
print("This is when DOING BUSINESS AS NAME is not null",np.unique(train[~train['DOING BUSINESS AS NAME'].isna()]['LICENSE STATUS'].values, return_counts=True))
print("This is when DOING BUSINESS AS NAME is null", np.unique(train[train['DOING BUSINESS AS NAME'].isna()]['LICENSE STATUS'].values, return_counts=True))
print()


This is when DOING BUSINESS AS NAME is not null (array([0, 1, 2, 3, 4]), array([30199, 55400,     2,     3,   290]))
This is when DOING BUSINESS AS NAME is null (array([0]), array([1]))



### We can add this to our feature set, but it has many unique values, and considering the size of dataset, it's not recommended to use label encoder for 50k unique values, with dataset of size 80k, so we're ignoring it. 

### Using number of days between DATE ISSUED and LICENSE START DATE as a feature, as it might have some latent information

In [120]:
train['INTERVAL'] = list(map(lambda x, y: (x - y).days, pd.to_datetime(train['DATE ISSUED']), pd.to_datetime(train['LICENSE TERM START DATE'])))
test['INTERVAL'] =  list(map(lambda x, y: (x - y).days, pd.to_datetime(test['DATE ISSUED']), pd.to_datetime(test['LICENSE TERM START DATE'])))

In [122]:
train['INTERVAL'].values[:10]

#As we can see, there are some negative values. Which indicates some of the licenses were valid before they were offically issued, which might prove useful
#depending on domain information. 

array([1334.,  208.,    0.,    0.,   20.,    0.,  365.,   -9.,    0.,
        -23.])

In [126]:
features.append('INTERVAL')
features

['SITE NUMBER',
 'CITY',
 'STATE',
 'LICENSE CODE',
 'LICENSE DESCRIPTION',
 'LICENSE_DURATION',
 'LICENSE_CHANGE',
 'POLICE DISTRICT',
 'INTERVAL',
 'INTERVAL']

# Finally creating training and testing dataset

In [140]:
trainx, testx, trainy, testy = train_test_split(train[features].values, train['LICENSE STATUS'].values, test_size = 0.1)


### Starting construction of models

In [141]:
lgbb = lgb.LGBMClassifier()
lgbb.fit(trainx, trainy)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [148]:
with open('tmp/lgb.pkl','wb+') as handle:
    pickle.dump(lgbb, handle)
    
with open('tmp/lbl1.pkl','wb+') as handle:
    pickle.dump(lbl1, handle)
with open('tmp/lbl2.pkl','wb+') as handle:
    pickle.dump(lbl2, handle)
with open('tmp/lbl3.pkl','wb+') as handle:
    pickle.dump(lbl3, handle)
with open('tmp/lbl4.pkl','wb+') as handle:
    pickle.dump(lbl4, handle)

### From LIGHTGBM

In [143]:
result = lgbb.predict(test[features])

np.unique(result, return_counts=True)

submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['LICENSE STATUS'] = lbl4.inverse_transform(result)

submission.head()

submission.to_csv('submission.csv', index=False)