# Business License Status Prediction

## Goal: perform a multi-class classification of business license status using the given dataset.

### Importing libraries

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections

# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import NearMiss
# from imblearn.metrics import classification_report_imbalanced
# from collections import Counter
# from sklearn.model_selection import KFold, StratifiedKFold

import random
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Importing the datasets

In [None]:
# Importing the dataset
train_data = pd.read_csv("train_file.csv")
train_data.head()

In [None]:
train_data.shape

In [None]:
# Importing the dataset
test_data = pd.read_csv("test_file.csv")
test_data.head()

In [None]:
test_data.shape

In [None]:
# Check for any other unusable (NaN) values
print(pd.isnull(train_data).sum())

In [None]:
# Check for any other unusable (NaN) values
print(pd.isnull(test_data).sum())

In [None]:
my_submission = pd.DataFrame({'ID': test_data.ID})
my_submission.head()

In [None]:
my_submission['ID'].count()

In [None]:
my_submission.shape

In [None]:
train_data.columns

In [None]:
# Studying the target variable
train_data['LICENSE STATUS'].value_counts()

In [None]:
# Bar chart of how many rows fall into each license status.
plt.title('Distribution of target variable')
# Pass the column as the keyword argument `x`: newer seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=train_data['LICENSE STATUS'])
plt.show()

In [None]:
# Identifying the unique number of values in the dataset
train_data.nunique()

## Data Preprocessing

In [None]:
# Identifier, free-text address and raw location/date columns that are not
# used as model features; the same set is removed from both frames.
unused_columns = [
    'LICENSE ID',
    'ID',
    'ADDRESS',
    'STATE',
    'LICENSE NUMBER',
    'APPLICATION CREATED DATE',
    'LOCATION',
    'LICENSE STATUS CHANGE DATE',
    'WARD PRECINCT',
    'LATITUDE',
    'LONGITUDE',
]

train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [None]:
import missingno as msno
# Missing data in form of white lines 
msno.matrix(train_data)
plt.show()

In [None]:
train_data.isna().sum()

In [None]:
train_data.shape

In [None]:
train_data = train_data.dropna(thresh = 16)

In [None]:
train_data['LICENSE STATUS'].value_counts()

In [None]:
# Parse every date-like text column of the training frame into datetimes so
# that the year can be extracted from each of them afterwards.
train_date_columns = [
    'PAYMENT DATE',
    'APPLICATION REQUIREMENTS COMPLETE',
    'LICENSE TERM EXPIRATION DATE',
    'LICENSE TERM START DATE',
    'DATE ISSUED',
    'LICENSE APPROVED FOR ISSUANCE',
]
for date_column in train_date_columns:
    train_data[date_column] = pd.to_datetime(train_data[date_column])

In [None]:
# Parse every date-like text column of the test frame into datetimes, mirroring
# the conversion applied to the training frame.
test_date_columns = [
    'PAYMENT DATE',
    'APPLICATION REQUIREMENTS COMPLETE',
    'LICENSE TERM EXPIRATION DATE',
    'LICENSE TERM START DATE',
    'DATE ISSUED',
    'LICENSE APPROVED FOR ISSUANCE',
]
for date_column in test_date_columns:
    test_data[date_column] = pd.to_datetime(test_data[date_column])

In [None]:
train_data['PAYMENT YEAR'] = train_data['PAYMENT DATE'].apply(lambda x: x.year)

In [None]:
test_data['PAYMENT YEAR'] = test_data['PAYMENT DATE'].apply(lambda x: x.year)

In [None]:
train_data = train_data.drop(['PAYMENT DATE'], axis = 1)

In [None]:
test_data = test_data.drop(['PAYMENT DATE'], axis = 1)

In [None]:
train_data['APPLICATION REQUIREMENTS COMPLETE YEAR'] = train_data['APPLICATION REQUIREMENTS COMPLETE'].apply(lambda x: x.year)

In [None]:
test_data['APPLICATION REQUIREMENTS COMPLETE YEAR'] = test_data['APPLICATION REQUIREMENTS COMPLETE'].apply(lambda x: x.year)

In [None]:
train_data = train_data.drop(['APPLICATION REQUIREMENTS COMPLETE'], axis = 1)

In [None]:
test_data = test_data.drop(['APPLICATION REQUIREMENTS COMPLETE'], axis = 1)

In [None]:
train_data['DATE ISSUED YEAR'] = train_data['DATE ISSUED'].apply(lambda x: x.year)

In [None]:
test_data['DATE ISSUED YEAR'] = test_data['DATE ISSUED'].apply(lambda x: x.year)

In [None]:
train_data = train_data.drop(['DATE ISSUED'], axis = 1)

In [None]:
test_data = test_data.drop(['DATE ISSUED'], axis = 1)

In [None]:
train_data['LICENSE TERM EXPIRATION YEAR'] = train_data['LICENSE TERM EXPIRATION DATE'].apply(lambda x: x.year)
train_data['LICENSE TERM START YEAR'] = train_data['LICENSE TERM START DATE'].apply(lambda x: x.year)

In [None]:
test_data['LICENSE TERM EXPIRATION YEAR'] = test_data['LICENSE TERM EXPIRATION DATE'].apply(lambda x: x.year)
test_data['LICENSE TERM START YEAR'] = test_data['LICENSE TERM START DATE'].apply(lambda x: x.year)

In [None]:
train_data['LICENSE TERM EXPIRATION YEAR'] = train_data['LICENSE TERM EXPIRATION YEAR'].fillna(round(train_data['LICENSE TERM EXPIRATION YEAR'].mean()))

In [None]:
test_data['LICENSE TERM EXPIRATION YEAR'] = test_data['LICENSE TERM EXPIRATION YEAR'].fillna(round(test_data['LICENSE TERM EXPIRATION YEAR'].mean()))

In [None]:
train_data['LICENSE TERM START YEAR'] = train_data['LICENSE TERM START YEAR'].fillna(round(train_data['LICENSE TERM START YEAR'].mean()))

In [None]:
test_data['LICENSE TERM START YEAR'] = test_data['LICENSE TERM START YEAR'].fillna(round(test_data['LICENSE TERM START YEAR'].mean()))

In [None]:
train_data['LICENSE VALIDITY'] = abs(train_data['LICENSE TERM EXPIRATION YEAR'] - train_data['LICENSE TERM START YEAR'])

In [None]:
test_data['LICENSE VALIDITY'] = abs(test_data['LICENSE TERM EXPIRATION YEAR'] - test_data['LICENSE TERM START YEAR'])

In [None]:
train_data = train_data.drop(['LICENSE TERM EXPIRATION DATE'], axis = 1)
train_data = train_data.drop(['LICENSE TERM EXPIRATION YEAR'], axis = 1)

In [None]:
test_data = test_data.drop(['LICENSE TERM EXPIRATION DATE'], axis = 1)
test_data = test_data.drop(['LICENSE TERM EXPIRATION YEAR'], axis = 1)

In [None]:
train_data = train_data.drop(['LICENSE TERM START DATE'], axis = 1)
train_data = train_data.drop(['LICENSE TERM START YEAR'], axis = 1)

In [None]:
test_data = test_data.drop(['LICENSE TERM START DATE'], axis = 1)
test_data = test_data.drop(['LICENSE TERM START YEAR'], axis = 1)

In [None]:
train_data['LICENSE APPROVED YEAR'] = train_data['LICENSE APPROVED FOR ISSUANCE'].apply(lambda x: x.year)

In [None]:
test_data['LICENSE APPROVED YEAR'] = test_data['LICENSE APPROVED FOR ISSUANCE'].apply(lambda x: x.year)

In [None]:
train_data = train_data.drop(['LICENSE APPROVED FOR ISSUANCE'], axis = 1)

In [None]:
test_data = test_data.drop(['LICENSE APPROVED FOR ISSUANCE'], axis = 1)

In [None]:
train_data.head()

In [None]:
train_data['LEGAL NAME'] = train_data['LEGAL NAME'].fillna('None')
train_data['DOING BUSINESS AS NAME'] = train_data['DOING BUSINESS AS NAME'].fillna('None')

In [None]:
test_data['LEGAL NAME'] = test_data['LEGAL NAME'].fillna('None')
test_data['DOING BUSINESS AS NAME'] = test_data['DOING BUSINESS AS NAME'].fillna('None')

In [None]:
# Upper-case both name columns so the upper-case substring checks that derive
# LEGAL TYPE ('INC', 'LLC', 'CO', 'LTD', ...) also match lower/mixed-case names.
# The result must be assigned back: the transformation returns a new Series
# and does not modify the frame in place.
train_data['LEGAL NAME'] = train_data['LEGAL NAME'].str.upper()
train_data['DOING BUSINESS AS NAME'] = train_data['DOING BUSINESS AS NAME'].str.upper()
train_data.head()

In [None]:
# Upper-case both name columns so the upper-case substring checks that derive
# LEGAL TYPE ('INC', 'LLC', 'CO', 'LTD', ...) also match lower/mixed-case names.
# The result must be assigned back: the transformation returns a new Series
# and does not modify the frame in place.
test_data['LEGAL NAME'] = test_data['LEGAL NAME'].str.upper()
test_data['DOING BUSINESS AS NAME'] = test_data['DOING BUSINESS AS NAME'].str.upper()
test_data.head()

In [None]:
train_data['LEGAL NAME'] = train_data['LEGAL NAME'].str.replace('.', '', regex=False)
train_data['DOING BUSINESS AS NAME'] = train_data['DOING BUSINESS AS NAME'].str.replace('.', '', regex=False)

In [None]:
test_data['LEGAL NAME'] = test_data['LEGAL NAME'].str.replace('.', '', regex=False)
test_data['DOING BUSINESS AS NAME'] = test_data['DOING BUSINESS AS NAME'].str.replace('.', '', regex=False)

In [None]:
train_data['LEGAL TYPE'] = 'PVT'
train_data.head()

In [None]:
test_data['LEGAL TYPE'] = 'PVT'
test_data.head()

In [None]:
# Tag a row as 'INC' when either name column mentions it. 'INC' is a substring
# of 'INCORPORATED', so a single test per column covers both spellings.
train_inc_mask = (
    train_data['LEGAL NAME'].str.contains('INC')
    | train_data['DOING BUSINESS AS NAME'].str.contains('INC')
)
train_data['LEGAL TYPE'] = np.where(train_inc_mask, 'INC', train_data['LEGAL TYPE'])
train_data.head()

In [None]:
# Tag a row as 'INC' when either name column mentions it. 'INC' is a substring
# of 'INCORPORATED', so a single test per column covers both spellings.
test_inc_mask = (
    test_data['LEGAL NAME'].str.contains('INC')
    | test_data['DOING BUSINESS AS NAME'].str.contains('INC')
)
test_data['LEGAL TYPE'] = np.where(test_inc_mask, 'INC', test_data['LEGAL TYPE'])
test_data.head()

In [None]:
# Tag a row as 'LLC' when either name column contains that token.
train_llc_mask = (
    train_data['LEGAL NAME'].str.contains('LLC')
    | train_data['DOING BUSINESS AS NAME'].str.contains('LLC')
)
train_data['LEGAL TYPE'] = np.where(train_llc_mask, 'LLC', train_data['LEGAL TYPE'])
train_data.head()

In [None]:
# Tag a row as 'LLC' when either name column contains that token.
test_llc_mask = (
    test_data['LEGAL NAME'].str.contains('LLC')
    | test_data['DOING BUSINESS AS NAME'].str.contains('LLC')
)
test_data['LEGAL TYPE'] = np.where(test_llc_mask, 'LLC', test_data['LEGAL TYPE'])
test_data.head()

In [None]:
# Tag a row as 'CORP' when either name column contains 'CO'. 'CO' is a
# substring of both 'CORP' and 'CORPORATION', so one test per column is
# equivalent to checking all three spellings.
# NOTE(review): 'CO' also matches unrelated words (any name containing those
# two letters) and overwrites an 'INC'/'LLC' tag set earlier - confirm that
# this breadth is intended.
train_corp_mask = (
    train_data['LEGAL NAME'].str.contains('CO')
    | train_data['DOING BUSINESS AS NAME'].str.contains('CO')
)
train_data['LEGAL TYPE'] = np.where(train_corp_mask, 'CORP', train_data['LEGAL TYPE'])
train_data.head()

In [None]:
# Tag a row as 'CORP' when either name column contains 'CO'. 'CO' is a
# substring of both 'CORP' and 'CORPORATION', so one test per column is
# equivalent to checking all three spellings.
# NOTE(review): 'CO' also matches unrelated words (any name containing those
# two letters) and overwrites an 'INC'/'LLC' tag set earlier - confirm that
# this breadth is intended.
test_corp_mask = (
    test_data['LEGAL NAME'].str.contains('CO')
    | test_data['DOING BUSINESS AS NAME'].str.contains('CO')
)
test_data['LEGAL TYPE'] = np.where(test_corp_mask, 'CORP', test_data['LEGAL TYPE'])
test_data.head()

In [None]:
# Tag a row as 'LTD' when either name column contains 'LTD' or 'LIMITED'
# (the pattern is a regular-expression alternation of the two spellings).
train_ltd_mask = (
    train_data['LEGAL NAME'].str.contains('LTD|LIMITED')
    | train_data['DOING BUSINESS AS NAME'].str.contains('LTD|LIMITED')
)
train_data['LEGAL TYPE'] = np.where(train_ltd_mask, 'LTD', train_data['LEGAL TYPE'])
train_data.head()

In [None]:
# Tag a row as 'LTD' when either name column contains 'LTD' or 'LIMITED'
# (the pattern is a regular-expression alternation of the two spellings).
test_ltd_mask = (
    test_data['LEGAL NAME'].str.contains('LTD|LIMITED')
    | test_data['DOING BUSINESS AS NAME'].str.contains('LTD|LIMITED')
)
test_data['LEGAL TYPE'] = np.where(test_ltd_mask, 'LTD', test_data['LEGAL TYPE'])
test_data.head()

In [None]:
train_data['LEGAL TYPE'].value_counts()

In [None]:
train_data = train_data.drop(['LEGAL NAME'], axis = 1)
train_data = train_data.drop(['DOING BUSINESS AS NAME'], axis = 1)

In [None]:
test_data = test_data.drop(['LEGAL NAME'], axis = 1)
test_data = test_data.drop(['DOING BUSINESS AS NAME'], axis = 1)

In [None]:
train_data.head()

In [None]:
# Collapse the class/sub-type variants of a license description into one
# umbrella label each. Keys are matched as whole cell values; the spellings
# (including 'Facilty' / 'vegtables') are kept exactly as written.
train_description_groups = {
    'Motor Vehicle Repair : Engine Only (Class II)': 'Motor Vehicle Repair',
    'Motor Vehicle Repair: Engine/Body(Class III)': 'Motor Vehicle Repair',
    'Motor Vehicle Repair; Specialty(Class I)': 'Motor Vehicle Repair',
    'Day Care Center Under 2 Years': 'Day Care Center',
    'Day Care Center 2 - 6 Years': 'Day Care Center',
    'Day Care Center Under 2 and 2 - 6 Years': 'Day Care Center',
    'Peddler, non-food': 'Peddler',
    'Peddler, non-food, special': 'Peddler',
    'Peddler, food (fruits and vegtables only)': 'Peddler',
    'Peddler,food - (fruits and vegetables only) - special': 'Peddler',
    'Tire Facilty Class I (100 - 1,000 Tires)': 'Tire Facilty',
    'Tire Facility Class II (1,001 - 5,000 Tires)': 'Tire Facilty',
    'Tire Facility Class III (5,001 - More Tires)': 'Tire Facilty',
    'Repossessor Class A': 'Repossessor',
    'Repossessor Class B': 'Repossessor',
    'Repossessor Class B Employee': 'Repossessor',
    'Expediter - Class B': 'Expediter',
    'Expediter - Class A': 'Expediter',
    'Expediter - Class B Employee': 'Expediter',
    'Itinerant Merchant, Class II': 'Itinerant Merchant',
    'Itinerant Merchant, Class I': 'Itinerant Merchant',
}
train_data['LICENSE DESCRIPTION'] = train_data['LICENSE DESCRIPTION'].replace(train_description_groups)
train_data['LICENSE DESCRIPTION'].value_counts()

In [None]:
# Collapse the class/sub-type variants of a license description into one
# umbrella label each. Keys are matched as whole cell values; the spellings
# (including 'Facilty' / 'vegtables') are kept exactly as written.
test_description_groups = {
    'Motor Vehicle Repair : Engine Only (Class II)': 'Motor Vehicle Repair',
    'Motor Vehicle Repair: Engine/Body(Class III)': 'Motor Vehicle Repair',
    'Motor Vehicle Repair; Specialty(Class I)': 'Motor Vehicle Repair',
    'Day Care Center Under 2 Years': 'Day Care Center',
    'Day Care Center 2 - 6 Years': 'Day Care Center',
    'Day Care Center Under 2 and 2 - 6 Years': 'Day Care Center',
    'Peddler, non-food': 'Peddler',
    'Peddler, non-food, special': 'Peddler',
    'Peddler, food (fruits and vegtables only)': 'Peddler',
    'Peddler,food - (fruits and vegetables only) - special': 'Peddler',
    'Tire Facilty Class I (100 - 1,000 Tires)': 'Tire Facilty',
    'Tire Facility Class II (1,001 - 5,000 Tires)': 'Tire Facilty',
    'Tire Facility Class III (5,001 - More Tires)': 'Tire Facilty',
    'Repossessor Class A': 'Repossessor',
    'Repossessor Class B': 'Repossessor',
    'Repossessor Class B Employee': 'Repossessor',
    'Expediter - Class B': 'Expediter',
    'Expediter - Class A': 'Expediter',
    'Expediter - Class B Employee': 'Expediter',
    'Itinerant Merchant, Class II': 'Itinerant Merchant',
    'Itinerant Merchant, Class I': 'Itinerant Merchant',
}
test_data['LICENSE DESCRIPTION'] = test_data['LICENSE DESCRIPTION'].replace(test_description_groups)
test_data['LICENSE DESCRIPTION'].value_counts()

In [None]:
train_data['APPLICATION TYPE'].value_counts()

In [None]:
train_data['CONDITIONAL APPROVAL'].value_counts()

In [None]:
train_data.isna().sum()

In [None]:
train_data.dtypes

In [None]:
train_data['ZIP CODE'] = train_data['ZIP CODE'].fillna('None')
test_data['ZIP CODE'] = test_data['ZIP CODE'].fillna('None')

In [None]:
# train_data['WARD'] = train_data['WARD'].fillna(random.randint(1,47))
# test_data['WARD'] = test_data['WARD'].fillna(random.randint(1,47))
train_data['WARD'] = train_data['WARD'].fillna(0)
test_data['WARD'] = test_data['WARD'].fillna(0)

In [None]:
# train_data['PRECINCT'] = train_data['PRECINCT'].fillna(random.randint(1,76))
# test_data['PRECINCT'] = test_data['PRECINCT'].fillna(random.randint(1,76))
train_data['PRECINCT'] = train_data['PRECINCT'].fillna(0)
test_data['PRECINCT'] = test_data['PRECINCT'].fillna(0)

In [None]:
# train_data['POLICE DISTRICT'] = train_data['POLICE DISTRICT'].fillna(random.randint(1,25))
# test_data['POLICE DISTRICT'] = test_data['POLICE DISTRICT'].fillna(random.randint(1,25))
train_data['POLICE DISTRICT'] = train_data['POLICE DISTRICT'].fillna(0)
test_data['POLICE DISTRICT'] = test_data['POLICE DISTRICT'].fillna(0)

In [None]:
# train_data['SSA'] = train_data['SSA'].fillna(random.randint(1,69))
# test_data['SSA'] = test_data['SSA'].fillna(random.randint(1,69))
train_data['SSA'] = train_data['SSA'].fillna(0)
test_data['SSA'] = test_data['SSA'].fillna(0)

In [None]:
train_data['PAYMENT YEAR'] = train_data['PAYMENT YEAR'].fillna(round(train_data['PAYMENT YEAR'].mean()))
test_data['PAYMENT YEAR'] = test_data['PAYMENT YEAR'].fillna(round(test_data['PAYMENT YEAR'].mean()))

In [None]:
train_data['LICENSE APPROVED YEAR'] = train_data['LICENSE APPROVED YEAR'].fillna(round(train_data['LICENSE APPROVED YEAR'].mean()))
test_data['LICENSE APPROVED YEAR'] = test_data['LICENSE APPROVED YEAR'].fillna(round(test_data['LICENSE APPROVED YEAR'].mean()))

In [None]:
train_data['APPLICATION REQUIREMENTS COMPLETE YEAR'] = train_data['APPLICATION REQUIREMENTS COMPLETE YEAR'].fillna(round(train_data['APPLICATION REQUIREMENTS COMPLETE YEAR'].mean()))
test_data['APPLICATION REQUIREMENTS COMPLETE YEAR'] = test_data['APPLICATION REQUIREMENTS COMPLETE YEAR'].fillna(round(test_data['APPLICATION REQUIREMENTS COMPLETE YEAR'].mean()))

In [None]:
train_data.isna().sum()

In [None]:
test_data.isna().sum()

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_data.columns

In [None]:
train_data.dtypes

In [None]:
encoder = LabelEncoder()
train_data["CITY"] = encoder.fit_transform(train_data["CITY"])
train_data["ZIP CODE"] = encoder.fit_transform(train_data["ZIP CODE"].astype(str))
train_data["LICENSE DESCRIPTION"] = encoder.fit_transform(train_data["LICENSE DESCRIPTION"])
train_data["APPLICATION TYPE"] = encoder.fit_transform(train_data["APPLICATION TYPE"])
train_data["CONDITIONAL APPROVAL"] = encoder.fit_transform(train_data["CONDITIONAL APPROVAL"])
# train_data["LICENSE STATUS"] = encoder.fit_transform(train_data["LICENSE STATUS"])
train_data["LEGAL TYPE"] = encoder.fit_transform(train_data["LEGAL TYPE"])
train_data.head()

In [None]:
train_data["LICENSE STATUS"].value_counts()

In [None]:
train_data["LICENSE STATUS"] = train_data["LICENSE STATUS"].replace('AAI', 0)
train_data["LICENSE STATUS"] = train_data["LICENSE STATUS"].replace('AAC', 1)
train_data["LICENSE STATUS"] = train_data["LICENSE STATUS"].replace('REV', 2)
train_data["LICENSE STATUS"] = train_data["LICENSE STATUS"].replace('REA', 3)
train_data["LICENSE STATUS"] = train_data["LICENSE STATUS"].replace('INQ', 4)

In [None]:
encoder = LabelEncoder()
test_data["CITY"] = encoder.fit_transform(test_data["CITY"])
test_data["ZIP CODE"] = encoder.fit_transform(test_data["ZIP CODE"].astype(str))
test_data["LICENSE DESCRIPTION"] = encoder.fit_transform(test_data["LICENSE DESCRIPTION"])
test_data["APPLICATION TYPE"] = encoder.fit_transform(test_data["APPLICATION TYPE"])
test_data["CONDITIONAL APPROVAL"] = encoder.fit_transform(test_data["CONDITIONAL APPROVAL"])
test_data["LEGAL TYPE"] = encoder.fit_transform(test_data["LEGAL TYPE"])
test_data.head()

In [None]:
# Finding out the correlation between the features
corr = train_data.corr()
corr.shape

In [None]:
# Plotting the heatmap of correlation between features
plt.figure(figsize=(20,20))
sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')
plt.show()

In [None]:
train_data.head()

In [None]:
# Splitting the target variable from the independent variables.
# NOTE: X and y are assigned again from the undersampled frame further down,
# so these values are only a first, full-data version of the split.
X = train_data.drop(['LICENSE STATUS'], axis = 1)
y = train_data['LICENSE STATUS']

In [None]:
train_data["LICENSE STATUS"].value_counts()

In [None]:
# The target classes are highly skewed, so build a smaller subsample with a
# more even class mix: statuses 2, 3 and 4 are kept in full, while statuses
# 1 and 0 are capped at 600 and 1000 rows respectively.

# Shuffle first so the capped slices below are random draws rather than the
# first rows of the file. The seed is fixed so that the subsample, and every
# score computed from it, is reproducible between runs.
train_data = train_data.sample(frac=1, random_state=42)

# NOTE: the frame names below come from a fraud-detection template and do not
# describe license statuses; they are kept unchanged so that any code relying
# on them keeps working. The number is the encoded LICENSE STATUS value.
fraud_df = train_data[train_data['LICENSE STATUS'] == 4]
pending_df = train_data[train_data['LICENSE STATUS'] == 3]
accepted_df = train_data[train_data['LICENSE STATUS'] == 2]

# Cap the two statuses that are truncated (presumably the majority classes).
non_fraud_df = train_data[train_data['LICENSE STATUS'] == 1][:600]
non_accepted_df = train_data[train_data['LICENSE STATUS'] == 0][:1000]

normal_distributed_df = pd.concat([fraud_df, pending_df, accepted_df, non_fraud_df, non_accepted_df])

# Shuffle the combined rows so the classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
new_df['LICENSE STATUS'].value_counts()

In [None]:
# Bar chart of the class counts in the undersampled frame.
plt.title('Distribution of target variable')
# Pass the column as the keyword argument `x`: newer seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=new_df['LICENSE STATUS'])
plt.show()

In [None]:
# Entire DataFrame
corr = train_data.corr()
plt.figure(figsize=(20,20))
sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')
plt.title("Imbalanced Correlation Matrix \n (don't use for reference)", fontsize=14)
plt.show()

In [None]:
# Our Subsample
sub_sample_corr = new_df.corr()
plt.figure(figsize=(20,20))
sns.heatmap(sub_sample_corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':20}, cmap='Greens')
plt.title('Subsample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()

In [None]:
# Undersampling before cross validating (prone to overfit)
X = new_df.drop('LICENSE STATUS', axis=1)
y = new_df['LICENSE STATUS']

In [None]:
# Splitting the data into training set and testset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 42)

In [None]:
# # Turn the values into an array for feeding the classification algorithms.
# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# Logistic Regression

In [None]:
# Logistic Regression

# Import library for LogisticRegression
from sklearn.linear_model import LogisticRegression

# Create a Logistic regression classifier
logreg = LogisticRegression()

# Train the model using the training sets 
logreg.fit(X_train, y_train)

In [None]:
# Calculating cross validation score
training_score = cross_val_score(logreg, X_train, y_train, cv=10)

In [None]:
# Prediction on test data
y_pred = logreg.predict(X_test)

In [None]:
# Accuracy of the logistic-regression predictions on the held-out split, as a
# percentage rounded to two decimals (precision and recall are not computed here).
acc_logreg = round( metrics.accuracy_score(y_test, y_pred) * 100, 2 )
print( 'Total Accuracy : ', acc_logreg )

In [None]:
# Create confusion matrix function to find out sensitivity and specificity
from sklearn.metrics import confusion_matrix
def draw_cm(actual, predicted, labels=None):
    """Plot the confusion matrix of a (multi-class) prediction as a heatmap.

    Parameters
    ----------
    actual : array-like
        True class labels.
    predicted : array-like
        Predicted class labels, aligned with ``actual``.
    labels : array-like, optional
        Class labels to show, in display order. Defaults to the sorted set of
        labels that occur in ``actual`` or ``predicted``, so every class that
        is present gets its own row and column.

    The matrix is transposed before plotting, so rows are the predicted class
    and columns are the actual class (matching the axis titles).
    """
    if labels is None:
        labels = np.union1d(actual, predicted)
    # `labels` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    cm = confusion_matrix(actual, predicted, labels=labels).T
    # Cells are integer counts, hence the integer annotation format.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.ylabel('Predicted')
    plt.xlabel('Actual')
    plt.show()

In [None]:
# Confusion matrix 
draw_cm(y_test, y_pred)

# Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes

# Import library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets 
model.fit(X_train,y_train)

In [None]:
# Prediction on test set
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the Gaussian Naive Bayes predictions on the held-out split, as a
# percentage rounded to two decimals (precision and recall are not computed here).
acc_nb = round( metrics.accuracy_score(y_test, y_pred) * 100, 2 )
print( 'Total Accuracy : ', acc_nb )

In [None]:
# Confusion matrix 
draw_cm(y_test, y_pred)

# Decision Tree Classifier

In [None]:
# Decision Tree Classifier

# Import Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Create a Decision tree classifier model
clf = DecisionTreeClassifier()

# # Hyperparameter Optimization
# parameters = [{'max_features': ['log2', 'sqrt','auto'], 
#               'criterion': ['entropy', 'gini'],
#               'max_depth': [2, 3, 5, 10, 50], 
#               'min_samples_split': [2, 3, 50, 100],
#               'min_samples_leaf': [1, 5, 8, 10]
#              }
# # Run the grid search
# grid_obj = GridSearchCV(clf, parameters)
# grid_obj = grid_obj.fit(X_train, y_train)

# # Set the clf to the best combination of parameters
# clf = grid_obj.best_estimator_

# Train the model using the training sets 
clf.fit(X_train, y_train)

In [None]:
# Model prediction on train data
y_pred = clf.predict(X_train)

In [None]:
# Rank the features by the fitted decision tree's importance scores and keep
# the ten highest. Column 0 holds the feature name and column 1 its score
# (the plotting cell reads them by those positions).
feature_importance = pd.DataFrame({0: X_train.columns, 1: clf.feature_importances_})
# head(10) starts at the top-ranked row, so the most important feature is
# included in the selection.
feature_importance = feature_importance.sort_values(by=1, ascending=False).head(10)

In [None]:
# Horizontal bar chart: one bar per feature, bar length = importance score.
sns.barplot(x=feature_importance[1], y=feature_importance[0])
# Add labels to the graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# No legend: the bars carry no labelled artists, so a legend would be empty.
plt.show()

In [None]:
# Prediction on test set
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix
draw_cm(y_test, y_pred)

In [None]:
# Accuracy of the decision-tree predictions on the held-out split, as a
# percentage rounded to two decimals (precision and recall are not computed here).
acc_dt = round( metrics.accuracy_score(y_test, y_pred) * 100, 2 )
print( 'Total Accuracy : ', acc_dt )

# Random Forest Classifier

In [None]:
# Random Forest Classifier

# Import library of RandomForestClassifier model
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest Classifier
rf = RandomForestClassifier()

# # Hyperparameter Optimization
# parameters = {'n_estimators': [4, 6, 9, 10, 15], 
#               'max_features': ['log2', 'sqrt','auto'], 
#               'criterion': ['entropy', 'gini'],
#               'max_depth': [2, 3, 5, 10], 
#               'min_samples_split': [2, 3, 5],
#               'min_samples_leaf': [1, 5, 8]
#              }

# # Run the grid search
# grid_obj = GridSearchCV(rf, parameters)
# grid_obj = grid_obj.fit(X_train, y_train)

# # Set the rf to the best combination of parameters
# rf = grid_obj.best_estimator_

# Train the model using the training sets 
rf.fit(X_train,y_train)

In [None]:
# Rank the features by the fitted random forest's importance scores.
feature_imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
# Keep the ten most important features. head() is used instead of indexing the
# Series with a tuple-wrapped slice, which is not a supported Series lookup.
feature_imp = feature_imp.head(10)
# Horizontal bar chart: one bar per feature, bar length = importance score.
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to the graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# No legend: the bars carry no labelled artists, so a legend would be empty.
plt.show()

In [None]:
# Prediction on test data
y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions on the held-out split
draw_cm(y_test, y_pred)

In [None]:
# Accuracy of the random-forest predictions on the held-out split, as a
# percentage rounded to two decimals (precision and recall are not computed here).
acc_rf = round( metrics.accuracy_score(y_test, y_pred) * 100 , 2 )
print( 'Total Accuracy : ', acc_rf )

# SVM Classifier

In [None]:
# SVM Classifier

# Creating scaled set to be used in model to improve the results
# The scaler is fitted on the training split only and then applied to the
# held-out split. X_train / X_test are rebound to the scaled outputs, so the
# SVM and KNN cells that follow train and predict on the scaled features.
# NOTE(review): the scaled outputs are presumably plain arrays, in which case
# X_train.columns is no longer available after this cell - confirm before reuse.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Support vector machines are provided by scikit-learn's svm module
from sklearn import svm

# Support vector classifier with scikit-learn's default settings
svc = svm.SVC()

# Optional hyperparameter search, currently disabled. When enabled it replaces
# `svc` with the best estimator found by the grid search.
# parameters = [
#   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
# ]
# grid_obj = GridSearchCV(svc, parameters)
# grid_obj = grid_obj.fit(X_train, y_train)
# svc = grid_obj.best_estimator_

# Fit the classifier on the (standardized) training split
svc.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the fitted SVM for the standardized hold-out features
y_pred = svc.predict(X=X_test)

In [None]:
# Confusion matrix of the SVM predictions against the hold-out labels
draw_cm(y_test, y_pred)

In [None]:
# SVM: hold-out accuracy as a percentage with two decimals
# (only accuracy is computed in this cell)
svm_fraction_correct = metrics.accuracy_score(y_test, y_pred)
acc_svm = round(svm_fraction_correct * 100, 2)
print('Total Accuracy : ', acc_svm)

# K - Nearest Neighbors

In [None]:
# K - Nearest Neighbors

# KNeighborsClassifier lives in scikit-learn's neighbors module
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier with scikit-learn's default settings
knn = KNeighborsClassifier()

# Optional hyperparameter search, currently disabled. When enabled it replaces
# `knn` with the best estimator found by the grid search.
# parameters = {'n_neighbors': [3, 4, 5, 10],
#               'weights': ['uniform', 'distance'],
#               'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
#               'leaf_size' : [10, 20, 30, 50]
#              }
# grid_obj = GridSearchCV(knn, parameters)
# grid_obj = grid_obj.fit(X_train, y_train)
# knn = grid_obj.best_estimator_

# Fit the classifier on the (standardized) training split
knn.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the fitted KNN model for the standardized hold-out features
y_pred = knn.predict(X=X_test)

In [None]:
# Confusion matrix of the KNN predictions against the hold-out labels
draw_cm(y_test, y_pred)

In [None]:
# KNN: hold-out accuracy as a percentage with two decimals
# (only accuracy is computed in this cell)
knn_fraction_correct = metrics.accuracy_score(y_test, y_pred)
acc_knn = round(knn_fraction_correct * 100, 2)
print('Total Accuracy : ', acc_knn)

# XGBoost

In [None]:
# eXtreme Gradient Boosting

# XGBClassifier is the scikit-learn style wrapper shipped with xgboost
from xgboost import XGBClassifier

# A hand-tuned configuration is kept below for reference but is disabled.
# NOTE(review): it uses the binary objective 'binary:logistic'; revisit before
# enabling it for this multi-class target.
# gbm = XGBClassifier(learning_rate = 0.02, n_estimators= 2000, max_depth= 4, min_child_weight= 2, gamma=1, subsample=0.8,
#                     colsample_bytree=0.8, objective= 'binary:logistic', nthread= -1, scale_pos_weight=1)

# Gradient-boosted classifier with the library's default settings
gbm = XGBClassifier()

# Fit on the training split (standardized arrays at this point in the notebook)
gbm.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the hold-out features with the fitted XGBoost model
y_pred = gbm.predict(X_test)

In [None]:
# XGBoost: hold-out accuracy as a percentage with two decimals
# (only accuracy is computed in this cell)
xgb_fraction_correct = metrics.accuracy_score(y_test, y_pred)
acc_xgb = round(xgb_fraction_correct * 100, 2)
print('Total Accuracy : ', acc_xgb)

In [None]:
# Confusion matrix of the XGBoost predictions against the hold-out labels
draw_cm(y_test, y_pred)

# Artificial Neural Network

In [None]:
# Separate the label column (y) from the predictor columns (X)
target_column = 'LICENSE STATUS'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

In [None]:
# Hold out 25% of the rows for evaluation. The split is stratified on the
# target and seeded so it is reproducible.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.25, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Feature scaling: learn the standardization from the training split only,
# then apply that same transform to both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Importing the Keras libraries and packages
import keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Initializing the CNN
classifier = Sequential()

In [None]:
# Adding the input layer and the first hidden layer
classifier.add(Dense(10, init = 'uniform', activation = 'relu', input_dim = 18))

In [None]:
# Adding the second hidden layer
classifier.add(Dense(units = 10, init = 'uniform', activation = 'relu'))

In [None]:
# Adding the third hidden layer
classifier.add(Dense(units = 10, init = 'uniform', activation = 'relu'))

In [None]:
# Adding the fourth hidden layer
classifier.add(Dense(units = 10, init = 'uniform', activation = 'relu'))

In [None]:
# Adding the output layer
classifier.add(Dense(units = 1, init = 'uniform', activation = 'sigmoid'))

In [None]:
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Fitting the ANN to the training set
nn_history = classifier.fit(X_train, y_train, batch_size = 10, nb_epoch = 10, validation_data = (X_test, y_test))

In [None]:
# Per-epoch accuracy on the training data and on the validation (hold-out) data.
# The history key for the accuracy metric depends on the Keras version
# ('acc' in older releases, 'accuracy' in newer ones), so resolve it at runtime.
acc_key = 'acc' if 'acc' in nn_history.history else 'accuracy'
plt.plot(nn_history.history[acc_key])
plt.plot(nn_history.history['val_' + acc_key])
plt.title('Accuracy')
plt.legend(['Train', 'Test'])
plt.show()

In [None]:
# Per-epoch loss on the training data and on the validation (hold-out) data
plt.plot(nn_history.history['loss'])
plt.plot(nn_history.history['val_loss'])
# Title the figure; the previous plt.plot('Loss') tried to plot the string
# as data instead of labelling the chart
plt.title('Loss')
plt.legend(['Train', 'Test'])
plt.show()

In [None]:
# Predicting the test set result: run the trained network on the scaled
# hold-out features. y_pred holds the network's raw output activations,
# not decoded class labels.
y_pred = classifier.predict(X_test)

In [None]:
# Score the ANN the same way as the other models: accuracy on the hold-out
# split, as a percentage with two decimals. The hold-out split was passed as
# validation data to fit(), so the final-epoch validation accuracy is the
# accuracy of the trained network on that split. (The previous code reported
# the best *training* accuracy, which is not comparable to the other acc_* values.)
# The history key depends on the Keras version ('acc' vs. 'accuracy').
acc_key = 'acc' if 'acc' in nn_history.history else 'accuracy'
acc_ann = round(nn_history.history['val_' + acc_key][-1] * 100, 2)
# Display the raw network outputs as the cell result
y_pred

# Evaluation and comparison of all the models

In [None]:
# models = pd.DataFrame({
#     'Model': ['Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'Support Vector Machines', 
#               'K - Nearest Neighbors', 'XGBoost', 'Artificial Neural Networks'],
#     'Score': [acc_logreg, acc_nb, acc_dt, acc_rf, acc_svm, acc_knn, acc_xgb, acc_ann]})
# models.sort_values(by='Score', ascending=False)

In [None]:
# Tabulate the accuracy score recorded for each classifier and show the
# table ordered from best to worst score
model_names = ['Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest',
               'Support Vector Machines', 'K - Nearest Neighbors', 'XGBoost']
model_scores = [acc_logreg, acc_nb, acc_dt, acc_rf, acc_svm, acc_knn, acc_xgb]
models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
models.sort_values(by='Score', ascending=False)

In [None]:
# Predicting on the test dataset: start by previewing the first rows of the
# training frame, for comparison with the test frame shown next
train_data.head()

In [None]:
# Preview the first rows of the test frame that will be scored
test_data.head()

In [None]:
# (rows, columns) of the training frame
train_data.shape

In [None]:
# (rows, columns) of the test frame
test_data.shape

In [None]:
# gbm was fitted on standardized features (X_train had been rebound to the
# StandardScaler output before gbm.fit), so the submission data must go
# through the same kind of transform before prediction; feeding raw values to
# split thresholds learned on scaled values gives meaningless predictions.
# NOTE(review): `sc` is the most recently fitted scaler (refitted in the ANN
# section) — confirm it matches the split gbm was trained on, and that
# test_data has the same feature columns in the same order as the training features.
my_pred = gbm.predict(sc.transform(test_data))

In [None]:
# Sanity check: shape of the prediction array
my_pred.shape

In [None]:
# Attach the predictions to the submission frame (created from test_data.ID
# near the top of the notebook)
my_submission['LICENSE STATUS'] = my_pred

In [None]:
# (rows, columns) of the submission frame after adding the prediction column
my_submission.shape

In [None]:
# Translate the integer class codes in the submission back to their
# license-status labels, one code at a time
status_labels = {0: 'AAI', 1: 'AAC', 2: 'REV', 3: 'REA', 4: 'INQ'}
for code, label in status_labels.items():
    my_submission['LICENSE STATUS'] = my_submission['LICENSE STATUS'].replace(code, label)

In [None]:
# How many submission rows were assigned to each license status
my_submission['LICENSE STATUS'].value_counts()

In [None]:
# Saving predictions to file
# my_submission.to_csv('submission.csv', index=False)

In [None]:
# Show the training rows whose encoded LICENSE STATUS is 4 (the code that the
# submission cell maps to 'INQ')
has_status_4 = train_data['LICENSE STATUS'] == 4
train_data[has_status_4]