## Blight Ticket Compliance Prediction 


In [None]:
# import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_curve, auc, plot_roc_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

### Loading the dataset

In [None]:
# Kaggle input location of the blight-violation tickets CSV.
path = '../input/blight-violations-final/Blight_Violations_Final.csv'

# Load every ticket record into a single DataFrame.
Dataset = pd.read_csv(filepath_or_buffer=path)

In [None]:
# General view of the raw data
print(Dataset.shape)
(Dataset.head())

### Data pre processing 

In [None]:
# There are quite number of redundant columns which have either no data or no meaningful information
# Lets check those columns
Dataset.isna().sum()

In [None]:
# X and Y are irrelevant for modelling, so discard them here.
# (clean_up_cost and payment_amount, reported as empty, are not dropped in
# this cell; they are removed later with the other sparse columns.)
Dataset = Dataset.drop(columns=['X', 'Y'])
Dataset.shape

Since we need to predict whether a person will be compliant or not, we need to train our model on data that contains only people who were found guilty (responsible).


In [None]:
Dataset.disposition.unique()

The disposition terms 'by', 'by Default' and 'Responsible by' seem ambiguous.
Let's check the data for these instances.

In [None]:
#For disposition 'Responsible by'
Dataset[Dataset['disposition'] == 'Responsible by'] 

In [None]:
#For disposition 'by Default'
Dataset[Dataset['disposition'] == 'by Default'] 

In [None]:
#For disposition 'by'
Dataset[Dataset['disposition'] == 'by'] 

In [None]:
# Inspect the tickets whose disposition is the bare string 'by':
# how many there are, and how many values each column is missing.
by_only = Dataset[Dataset['disposition'] == 'by']

print(by_only.shape)
by_only.isnull().sum()

From the output above, all cases with disposition 'by' have no judgment date, so these records are too ambiguous to use in the analysis.

Therefore, the dataset needs to be filtered to keep only those who were held responsible.

In [None]:
# Dispositions under which the violator was held responsible for the ticket.
cases = ['Responsible by Default',
        'Responsible by Admission',
       'Responsible by Determination',
       'Responsible (Fine Waived) by Determination',
       'Responsible (Fine Waived) by Admission',
       'Responsible - Compl/Adj by Default',
       'Responsible - Compl/Adj by Determination',
       'Responsible by Dismissal',
       'Responsible (Fine Waived) by City Dismissal']

# Keep only the "responsible" tickets. The explicit copy makes the result an
# independent frame, so adding columns to it later is not a chained
# assignment on a slice of Dataset (SettingWithCopyWarning).
Compliant_dataset = Dataset.loc[Dataset['disposition'].isin(cases)].copy()
Compliant_dataset.shape

The next task is to define whether a person is compliant or not

For that, the following conditions must be applicable for a person to be compliant:
* The person pays fine within 6 months after judgement date
* The person pays full amount for fine
* The person pays fine prior to judgement (By own admission)



In [None]:
# Work on an independent copy so that adding the label column below is not a
# chained assignment on a filtered view of the raw data.
Compliant_dataset = Compliant_dataset.copy()

# Whole calendar months between judgment and payment. Year/month arithmetic
# yields NaN when either date is missing, so the comparisons below are simply
# False for such rows instead of depending on integer casts of missing periods.
payment_dt = pd.to_datetime(Compliant_dataset['payment_date'])
judgment_dt = pd.to_datetime(Compliant_dataset['judgment_date'])
months_to_pay = ((payment_dt.dt.year - judgment_dt.dt.year) * 12
                 + (payment_dt.dt.month - judgment_dt.dt.month))

status = Compliant_dataset['payment_status']

conditions = [
    # Compliant: nothing owed, or paid in full within six months of judgment.
    (status == 'NO PAYMENT DUE')
    | ((status == 'PAID IN FULL') & (months_to_pay <= 6)),
    # Non-compliant: partial / no payment, unknown status, or paid too late.
    status.isin(['PARTIAL PAYMENT APPLIED', 'NO PAYMENT APPLIED'])
    | status.isna()
    | (months_to_pay > 6),
]

# create a list of the values we want to assign for each condition
values = ['Compliant', 'Non_Compliant']

# create a new column and use np.select to assign values to it.
# An explicit default is required: np.select's implicit default is the integer
# 0, which with string choices either produces a stray '0' label or fails to
# find a common dtype, depending on the NumPy version. Rows matching neither
# rule (e.g. paid in full but with a missing date) have not demonstrated
# compliance, so they are labelled 'Non_Compliant'.
Compliant_dataset['Compliance'] = np.select(conditions, values, default='Non_Compliant')

# display updated DataFrame
Compliant_dataset.head()

In [None]:
Compliant_dataset.shape

#### Handling the missing values

In [None]:
# Handle missing values
Compliant_dataset.isnull().sum()

In [None]:
# check the missing value by bar chart

msno.bar(Compliant_dataset)

#### The chart shows that some features have very few non-missing values; we can remove those features/columns.

In [None]:
# Discard the sparsely populated columns together with pure identifier
# columns (e.g. ticket_id), which are keys rather than predictors.
sparse_or_key_columns = [
    'ticket_id', 'ticket_number', "violation_zip_code", "non_us_str_code",
    "country", "payment_amount", "collection_status", "parcelno",
    "clean_up_cost", 'oid',
]
Compliant_dataset_new = Compliant_dataset.drop(columns=sparse_or_key_columns)

In [None]:
# again check the missing values 
Compliant_dataset_new.isnull().sum()

Now only payment_date and payment_status have large missing values.
As these attributes are important, we can't remove them directly. 

When no payment was made, payment_status is null and payment_date is therefore null as well; those payment_date values are replaced with a placeholder date in the year 2099.

In [None]:
# Rows that carry a payment_date but have no payment_status are inconsistent:
# remove them.
has_date_without_status = (Compliant_dataset_new['payment_date'].notnull()
                           & Compliant_dataset_new['payment_status'].isnull())
inconsistent_index = Compliant_dataset_new[has_date_without_status].index
Compliant_dataset_new = Compliant_dataset_new.drop(inconsistent_index)

# Tickets with nothing to pay: use the judgment date as their payment date.
no_payment_due = Compliant_dataset_new['payment_status'] == 'NO PAYMENT DUE'
Compliant_dataset_new.loc[no_payment_due, 'payment_date'] = Compliant_dataset_new['judgment_date']


For the remaining rows, payment_status is null because no payment was made by the non-compliant violator, so the nulls are replaced with 'NO PAYMENT MADE'. Similarly, missing payment_date values are filled with a date in the year 2099, which serves as a placeholder meaning the fine was never paid.

In [None]:
# Fill the remaining gaps: unknown status means no payment was made, and a
# far-future placeholder date stands in for "never paid".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which is a chained assignment
# and may leave the frame unchanged when pandas copy-on-write is active.
Compliant_dataset_new["payment_status"] = Compliant_dataset_new["payment_status"].fillna("NO PAYMENT MADE")
Compliant_dataset_new["payment_date"] = Compliant_dataset_new["payment_date"].fillna("2099/12/12 00:00:00+00")

In [None]:
#Dumping the value for remaining nul records
Compliant_dataset_new = Compliant_dataset_new.dropna()

In [None]:
Compliant_dataset_new.isnull().sum()

#### To understand recent trends and patterns, the dataset is filtered to violations that occurred from 2018 onwards

In [None]:

# Keep violations dated 2018-01-01 or later.
# NOTE(review): violation_date is compared as text here; this is only correct
# if every value starts with a four-digit year (e.g. 'YYYY-MM-DD' or
# 'YYYY/MM/DD') - confirm the column format.
is_recent = Compliant_dataset_new['violation_date'] >= '2018-01-01'
Compliant_dataset_new = Compliant_dataset_new[is_recent]

In [None]:
Compliant_dataset_new.shape

## Visualizations

In [None]:
# Count of Non-Compliance vs Compliance in the dataset

Compliant_dataset_new.Compliance.value_counts().plot(kind='bar', title='Count (compliance)', color=['red', 'green']);

In [None]:
# Build the subset of late payers: tickets that were eventually paid in full,
# but more than six months after the judgment date.

import datetime

# Independent copy, so that adding a column is not a chained assignment on a
# filtered view.
Compliant_dataset_new = Compliant_dataset_new.copy()

# Months between judgment and payment. Both dates are read from
# Compliant_dataset_new (the cleaned frame); the earlier Compliant_dataset
# still has missing payment dates and lacks the placeholder / judgment-date
# substitutions, which made the gap meaningless for unpaid tickets.
payment_month = pd.to_datetime(Compliant_dataset_new['payment_date']).dt.to_period('M').astype(int)
judgment_month = pd.to_datetime(Compliant_dataset_new['judgment_date']).dt.to_period('M').astype(int)
Compliant_dataset_new['month_gap'] = payment_month - judgment_month

# More than 6 months late, but below 100 months: the upper bound filters out
# rows whose payment_date is the far-future (year 2099) placeholder.
C = Compliant_dataset_new.loc[(Compliant_dataset_new['month_gap'] > 6)  & (Compliant_dataset_new['month_gap'] < 100)]
C = C.loc[C['payment_status'] =='PAID IN FULL']


In [None]:
plt.figure(figsize = (15,5))
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
#sns.axes_style('white')
sns.set_context("talk")
sns.countplot(x="month_gap", data=C,palette="Greens_d")
plt.xlabel('Months after compliance period is over')
plt.ylabel('Non_compliant who paid fines')

In [None]:
# Split the tickets by whether any discount was granted, so compliance can be
# compared between the two groups.
got_discount = Compliant_dataset_new['discount_amount'] > 0
no_discount = Compliant_dataset_new['discount_amount'] == 0
D = Compliant_dataset_new[got_discount]
E = Compliant_dataset_new[no_discount]

In [None]:

f, axes = plt.subplots(1, 2, figsize=(18, 5))
f.suptitle('Compliance Measure based on discount')

# One panel per subset: left = discount granted, right = no discount.
panels = [(D, 'Discount provided'), (E, 'Discount Not provided')]
for panel_ax, (subset, panel_title) in zip(axes, panels):
    sns.countplot(x="payment_status", hue='Compliance', data=subset, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set(xlabel="Payment Status", ylabel="Count of People")

# Called after the panels are drawn, as in the original cell order.
sns.set_palette('dark')

In [None]:

# Plot discount_amount against fine_amount for every ticket, coloured by the
# Compliance label.
sns.relplot(data = Compliant_dataset_new, x = 'fine_amount', y = 'discount_amount', hue = 'Compliance')
# The calls below act on global seaborn/matplotlib state and on the current
# axes, so their order is kept exactly as written.
sns.reset_defaults()
plt.title('Fine to Discount Relation')
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
plt.xlabel('Fine Amount')
plt.ylabel('Discount Amount')

In [None]:
# Standardise city names: lowercase everything, then map known abbreviations
# and misspellings (exact matches only) to one canonical spelling.
mendota_variants = ['mendota hts., ','mendota hts.,','mendota hghts.', 'mendota   heights','menndota  heights','mendata  hgt','mendeta height']
city_fixes = {'det': 'detroit', 'menomonee': 'menomonee falls'}
city_fixes.update(dict.fromkeys(mendota_variants, 'mendota heights'))

lowered_city = Compliant_dataset_new['city'].str.lower()
Compliant_dataset_new['city'] = lowered_city.replace(city_fixes)

In [None]:
# Number of compliant tickets per city, and a bar chart of the five cities
# with the most compliant tickets.
Compliant_cities=Compliant_dataset_new[Compliant_dataset_new['Compliance']=='Compliant'].groupby('city')['Compliance'].count()
head = Compliant_cities.sort_values(ascending=False).head(5)
sns.reset_defaults()
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=head.index, y=head.values)
plt.xticks(rotation=45)

In [None]:
# Number of non-compliant tickets per city.
Non_Compliant_cities=Compliant_dataset_new[Compliant_dataset_new['Compliance']=='Non_Compliant'].groupby('city')['Compliance'].count()
# Visualize the five cities with the most non-compliant tickets.
head1 = Non_Compliant_cities.sort_values(ascending=False).head(5)
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=head1.index, y=head1.values)
plt.xticks(rotation=45)

In [None]:
# Per-city compliant counts as a two-column table (city, Compliant_counts).
compliant_table = {
    'city': Compliant_cities.index,
    'Compliant_counts': Compliant_cities.values,
}
Compliant_df = pd.DataFrame(compliant_table)

In [None]:
# Per-city non-compliant counts as a two-column table (city, Non Compliant_counts).
non_compliant_table = {
    'city': Non_Compliant_cities.index,
    'Non Compliant_counts': Non_Compliant_cities.values,
}
Non_Compliant_df = pd.DataFrame(non_compliant_table)

In [None]:
# Combine both per-city tables into one row per city.
# The tables must be joined on the city name: a column-wise concat pairs rows
# by position, which puts counts of unrelated cities on the same row and
# duplicates the 'city' column. The outer join keeps cities present in only
# one table; their missing count is 0.
result = pd.merge(Compliant_df, Non_Compliant_df, on='city', how='outer')
count_columns = ['Compliant_counts', 'Non Compliant_counts']
result[count_columns] = result[count_columns].fillna(0)


In [None]:
# Grouped bar chart: compliant vs. non-compliant ticket counts for the five
# cities with the most compliant tickets.
sns.reset_defaults()
x = np.arange(len(head.index))  # the label locations
width = 0.35  # the width of the bars
sns.set_style('darkgrid')
fig, ax = plt.subplots()
# Look up the non-compliant count of the SAME cities that label the x axis.
# The top-5 non-compliant list (head1) may contain different cities, which
# would pair each tick label with another city's bar. Cities without any
# non-compliant ticket get a count of 0.
non_compliant_top = Non_Compliant_cities.reindex(head.index, fill_value=0)
rects1 = ax.bar(x - width/2, head.values, width, label='Compliant')
rects2 = ax.bar(x + width/2, non_compliant_top.values, width, label='Non-compliant')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Count')
ax.set_title('Compliant and Non-compliant counts of 5 top cities')
ax.set_xticks(x)
ax.set_xticklabels(head.index)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
sns.set_style('darkgrid')
sns.countplot(data=Compliant_dataset_new, x="agency_name", hue="Compliance", palette ='plasma')
plt.xticks(rotation=45)

### Feature Importance (Mutual Information)

In [None]:
# Columns that are label-encoded before feature-importance analysis,
# grouped by theme.
columns = [
    # who / where
    'agency_name',
    'inspector_name',
    'violator_name',
    'violation_street_number',
    'violation_street_name',
    'violator_id',
    'mailing_address_str_name',
    'city',
    'state',
    # when
    'violation_date',
    'ticket_issued_time',
    'hearing_date',
    'hearing_time',
    'judgment_date',
    # what
    'violation_code',
    'violation_description',
    'disposition',
    # amounts
    'fine_amount',
    'admin_fee',
    'state_fee',
    'late_fee',
    'discount_amount',
    'judgment_amount',
    'balance_due',
    # payment and target
    'payment_date',
    'payment_status',
    'violation_address',
    'Compliance',
]


In [None]:
Compliant_dataset_new.shape

In [None]:
Compliant_dataset_new_labeled  = Compliant_dataset_new.copy()

In [None]:
from sklearn import preprocessing

# One LabelEncoder instance, re-fitted on each column in turn; it maps every
# distinct value of the column to an integer code.
label_encoder = preprocessing.LabelEncoder()

# Codes are computed from the un-encoded frame and stored in the labeled copy.
for column_name in columns:
    source_values = Compliant_dataset_new[column_name]
    Compliant_dataset_new_labeled[column_name] = label_encoder.fit_transform(source_values)

In [None]:
Compliant_dataset_new_labeled.head(5)

In [None]:

from sklearn.ensemble import AdaBoostClassifier

# Predictors: every encoded column except the target and three columns that
# are excluded from the importance analysis.
X = Compliant_dataset_new_labeled.drop(['Compliance','mailing_address_str_number','zip_code','violator_id'],axis =1)
y = Compliant_dataset_new_labeled['Compliance']

# Fit an AdaBoost ensemble and read its feature importances
# (the variable keeps its existing name `forest`).
forest = AdaBoostClassifier(n_estimators=100)

forest.fit(X, y)
importances = forest.feature_importances_
# Spread of the importances across the individual estimators, used as error bars.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature positions ordered from least to most important.
indices = np.argsort(importances)

# Plot the feature importances
plt.figure(figsize=(20,15))
plt.title("Feature importances")
plt.barh(range(X.shape[1]), importances[indices],
       color="r", xerr=std[indices], align="center")
# The bars are drawn in sorted order, so the tick labels must be reordered
# with the same `indices`; unsorted X.columns would name the wrong feature
# next to each bar.
plt.yticks(range(X.shape[1]), X.columns[indices])
plt.ylim([-1, X.shape[1]])
plt.show()

In [None]:
# Checking best features for our label
from sklearn.feature_selection import mutual_info_classif,mutual_info_regression
importances = mutual_info_classif(X,y)


In [None]:
plt.figure(figsize=(20,10)) 
feat_import = pd.Series(importances, X.columns)
feat_import.plot(kind='barh', color ='teal')
plt.show()

In [None]:
# Mutual information computed with the regression estimator, plotted per feature.
# NOTE(review): y holds encoded class labels - confirm that the regression
# variant is intended in addition to mutual_info_classif.
importances1 = mutual_info_regression(X, y)

plt.figure(figsize=(20, 10))
feat_import = pd.Series(data=importances1, index=X.columns)
feat_import.plot(kind='barh', color='darkorange')
plt.show()

### Feature Selection

Based on feature importances, selecting the required features for modelling

In [None]:
# Predictor columns retained for modelling, and the target.
# NOTE(review): payment_status, payment_date and judgment_date are the same
# fields the Compliance label was derived from - confirm that keeping them as
# predictors is not target leakage.
selected_columns = [
    'violator_name', 'violation_street_name', 'mailing_address_str_name',
    'judgment_date', 'disposition', 'discount_amount', 'judgment_amount',
    'balance_due', 'late_fee', 'payment_date', 'payment_status',
    'violation_address',
]
features = Compliant_dataset_new_labeled[selected_columns]
label = Compliant_dataset_new_labeled['Compliance']

### Scaling of datasets

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every selected feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
standardScaler = StandardScaler()
scaled_features = standardScaler.fit_transform(features)

# Wrap the scaled array back into a DataFrame carrying the original column names.
Features = pd.DataFrame(data=scaled_features, columns=features.columns)

### Splitting of datasets into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Features,label,test_size=0.2,random_state=10)

### Modelling

In [None]:
# Logistic regression: fit on the training split, score on the test split.
logicRe = LogisticRegression()
logicRe.fit(X_train, y_train)
pred1 = logicRe.predict(X_test)

# Accuracy expressed as a percentage with two decimals.
test_accuracy = accuracy_score(y_test, pred1)
A1 = round(test_accuracy * 100, 2)
print(" Logistic Regression accuracy is", A1)

In [None]:
# Gaussian naive Bayes: fit on the training split, score on the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred2 = gnb.predict(X_test)

# Accuracy expressed as a percentage with two decimals.
test_accuracy = accuracy_score(y_test, pred2)
A2 = round(test_accuracy * 100, 2)
print("Naive-Bayes accuracy without noise is", A2)

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier.
# NOTE(review): the instance is bound to the name SVC, which is also the name
# of the class imported from sklearn.svm at the top of the notebook; later
# cells refer to this instance by that name, so it is kept.
SVC = svm.SVC(kernel='linear', C=0.1, gamma=0.1)
SVC.fit(X_train, y_train)
pred3 = SVC.predict(X_test)

# Accuracy expressed as a percentage with two decimals.
test_accuracy = accuracy_score(y_test, pred3)
A3 = round(test_accuracy * 100, 2)
print("SVM accuracy without noise is", A3)

In [None]:
# Linear classifier trained with stochastic gradient descent
# (hinge loss, L2 penalty, at most 100 passes over the training data).
clf_SGD = SGDClassifier(loss='hinge', penalty='l2', max_iter=100)
clf_SGD.fit(X_train, y_train)
pred4 = clf_SGD.predict(X_test)

# Accuracy expressed as a percentage with two decimals.
test_accuracy = accuracy_score(y_test, pred4)
A4 = round(test_accuracy * 100, 2)
print("stochastic gradient descent (SGD) Classifier accuracy without noise is", A4)

### Confusion Matrix

In [None]:
sns.reset_defaults()
from sklearn.metrics import plot_confusion_matrix

# Fitted models to compare; one confusion matrix per panel of a 2x2 grid.
classifiers = [logicRe, gnb, SVC, clf_SGD]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

for model, panel in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(model, X_test, y_test, ax=panel, cmap='Blues')
    # Title each panel with the estimator's class name.
    panel.title.set_text(type(model).__name__)

plt.tight_layout()
plt.show()

### Classification Reports

In [None]:
# Print precision / recall / F1 on the test split for every model, each
# preceded by its heading.
report_inputs = [
    ('Classification Report for Logistic Regression', pred1),
    ('Classification Report for Naive Bayes', pred2),
    ('Classification Report for SVM', pred3),
    ('Classification Report for SGD CLassifer', pred4),
]
for heading, predictions in report_inputs:
    print(heading)
    print(classification_report(y_test, predictions))

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
k = 10

In [None]:
# k-fold cross-validation of the logistic regression model.
# With no `scoring` argument the estimator's own score is used, which for a
# classifier is mean accuracy (not R^2).
# NOTE(review): the folds are drawn from the test split only.
cv_result = cross_val_score(estimator=logicRe, X=X_test, y=y_test, cv=k)
print('CV Scores for Logistic Regression is: ', cv_result)
print('CV Average score for Logistic regression is: ', cv_result.sum() / k)

In [None]:
# k-fold cross-validation of the naive Bayes model.
# With no `scoring` argument the estimator's own score is used, which for a
# classifier is mean accuracy (not R^2).
# NOTE(review): the folds are drawn from the test split only.
cv_result = cross_val_score(estimator=gnb, X=X_test, y=y_test, cv=k)
print('CV Scores for Naive Bayes is: ', cv_result)
print('CV Average score for Naive Bayes is: ', cv_result.sum() / k)

In [None]:
# k-fold cross-validation of the support vector classifier.
# With no `scoring` argument the estimator's own score is used, which for a
# classifier is mean accuracy (not R^2).
# NOTE(review): the folds are drawn from the test split only.
cv_result = cross_val_score(estimator=SVC, X=X_test, y=y_test, cv=k)
print('CV Scores for SVC is: ', cv_result)
print('CV Average score for SVC is: ', cv_result.sum() / k)

In [None]:
# k-fold cross-validation of the SGD classifier.
# With no `scoring` argument the estimator's own score is used, which for a
# classifier is mean accuracy (not R^2).
# NOTE(review): the folds are drawn from the test split only.
cv_result = cross_val_score(estimator=clf_SGD, X=X_test, y=y_test, cv=k)
print('CV Scores for SGD Classifier is: ', cv_result)
print('CV Average score for SGD Classifier is: ', cv_result.sum() / k)

### AUC-ROC Curve

In [None]:
def roCurves(clfList, X_dev, y_dev):
    """Draw the ROC curve of every classifier on one shared set of axes.

    Parameters
    ----------
    clfList : iterable
        Fitted classifiers accepted by ``plot_roc_curve``.
    X_dev, y_dev
        Evaluation features and labels, forwarded to ``plot_roc_curve``.

    Returns
    -------
    list
        The object returned by ``plot_roc_curve`` for each classifier, in
        the order the classifiers were given.
    """
    # Local import keeps this helper self-contained.
    from itertools import cycle

    roCurveList = []
    _, ax = plt.subplots(1, 1, figsize=(5, 5))
    # Four distinct line styles, reused cyclically: every classifier gets a
    # curve even when more classifiers than styles are supplied (a plain zip
    # would silently drop the extra ones), and the first four classifiers
    # never share a style.
    styleList = ['solid', 'dashed', 'dotted', 'dashdot']

    for clf, sty in zip(clfList, cycle(styleList)):
        roc = plot_roc_curve(clf, X_dev, y_dev, ax=ax, alpha=0.85, lw=2, linestyle=sty)
        roCurveList.append(roc)
    # Diagonal reference line (equal true- and false-positive rate).
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted')
    plt.title('ROC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    return roCurveList

In [None]:
exps = [logicRe, gnb, SVC, clf_SGD]

roCurves(exps, X_test, y_test)

# Save the figure and show
plt.tight_layout()
#plt.savefig('plots/ROCs.png')
plt.show()

#### To conclude, for this dataset the team would suggest the Support Vector Classification model.

#### Other noted highlights through EDA were:
#### --If a certain discount is provided in fine, the chances of person being compliant is very high
#### --If a person is non-compliant, the maximum timeline to expect for him/her to pay is 26 months, after which he can be listed as a permanent defaulter
#### --Most non compliance occurs in high fines
