In [None]:
# Importation of Data manipulation libraries
import pandas as pd
import numpy as np
# Importation of visualisation libraries
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV,train_test_split,RandomizedSearchCV
# Imputing missing values and scaling values
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

The objective of this task is to predict the issue times of building permits, so as to identify which permit types matter most and to draw conclusions about the city's development plans from the San Francisco Building Permits data.
It is a regression problem, since the permit issue time is a continuous variable.
It is a supervised machine learning problem, since we have access to both the features and the target variable.

In [None]:
# Never truncate the column listing when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the permit records indexed by permit number; both date columns are
# parsed into datetimes at read time.
Pmtsdata = pd.read_csv(
    '../input/building-permit-applications-data/Building_Permits.csv',
    index_col='Permit Number',
    parse_dates=['Filed Date', 'Issued Date'],
)

In [None]:
#Phase 1:Data cleaning and formatting.
Pmtsdata.head()

In [None]:
Pmtsdata.shape

In [None]:
# Gives us some more information about the dataframe .
Pmtsdata.info()

In [None]:
#Exploring missing values
missing_values_counts = Pmtsdata.isnull().sum()
print(missing_values_counts)

In [None]:
# What if we dropped every column that has at least one missing value?
columns_without_missing_values= Pmtsdata.dropna(axis=1)
columns_without_missing_values.head()
# A lot of data is lost: out of 42 columns only 11 are retained, 31 columns are lost.

In [None]:
# What if we dropped every row that has at least one missing value?
rows_without_missing_values= Pmtsdata.dropna(axis=0)
rows_without_missing_values.head()
# All the data would be lost.

In [None]:
# Fraction of missing values in each column, used to decide which columns to
# drop once they exceed a chosen missing-value threshold.
miss_val_per_column = Pmtsdata.isna().mean()
miss_val_per_column

In [None]:
mis_val_centage = (miss_val_per_column*100).round(3)
mis_val_centage

In [None]:
# Column 0: number of missing values per column; column 1: the same quantity as
# a percentage of all rows. The count is computed here because
# miss_val_per_column holds fractions, which would be mislabelled once the
# next cell renames column 0 to 'Missing Values'.
mis_val_table = pd.concat([Pmtsdata.isnull().sum(), mis_val_centage], axis=1)
mis_val_table

In [None]:
new_table = mis_val_table.rename(columns = {0 : 'Missing Values', 1 : 'Percentage'})
new_table

In [None]:
# Columns with more than 80% of their values missing are marked for removal.
too_sparse = new_table['Percentage'] > 80
missing_columns = new_table.index[too_sparse].tolist()
print('We will remove %d columns'%len(missing_columns))
print('The columns to remove are \n %s'%missing_columns)

In [None]:
# Remove the columns selected by the missing-value threshold.
Pmtsdata = Pmtsdata.drop(columns=missing_columns)
#Now left with 34 columns out of 42.

In [None]:
Pmtsdata.shape

**Standard machine learning models cannot deal with missing values, which means we have to either fill them in or discard any features that contain them. Since we have already removed the features with more than 80% missing values from the dataframe,
and we still have a considerable number of rows to process, we can drop the rows with missing values so as to remain with data that is suitable for our models.**

In [None]:
Pmtsdata.dropna(axis=0,inplace=True) # we can drop rows with missing values

In [None]:
Pmtsdata.isnull().any().sum() #code shows we now have no missing values in the dataframe

In [None]:
Pmtsdata.shape # we are still left with a considerable chunk of data to build our models.

In [None]:
# Duplicate check: build a de-duplicated frame so that its shape can be
# compared with Pmtsdata. Only this separate frame is de-duplicated; Pmtsdata
# itself is left untouched.
NewPmtdata = Pmtsdata.drop_duplicates()

In [None]:
NewPmtdata.shape  # no duplicate rows present

**We shall compute how long it takes for a building permit to be issued by creating the target variable, time in days, from the Filed Date and the Issued Date.**

In [None]:
# Target variable: time elapsed between filing and issuance of each permit.
# It is a timedelta at this point and is converted to whole days further down.
Pmtsdata['Time_in_Days'] = Pmtsdata['Issued Date'] - Pmtsdata['Filed Date']

In [None]:
Pmtsdata['Time_in_Days'].head()# Asnap shot on the time in days taken.

In [None]:
#Convert the time in days from Datetime format to integer
Pmtsdata['Time_in_Days']=Pmtsdata['Time_in_Days'].dt.days

**Phase 2:
The next step is exploratory data analysis, whose purpose is to find anomalies, trends, patterns and relationships that can inform modelling decisions, such as which features correlate strongly with the target. This helps us understand what our data tells us.**

In [None]:
Pmtsdata['Time_in_Days'].describe() # exploring our target variable

In [None]:
# Inspect the permit(s) with the longest issue time. The maximum is looked up
# from the data instead of being hard-coded, so the cell stays correct if the
# upstream cleaning changes which rows survive.
Pmtsdata.loc[Pmtsdata['Time_in_Days'] == Pmtsdata['Time_in_Days'].max()]
# Permit type with longest issue time is of category 3 which involves additions, alterations or repairs

In [None]:
Pmtsdata.loc[Pmtsdata['Time_in_Days']== 0][:5] 
# Permit type with shortest issue time are majorly of category 8

In [None]:
count_in_days=Pmtsdata['Time_in_Days'].value_counts(sort=True)

In [None]:
print(count_in_days.head()) # The highest number of permits are processed in a few hours.

In [None]:
#we also have a number single permit recordes with high processing times.
print(count_in_days.tail())


In [None]:
# Histogram of the target itself, one observation per permit.
# count_in_days holds value_counts() frequencies, so a histogram of it would
# show how often each frequency occurs rather than how long permits take,
# which is what the axis labels below describe.
plt.hist(Pmtsdata['Time_in_Days'], bins = 50, edgecolor = 'k');
plt.xlabel('Time Taken in Days'); plt.ylabel('Count of records'); 
plt.title('Count of days distribution')
plt.show()
#It's a highly skewed distribution of the time taken for various permits to be processed

In [None]:
# Density of the issue time itself, one observation per permit. count_in_days
# holds value_counts() frequencies, so its density would not describe the time
# taken. `shade=False` is dropped: it is the default, and seaborn deprecates
# `shade` in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Time_in_Days'], alpha=0.8)
plt.xlabel('Time Taken in Days')
plt.show() #gives a clear representation of skewness in the time in days taken by various permits

**Visualizing some categorical columns: Permit Type, Permit Type Definition, Current Status, Existing Construction Type**

In [None]:
# Bivariate plot: issue time (bar height) for each permit type.
fig = plt.figure(figsize=(8, 6))
sns.barplot(x='Permit Type', y='Time_in_Days', hue='Permit Type', data=Pmtsdata)
plt.xlabel('Permit Type')
plt.ylabel('Time in Days')
plt.xticks(size=14)
plt.title('Time in days for various  permit types')
plt.show();

It can be observed that permits of type 8 take the least time to process, followed by permit type 3, while permits of type 2 take the most time to process on average.

In [None]:
# Number of records for each permit type.
fig = plt.figure(figsize=(8, 8))
plt.hist(Pmtsdata['Permit Type'], bins=20, edgecolor='black')
plt.xlabel('Permit type')
plt.ylabel('Count')
plt.title('Permit Type Distribution');

The histogram above shows that the other permit types (2, 4, 5, 6 and 7) do not have a significant count of records.
Permit types 8 and 3 are the most commonly issued permit types.

Let's explore the Current Status variable, with emphasis on key values such as issued, revoked and incomplete permit applications.

In [None]:
Pmtdata1 =Pmtsdata[Pmtsdata['Current Status'].isin(['issued','revoked','incomplete']) ] 

In [None]:
Pmtdata1['Current Status'].head()

In [None]:
# Number of permits per permit type, split by current status.
# countplot draws on the axes created here; the figure-level catplot opens its
# own figure, which left the plt.figure() created before it empty.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Permit Type', hue='Current Status', data=Pmtdata1, ax=ax)
ax.set_xlabel('Permit Type')  # the x axis holds permit types; status is the hue
ax.set_ylabel('Number of permits')
ax.set_title('Current status  permit types')
plt.show()

**The bar graph above shows that most permits are issued, a few are revoked and a small number are incomplete, which suggests that a permit is likely to be issued once all the required documents have been submitted.**

In [None]:
# Count plot of Permit Type Definition, coloured by permit type.
# countplot draws on the axes created here; the figure-level catplot opens its
# own figure, which left the plt.figure() created before it empty.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(y='Permit Type Definition', hue='Permit Type', data=Pmtsdata, ax=ax)
ax.set_xlabel('Permit counts')
ax.set_ylabel('Permit Type Definition')
ax.set_title('Number of permits per permit type definition')
plt.show()

**The graph above shows that alteration permits are the most common permit type definition in the data, with the counts of the other permit type definitions being far smaller.**

In [None]:
# Number of permits per permit type, split by existing construction type.
# countplot draws on the axes created here; the figure-level catplot opens its
# own figure, which left the plt.figure() created before it empty.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Permit Type', hue='Existing Construction Type', data=Pmtsdata, ax=ax)
ax.set_xlabel('Permit Types')
ax.set_ylabel('Count of Existing Construction types')
ax.set_title('Count of Permit types for existing construction types')
plt.show()

**Most permits issued belong to Existing Construction Type 5.0, which is reflected across the major permit types 8 and 3, followed by construction types 1.0, 3.0 and 2.0; the fewest permits belong to construction type 4.0.**

**Visualise the highly skewed columns, which include Estimated Cost, Revised Cost and Plansets, and reduce their skewness so as to produce better models.**

In [None]:
# Density of Estimated Cost before any outlier filtering.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Estimated Cost'], alpha=0.8)
plt.show()
# The graph above shows a distribution skewed to the right.

In [None]:
# Outlier filter on Estimated Cost: keep rows strictly between
# Q1 - 3*IQR and Q3 + 3*IQR.
# NOTE(review): both comparisons are strict, so an IQR of 0 keeps no rows.
first_quartile, third_quartile = Pmtsdata['Estimated Cost'].quantile([0.25, 0.75])
iqr = third_quartile - first_quartile
lower_bound = first_quartile - 3 * iqr
upper_bound = third_quartile + 3 * iqr

estimated_cost = Pmtsdata['Estimated Cost']
Pmtsdata = Pmtsdata[(estimated_cost > lower_bound) & (estimated_cost < upper_bound)]

In [None]:
# Density of Estimated Cost after the outlier filter above.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Estimated Cost'], alpha=0.8)
plt.show() #this shows a better shape with decreased skewness

In [None]:
# Density of Plansets before its outlier filter.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Plansets'], alpha=0.8)
plt.show()

In [None]:
# Outlier filter on Plansets: keep rows strictly between
# Q1 - 3*IQR and Q3 + 3*IQR.
# NOTE(review): both comparisons are strict, so an IQR of 0 keeps no rows.
first_quartile, third_quartile = Pmtsdata['Plansets'].quantile([0.25, 0.75])
iqr = third_quartile - first_quartile
lower_bound = first_quartile - 3 * iqr
upper_bound = third_quartile + 3 * iqr

plansets = Pmtsdata['Plansets']
Pmtsdata = Pmtsdata[(plansets > lower_bound) & (plansets < upper_bound)]

In [None]:
# Density of Plansets after the outlier filter above.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Plansets'], alpha=0.8)
plt.show()
#shows decrease in skewness

In [None]:
# Density of Revised Cost before its outlier filter.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Revised Cost'], alpha=0.8)
plt.show()

In [None]:
# Outlier filter on Revised Cost: keep rows strictly between
# Q1 - 3*IQR and Q3 + 3*IQR.
# NOTE(review): both comparisons are strict, so an IQR of 0 keeps no rows.
first_quartile, third_quartile = Pmtsdata['Revised Cost'].quantile([0.25, 0.75])
iqr = third_quartile - first_quartile
lower_bound = first_quartile - 3 * iqr
upper_bound = third_quartile + 3 * iqr

revised_cost = Pmtsdata['Revised Cost']
Pmtsdata = Pmtsdata[(revised_cost > lower_bound) & (revised_cost < upper_bound)]

In [None]:
# Density of Revised Cost after the outlier filter above.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Revised Cost'], alpha=0.8)
plt.show()

In [None]:
# Density of Existing Units before its outlier filter.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Existing Units'], alpha=0.8)
plt.show()

In [None]:
# Outlier filter on Existing Units: keep rows strictly between
# Q1 - 3*IQR and Q3 + 3*IQR.
# NOTE(review): both comparisons are strict, so an IQR of 0 keeps no rows.
first_quartile, third_quartile = Pmtsdata['Existing Units'].quantile([0.25, 0.75])
iqr = third_quartile - first_quartile
lower_bound = first_quartile - 3 * iqr
upper_bound = third_quartile + 3 * iqr

existing_units = Pmtsdata['Existing Units']
Pmtsdata = Pmtsdata[(existing_units > lower_bound) & (existing_units < upper_bound)]

In [None]:
# Density of Existing Units after the outlier filter above.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Existing Units'], alpha=0.8)
plt.show()

In [None]:
# Density of Proposed Units before its outlier filter.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Proposed Units'], alpha=0.8)
plt.show()

In [None]:
# Outlier filter on Proposed Units: keep rows strictly between
# Q1 - 3*IQR and Q3 + 3*IQR.
# NOTE(review): both comparisons are strict, so an IQR of 0 keeps no rows.
first_quartile, third_quartile = Pmtsdata['Proposed Units'].quantile([0.25, 0.75])
iqr = third_quartile - first_quartile
lower_bound = first_quartile - 3 * iqr
upper_bound = third_quartile + 3 * iqr

proposed_units = Pmtsdata['Proposed Units']
Pmtsdata = Pmtsdata[(proposed_units > lower_bound) & (proposed_units < upper_bound)]

In [None]:
# Density of Proposed Units after the outlier filter above.
# `shade=False` is dropped: it is the default, and seaborn deprecates `shade`
# in favour of `fill`.
sns.kdeplot(data=Pmtsdata['Proposed Units'], alpha=0.8)
plt.show()

In [None]:
Pmtsdata.shape #New shape after removing outliers.

In [None]:
# Correlation of every numeric/boolean column with the target, sorted
# ascending. Non-numeric columns are filtered out explicitly because
# DataFrame.corr() no longer drops them silently in pandas >= 2.0 and raises
# on text columns instead.
correlation_matrix = (
    Pmtsdata.select_dtypes(include=['number', 'bool'])
    .corr()['Time_in_Days']
    .sort_values()
)

In [None]:
#printing the most negative and least correlations
correlation_matrix.head(7)

In [None]:
correlation_matrix.tail(7)

In [None]:
types = (Pmtsdata['Permit Type'].value_counts())
types

In [None]:
# Keep only the permit types that have more than 300 records. The counts are
# recomputed here instead of overwriting `types` from its own previous value,
# so re-running this cell gives the same result instead of failing on a list.
permit_type_counts = Pmtsdata['Permit Type'].value_counts()
types = list(permit_type_counts[permit_type_counts > 300].index)
types

In [None]:
# NOTE(review): `types` is a list of permit-type labels at this point, so this
# histogram bins the labels themselves rather than any record counts — confirm
# that this is the intended plot.
fig=plt.figure(figsize=(8,6))
plt.hist(types,bins=50)
plt.show()

In [None]:
# Distribution of the issue time for each retained permit type, overlaid on
# one set of axes.
plt.figure(figsize=(10,8))

for p_type in types:
    # Issue times of the permits of this type only
    issue_times = Pmtsdata.loc[Pmtsdata['Permit Type'] == p_type, 'Time_in_Days']

    # One density curve per permit type; the label feeds the legend below.
    # `shade=False` is dropped: it is the default and seaborn deprecates it.
    sns.kdeplot(issue_times, label=p_type, alpha=0.8)

# Label the plot
plt.xlabel('Time in Days', size = 8); plt.ylabel('Density', size = 8);
# Draw the legend explicitly so each curve can be matched to its permit type
# without relying on seaborn adding one from `label`.
plt.legend(title='Permit Type')
plt.title('Density Plot of Time in Days by Permit Type', size = 8);

In [None]:
cat_col = ['Permit Type','Street Number','Existing Construction Type','Zipcode','Supervisor District']

In [None]:
Pmtsdata[cat_col]=Pmtsdata[cat_col].astype('str')

In [None]:
Pmtsdata.dtypes # to verify that our data types

**We drop Filed Date and Issued Date since they are no longer informative for our modelling.
We drop Record ID since we already have a unique record identifier.
We also drop Location since it is not very informative: Supervisor District already gives us a hint about the area in which the building is located.
We also drop columns that could be collinear with other columns in the data and hence lead to overfitting.
These include: Number of Existing Stories, Estimated Cost and Existing Units. Current Status is dropped as well.**

In [None]:
# Columns that are not used as model inputs.
columns_to_drop = [
    'Filed Date', 'Issued Date', 'Record ID', 'Location',
    'Number of Existing Stories', 'Estimated Cost',
    'Existing Units', 'Current Status',
]
Pmtsdata.drop(columns=columns_to_drop, inplace=True)

In [None]:
Pmtsdata.head(1)# shows that we have dropped the above mentioned columns.

In [None]:
Pmtsdata.shape

In [None]:
# Separate the target (kept as a one-column frame at this point) from the
# predictor columns.
target_column = 'Time_in_Days'
y = Pmtsdata[[target_column]]
X = Pmtsdata.drop(columns=target_column)

In [None]:
# "Cardinality" means the number of unique values in a column
# Collect the object-typed columns that have fewer than 10 distinct values.
categorical_cols = []
for cname in X.columns:
    is_text = X[cname].dtype == "object"
    if is_text and X[cname].nunique() < 10:
        categorical_cols.append(cname)

In [None]:
categorical_colsOH= pd.get_dummies(X[categorical_cols])

In [None]:
categorical_colsOH.shape

In [None]:
categorical_colsOH.head()

In [None]:
from sklearn.preprocessing import  StandardScaler

# Standardise every int64/float64 predictor column of X in place.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics leak into the features — consider
# fitting on the training split only.
numeric_kinds = ['int64', 'float64']
numerical_cols = [cname for cname, dtype in X.dtypes.items() if dtype in numeric_kinds]

scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [None]:
#The code below shows that the numerical columns have minimum skewness.
X[numerical_cols].skew(axis=0)

In [None]:
numerical_data = pd.DataFrame(X[numerical_cols])

In [None]:
numerical_data.head()# taking a snapshot of the numerical data columns

In [None]:
features =pd.concat([categorical_colsOH, numerical_data],axis=1) 

In [None]:
features.head()

In [None]:
features.shape

In [None]:
# Convert y to one-dimensional array (vector)
y = np.array(y).reshape((-1, ))

In [None]:
# Break off test set from training data
X_train, X_test, y_train, y_test =train_test_split(features,y,test_size=0.2,random_state=0)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like or scalar
        Predicted values; a scalar is broadcast against `y_true`.

    Returns
    -------
    float
        Mean of the element-wise absolute differences.
    """
    # Coerce to arrays first so plain Python lists work as well as numpy
    # arrays; list - list would otherwise raise a TypeError.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

In [None]:
# Naive baseline: always predict the median of the training targets, then
# score that constant guess on the held-out test set. The median is taken from
# y_train only, so no test-set information enters the baseline.
baseline_guess = np.median(y_train)

# This gives a low bar that every trained model should beat. The MAE printed
# below is in days (the unit of the target), not a percentage.
print('The baseline guess of number of days taken: %0.2f days' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))

In [None]:
def fit_and_evaluate(model):
    """Fit `model` on the training split and return its test-set MAE.

    Reads the notebook-level X_train, y_train, X_test and y_test, and scores
    the predictions with the notebook's `mae` helper. The model passed in is
    fitted in place.
    """
    model.fit(X_train, y_train)
    return mae(y_test, model.predict(X_test))

In [None]:
lr = LinearRegression()
lr_mae = fit_and_evaluate(lr)

print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)

In [None]:
random_forest = RandomForestRegressor(random_state=0)
random_forest_mae = fit_and_evaluate(random_forest)

print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)

In [None]:
gradient_boosted = GradientBoostingRegressor(random_state=4)
gradient_boosted_mae = fit_and_evaluate(gradient_boosted)

print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)

In [None]:
# Candidate values for the randomized hyperparameter search.

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Loss function to be minimized
# NOTE(review): 'ls' and 'lad' are the legacy spellings; newer scikit-learn
# releases name these losses 'squared_error' and 'absolute_error' — confirm
# against the installed version.
loss = ['ls', 'lad', 'huber']

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]

# Learning rate: how much the contribution of each tree is shrunk
learning_rate = [0.005,0.01,0.05,0.1,0.5]

# Minimum number of samples to split a node
min_samples_split = [2, 4, 6, 10]

# Maximum number of features to consider for making splits.
# 'auto' is left out: for GradientBoostingRegressor it means "all features",
# exactly like None, so it only duplicated a candidate in the search space
# (and the option was later removed from scikit-learn).
max_features = ['sqrt', 'log2', None]

In [None]:
# Define the grid of hyperparameters to search
hyperparameter_grid = {'loss': loss,
                       'learning_rate':learning_rate,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}


In [None]:
# Model to optimise. The fixed seed makes the random feature selection at each
# split (used when max_features is 'sqrt' or 'log2') repeatable, so the search
# results can be reproduced across runs.
model = GradientBoostingRegressor(random_state=42)

In [None]:
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=5, n_iter=30, 
                               scoring = 'neg_mean_absolute_error',
                               n_jobs = -1, verbose = 1, 
                               return_train_score = True,
                               random_state=42)

In [None]:
random_cv.fit(X_train,y_train)

In [None]:
# Get all of the cv results and sort by the test performance
random_results = pd.DataFrame(random_cv.cv_results_).sort_values('mean_test_score', ascending = False)
random_results.head(5)

In [None]:
random_cv.best_estimator_

In [None]:
# Numbers of trees to evaluate in the grid search
trees_grid = {'n_estimators': [100,200,300,400,500, 1500, 2000,2500]}
# NOTE(review): the remaining hyperparameters are fixed by hand here —
# presumably copied from the randomized search result above; confirm they
# still match random_cv.best_params_ after a re-run.
model =  GradientBoostingRegressor( max_depth =3,
                                   loss='lad',
                                   learning_rate=0.5,
                                  min_samples_split = 6,
                                  max_features = 'log2',
                                  random_state = 42)


In [None]:
# Grid search over the number of trees for the gradient boosting model above,
# scored by negative mean absolute error with 5-fold cross-validation
grid_search = GridSearchCV(estimator = model, param_grid=trees_grid, cv = 5, 
                           scoring = 'neg_mean_absolute_error', verbose = 1,
                           n_jobs = -1, return_train_score = True)

In [None]:
grid_search.fit(X_train,y_train)

In [None]:
# Get the grid-search results into a dataframe
results = pd.DataFrame(grid_search.cv_results_)

# Plot the training and cross-validation error against the number of trees.
# Scores are negated because the search was scored with the *negative* MAE.
plt.figure(figsize=(8, 8))
plt.style.use('fivethirtyeight')
plt.plot(results['param_n_estimators'], -1 * results['mean_test_score'], label = 'Test_Err')
plt.plot(results['param_n_estimators'], -1 * results['mean_train_score'], label = 'Train_Err')
plt.xlabel('Number of Trees'); plt.ylabel('Mean Absolute Error')
# `loc` must be passed by keyword: a single positional string is taken as the
# list of labels, which replaced the curve labels with the letters of "best".
plt.legend(loc='best')
plt.title('Performance vs Number of Trees');

In [None]:
results.sort_values('mean_test_score', ascending = False).head(5)

In [None]:
#default model
defaultmodelGBR = GradientBoostingRegressor()

In [None]:
# Select the best model
final_modelGBR = grid_search.best_estimator_
final_modelGBR

In [None]:
from pprint import pprint
# Select the best parameters for best estimator
pprint(grid_search.best_estimator_.get_params())

In [None]:
from time import time

In [None]:
%%timeit -n 1 -r 5
defaultmodelGBR.fit(X_train, y_train)

In [None]:
%%timeit -n 1 -r 5
final_modelGBR.fit(X_train, y_train)

In [None]:
default_pred = defaultmodelGBR.predict(X_test)
final_pred = final_modelGBR.predict(X_test)
print('Default model performance on the test set: MAE = %0.2f.' % mae(y_test, default_pred))
print('Final model performance on the test set:   MAE = %0.2f.' % mae(y_test, final_pred))

To get a sense of the predictions, we can plot the distribution of true values on the test set and the predicted values on the test set

In [None]:
plt.figure(figsize=(8, 6))

# Overlay the density of the final predictions on that of the true test values
sns.kdeplot(final_pred, label = 'Predictions')
sns.kdeplot(y_test, label = 'Values')

# Label the plot
plt.xlabel('Time in days'); plt.ylabel('Density');
# Draw the legend explicitly so the two curves can be told apart without
# relying on seaborn adding one from `label`.
plt.legend()
plt.title('Test Values and Predictions');

The distribution is highly skewed, with the density of the predicted values closer to the median of the test values than to the actual peak. It appears the model might be less accurate at predicting the extreme values and instead predicts values closer to the median.

Another diagnostic plot is a histogram of the residuals. Ideally, we would hope that the residuals are normally distributed, meaning that the model is wrong the same amount in both directions (high and low)

In [None]:
# Residual = prediction minus true value, so negative values are under-estimates.
residuals = final_pred - y_test

# Histogram of the residuals
plt.figure(figsize=(6, 6))
plt.hist(residuals, bins=20, color='green', edgecolor='black')
plt.xlabel('Error')
plt.ylabel('Count')
plt.title('Distribution of Residuals');

The residuals are far from a normal distribution, with noticeable outliers at both the low and the high end. These indicate errors where the model's estimate was far below or far above the true value.

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the tuned model, computed on the held-out test
# split (X_test, y_test) with a fixed seed.
perm = PermutationImportance(final_modelGBR, random_state=1).fit(X_test, y_test)
# Show the importances labelled with the feature (column) names.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

**CONCLUSIONS:**

**There are a few key findings that we draw from the San Francisco Building Permits data:**

**1. Permits of type 8, which involve alterations to buildings, tend to matter the most and on average take the least time to be issued (literally within hours), followed by permits of type 3.**

**It is important to note that permits of type 2 take the most time to process on average, up to 3 years.**

**It can be noted that most of the city's existing construction falls under construction type 5.0, followed by construction type 1.0.**

# **Finally**
 
# Using the given San Francisco Building Permits data, a machine learning model can predict the time taken for a permit to be processed to within about 10 days on average.
#     The five most important variables for determining the time taken for a permit to be issued are: Permit Type_8, Permit Type Definition_additions alterations or repairs, Permit Type Definition_otc alterations permit, Permit Type_3 and Plansets.