In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime,timedelta
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

%matplotlib inline

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Use the 'bmh' matplotlib style sheet for all figures.
plt.style.use('bmh')

In [None]:
# Read the CSV into a DataFrame; `df` is a second name for the same object (not a copy).
dataDSNY = pd.read_csv('/kaggle/input/dsny-20152017/311-DSNY-20151017.csv')
df = dataDSNY


# Data size (rows, columns) and the column names
print(df.shape)
print(df.columns)

In [None]:
# Per-column dtype and non-null count; used to spot sparsely populated columns.
df.info()

From the information above we can already see that some features won't be relevant to our exploratory analysis because they have too many missing values (such as Landmark, Vehicle Type, Road Ramp, Taxi Pick Up Location, Taxi Company Borough, ...). It is better to concentrate on the features that can give us real insights. Let's remove Unique Key and every feature with fewer than 30% non-missing values (i.e. more than 70% NaN).

In [None]:
# Summary statistics of the frame (pandas `describe` defaults).
df.describe()

In [None]:
# Keep only the columns that are at least 30% populated. `count()` does not
# include NaN values, so `count() / len(df)` is each column's non-missing fraction.
MIN_FILLED_FRACTION = 0.3
filled_fraction = df.count() / len(df)
kept_columns = filled_fraction[filled_fraction >= MIN_FILLED_FRACTION].index

# `errors='ignore'` keeps the cell re-runnable: on a second run `df` no longer
# has 'Unique Key'. `.copy()` makes the result an independent frame, so the
# columns added in later cells are not written to a slice of the raw data.
df2 = df[kept_columns].drop(columns='Unique Key', errors='ignore').copy()

dropped_columns = [c for c in df.columns if c not in df2.columns]
print("List of dropped columns:", ", ".join(dropped_columns), end="\n\n")
df = df2

Checking the possibility of missing Created Dates:

In [None]:
# Number of rows whose 'Created Date' is missing.
print(df['Created Date'].isnull().sum())

Calculate the time difference, in minutes, between the closed date and the created date of each incident, so that this duration can be predicted from the other features.

In [None]:
# The timestamps carry an AM/PM marker (%p), so the hour has to be parsed with
# %I (12-hour clock). With %H, strptime ignores %p and every PM time is read
# 12 hours early, which corrupts the open-to-close durations.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# errors='coerce' turns missing or unparseable values into NaT instead of raising.
createdDate = pd.to_datetime(df['Created Date'], format=DATE_FORMAT, errors='coerce')
closedDate = pd.to_datetime(df['Closed Date'], format=DATE_FORMAT, errors='coerce')
deltaTime = closedDate - createdDate

# Absolute duration in (fractional) minutes; NaT differences become NaN.
# total_seconds() is used because astype('timedelta64[m]') is not accepted by
# recent pandas versions.
df['minDifference'] = deltaTime.abs().dt.total_seconds() / 60

In [None]:
# Absolute created-to-closed duration in minutes (fractional), derived from
# the timedelta's total number of seconds.
df['minDifference'] = deltaTime.abs().dt.total_seconds()/60

Count the NaNs in the dependent variable

In [None]:
# Number of rows for which no duration could be computed.
print(df['minDifference'].isnull().sum())


Check whether any closing dates are missing, which would mean those cases are still open:

In [None]:
# Missing closing dates, and the range of the computed durations (minutes).
print(df['Closed Date'].isnull().sum())
print(df['minDifference'].max())
print(df['minDifference'].min())

# Series.replace returns a new Series, so the result must be assigned back for
# the substitution to take effect (previously it was silently discarded).
# NOTE(review): 11 is the replacement value chosen by the original cell; its
# rationale is not documented in this notebook.
df['minDifference'] = df['minDifference'].replace({0: 11})
print(df['minDifference'].min())

Replace the missing time difference values with a large number to predict the cases that are still open

In [None]:
# Large sentinel duration (minutes) standing in for cases with no computed duration.
OPEN_CASE_MINUTES = 10000000

# Assign the filled Series back instead of calling fillna(inplace=True) on a
# column selection: the in-place form is chained assignment and is not
# guaranteed to modify `df`.
df['minDifference'] = df['minDifference'].fillna(OPEN_CASE_MINUTES)

df['minDifference'].isnull().sum()

Transformation to scale the output values

In [None]:
# Log-transform the durations. Zeros are mapped to NaN first because log(0)
# is -inf; those rows therefore have a NaN 'logTime'.
df['logTime'] = np.log(df['minDifference'].replace(0, np.nan))

# Histogram of the log durations. NaNs are dropped explicitly so the plot does
# not depend on how the installed Matplotlib treats non-finite values.
plt.hist(df['logTime'].dropna())
plt.title('Frequency Distribution of closing time')
plt.ylabel('Number', fontsize=12)
plt.xlabel('log time difference', fontsize=12)
plt.show()

Month and year of the created date should be two important features for the modeling. We extracted them here. 

In [None]:
# Convert the created dates once, then derive both calendar features from the result.
created_ts = pd.to_datetime(createdDate, errors = 'coerce')
df['year'] = created_ts.dt.year.astype(int)
df['month'] = created_ts.dt.month.astype(int)

In [None]:
import warnings
# Silence FutureWarnings from here on (this also affects the following cells).
warnings.simplefilter(action="ignore", category=FutureWarning)

# Number of requests per city, most frequent first.
cityCount = df['City'].value_counts()
sns.set(style="darkgrid")
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=cityCount.index, y=cityCount.values, alpha=0.9)
plt.title('Frequency Distribution of City')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('City', fontsize=12)
plt.show()

 Label Encoding for Categorical Variables that should be important features as input to the predictive model. 

In [None]:
# Categorical columns to label-encode. For each one, the column is converted to
# the pandas 'category' dtype and a companion '<name>_code' column receives the
# integer category codes (pandas uses -1 as the code for missing values).
CATEGORICAL_COLUMNS = [
    'City',
    'Complaint Type',
    'Location',
    'Community Board',
    'Agency Name',
    'Open Data Channel Type',
    'Borough',
    'Park Borough',
]

for column in CATEGORICAL_COLUMNS:
    df[column] = df[column].astype('category')
    df[column + '_code'] = df[column].cat.codes


Defining the input and output variables for the modeling

In [None]:
# Columns of the final dataframe, including the derived and '_code' columns added above
df.columns

In [None]:
# Model inputs: the calendar features plus the label-encoded categorical columns.
Features = ['year', 'month', 'City_code',
       'Complaint Type_code', 'Location_code', 'Community Board_code',
       'Borough_code', 'Park Borough_code', 'Agency Name_code',
       'Open Data Channel Type_code']
X = df[Features]
# Target: log duration. The double brackets keep it as a one-column DataFrame (2-D).
y = df[['logTime']]

Impute missing data for modeling

In [None]:
# Replace NaNs in the target with the column mean. The `verbose` argument is
# omitted because newer scikit-learn releases no longer accept it.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a 2-D NumPy array with one column.
y = imputer.fit_transform(y)

Dividing the data into training and test sets to compare the performance of several modeling techniques

In [None]:
 # Hold out 30% of the rows as a test set; random_state=1 makes the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:

# Baseline model: ordinary least-squares linear regression

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print('Coefficients: \n', regr.coef_)

# Mean squared error on the held-out rows
test_mse = mean_squared_error(y_test, y_pred)
print('Mean squared error: %.2f' % test_mse)

# Coefficient of determination (1 would be a perfect prediction)
test_r2 = r2_score(y_test, y_pred)
print('Coefficient of determination: %.2f' % test_r2)

# Actual vs. predicted values, drawn without axis ticks
plt.scatter(y_test, y_pred,  color='black')
plt.xticks(())
plt.yticks(())
plt.show()


In [None]:
# Identify outliers in the training dataset (first method).
# random_state is fixed so the flagged rows are the same on every run.
# Note: `yhat` is reassigned by the LocalOutlierFactor cell that follows, so
# only that second result reaches the mask.
iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(X_train)

In [None]:
# Identify outliers in the training dataset (second method).
# This reassigns `yhat`, replacing the IsolationForest labels from the previous cell.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)


In [None]:
# select all rows that are not outliers
mask = yhat != -1
print(len(mask))
print(len(X_train))

In [None]:
# Keep only the training rows flagged as inliers.
# NOTE(review): this overwrites X_train/y_train, so re-running the cell without
# recomputing `mask` would presumably fail on a length mismatch — confirm.
y_train = y_train[mask]
X_train = X_train[mask]

In [None]:
# Shape of the training data after the outlier filter
print(X_train.shape, y_train.shape)
# Refit a linear model on the filtered rows
model = LinearRegression()
model.fit(X_train, y_train)
# Predict on the test split and score with the mean absolute error
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

In [None]:
# Plot actual vs. predicted target values for the refitted model, without axis ticks
plt.scatter(y_test, yhat,  color='blue')
plt.xticks(())
plt.yticks(())
plt.show()

Outlier removal was not helpful for linear modeling

Linear regression doesn't perform well on this data. So, let's try Random Forest as a non-linear regression technique.

In [None]:
# Random Forest Regression
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
# y_train is a single-column 2-D array; flatten it to the 1-D target shape the
# regressor expects.
regressor.fit(X_train, np.ravel(y_train))

In [None]:
# Use the forest's predict method on the test data
predictions = regressor.predict(X_test)
# Calculate the absolute errors. y_test is a single-column 2-D array while
# predictions is 1-D; without flattening, the subtraction would broadcast to an
# (n, n) matrix and the mean would be taken over every pair of rows.
errors = np.abs(predictions - np.ravel(y_test))
# Print out the mean absolute error (mae); the target is the log of the
# duration in minutes, so the error is in log-minutes.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'log(min).')


In [None]:

# Importance of each feature in the fitted forest, indexed by feature name.
importance = pd.DataFrame(regressor.feature_importances_,
                          index=pd.Index(Features), columns=["Importance"])
# Spread of each feature's importance across the individual trees of the forest.
importance["Std"] = np.std([tree.feature_importances_
                            for tree in regressor.estimators_], axis=0)

# Dedicated names are used here so the modelling variables (notably the
# target `y`) are not overwritten by plotting data.
feature_names = importance.index
mean_importance = importance["Importance"]
importance_std = importance["Std"]

plt.figure(figsize=(10,7))
# Horizontal bars, with the per-tree standard deviation shown as error bars.
plt.barh(feature_names, mean_importance, xerr=importance_std, align="center")
plt.gca().invert_yaxis()
plt.title('Feature importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()
