# SF Crime Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import eli5
from eli5.sklearn import PermutationImportance
from lightgbm import LGBMClassifier
import datetime

## Import Data

In [None]:
train = pd.read_csv("data/train.csv", parse_dates=['Dates'])
test = pd.read_csv("data/test.csv", parse_dates=['Dates'], index_col='Id')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
test.info()

## Clean Initial Data

Since `train` contains some columns that do not occur in `test`, we drop them immediately. The `DayOfWeek` column is dropped from both sets as well, because it can be derived from the `Dates` column.

In [None]:
train.drop(["Descript", "DayOfWeek", "Resolution"], axis=1, inplace=True)
test.drop(["DayOfWeek"], axis=1, inplace=True)

# Data Insights

In [None]:
train["Category"].value_counts()

In [None]:
# Number of incidents per crime category, most frequent first.
data = train.groupby('Category').count().iloc[:, 0].sort_values(
    ascending=False)

# Move the 'OTHER OFFENSES' bucket to the end of the chart. It is located by
# label instead of by an assumed rank, so the reordering stays correct if the
# category counts change.
named_categories = [cat for cat in data.index if cat != 'OTHER OFFENSES']
if len(named_categories) < len(data):
    data = data.reindex(named_categories + ['OTHER OFFENSES'])

plt.figure(figsize=(10, 10))
with sns.axes_style("whitegrid"):
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    ax = sns.barplot(
        x=(data.values / data.values.sum()) * 100,
        y=data.index,
        orient='h')

plt.title('Incidents per Crime Category')
plt.xlabel('Incidents (%)')

plt.show()

In [None]:
train["PdDistrict"].value_counts()

In [None]:
# Number of incidents per police district, most frequent first.
data = train.groupby('PdDistrict').count().iloc[:, 0].sort_values(
    ascending=False)

plt.figure(figsize=(10, 10))
with sns.axes_style("whitegrid"):
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    ax = sns.barplot(
        x=(data.values / data.values.sum()) * 100,
        y=data.index,
        orient='h')

plt.title('Incidents per District')
plt.xlabel('Incidents (%)')

plt.show()

# Feature Engineering

To improve our accuracy, we will engineer some new features based on our existing data.

## Date Features

In [None]:
def append_date_feats(data_df):
    """Add calendar and time-of-day feature columns derived from ``Dates``.

    The frame is modified in place and nothing is returned. ``Dates`` must be
    a datetime column; it is kept in the frame after the features are added.

    Columns written: ``HourOfDay``, ``MinuteOfHour``, ``DayOfWeek``,
    ``DayOfMonth``, ``Year``, ``MonthOfYear`` and ``QuarterOfYear``.
    """
    # New column name -> attribute of the ``.dt`` accessor it is read from.
    # The dict order is the order in which the columns are appended.
    dt_attribute_by_column = {
        "HourOfDay": "hour",
        "MinuteOfHour": "minute",
        "DayOfWeek": "dayofweek",
        "DayOfMonth": "day",
        "Year": "year",
        "MonthOfYear": "month",
        "QuarterOfYear": "quarter",
    }
    for column, dt_attribute in dt_attribute_by_column.items():
        data_df[column] = getattr(data_df["Dates"].dt, dt_attribute)

In [None]:
append_date_feats(train)
append_date_feats(test)

In [None]:
train.head()

In [None]:
sns.displot(data=train[train['Category'] == "PORNOGRAPHY/OBSCENE MAT"], x="HourOfDay", hue="QuarterOfYear", kind="kde")

In [None]:
df_tmp = train.loc[train['Category'].isin(
    ['ROBBERY', 'GAMBLING', 'BURGLARY', 'LARCENY/THEFT', 'PROSTITUTION'])]
sns.displot(data=df_tmp, x="HourOfDay", hue="Category", kind="kde")

## Impute Coordinates

A lot of examples in `train` are not located in San Francisco. We impute their coordinates with the mean coordinates of the training examples from the same district and hour of day.

In [None]:
# Number of training rows with Y above 50. The rows are counted directly:
# `.count()[0]` relied on positional lookup in a label-indexed Series, which
# recent pandas versions deprecate.
print(len(train.loc[train.Y > 50]))
train.loc[train.Y > 50].sample(5)

In [None]:
train.drop_duplicates(inplace=True)

# Rows carrying the placeholder coordinates X=-120.5 / Y=90.0 are turned into
# missing values so they can be imputed below. `np.nan` is used because the
# `np.NaN` alias no longer exists in NumPy 2.0.
train.replace({'X': -120.5, 'Y': 90.0}, np.nan, inplace=True)
test.replace({'X': -120.5, 'Y': 90.0}, np.nan, inplace=True)

coord_cols = ['X', 'Y']
group_cols = ['PdDistrict', 'HourOfDay']

# All statistics are taken from `train` only, and before anything is filled
# in, so both frames are imputed with exactly the same values.
group_means = train.groupby(group_cols)[coord_cols].mean()
district_means = train.groupby('PdDistrict')[coord_cols].mean()
overall_means = train[coord_cols].mean()

for frame in (train, test):
    # Per-row replacement values: the mean of the row's (district, hour)
    # group, falling back to the district mean and finally the overall mean
    # when a group has no valid training coordinates. `join(..., on=...)`
    # keeps the frame's own index, so the result aligns row by row.
    fill_values = frame[group_cols].join(group_means, on=group_cols)[coord_cols]
    fill_values = fill_values.fillna(
        frame[['PdDistrict']].join(district_means, on='PdDistrict')[coord_cols])
    fill_values = fill_values.fillna(overall_means)
    # Only the missing coordinates are replaced; valid ones stay untouched.
    frame[coord_cols] = frame[coord_cols].fillna(fill_values)

In [None]:
# Sanity check after imputation: number of training rows still having Y above
# 50. Counted with `len` because `.count()[0]` relied on positional lookup in
# a label-indexed Series, which recent pandas versions deprecate.
print(len(train.loc[train.Y > 50]))

## Address Features

In [None]:
train.head()

In [None]:
train[train['Address'].str.contains('block', case=False)]

In [None]:
train[train['Address'].str.contains('/', case=False)]

In [None]:
train[train['Address'].str.contains('/', case=False)]['Address'].value_counts()[:20]

In [None]:
# Flatten every address into its parts: each address is first split on " / "
# and every resulting piece is split again on " of ".
all_streets = [
    part
    for address in train['Address'].values
    for piece in address.split(" / ")
    for part in piece.split(" of ")
]

In [None]:
dfStreets = pd.Series(data=all_streets)

In [None]:
dfStreets.value_counts()[:20]

In [None]:
dfStreets[dfStreets.str.contains("Block", case=False)].value_counts()[:10]

In [None]:
dfStreets[dfStreets.str.contains(" ST", case=False)].value_counts()[:10]

In [None]:
dfStreets[dfStreets.str.contains(" AV", case=False)].value_counts()[:10]

In [None]:
dfStreets[dfStreets.str.contains(" WY", case=False)].value_counts()[:10]

In [None]:
dfStreets[dfStreets.str.contains(" TR", case=False)].value_counts()[:10]

In [None]:
dfStreets[dfStreets.str.contains(" DR", case=False)].value_counts()[:10]

In [None]:
def address_feats(data_df):
    """Add boolean address-type indicator columns derived from ``Address``.

    The frame is modified in place and nothing is returned. Each new column
    tells whether the address text contains a given marker:

    * ``near_BLOCK``    - "block", matched case-insensitively
    * ``near_CROSSING`` - "/"
    * ``near_ST`` / ``near_AV`` / ``near_WY`` / ``near_DR`` / ``near_TR`` -
      the marker " ST", " AV", " WY", " DR" or " TR", matched case-sensitively

    NOTE(review): these are plain substring matches, so " ST" also matches
    any word that merely starts with "ST"; confirm this is intended.
    """
    address = data_df['Address']

    data_df["near_BLOCK"] = address.str.contains('block', case=False)

    # Remaining indicators are case-sensitive; the tuple order is the order
    # in which the columns are appended.
    case_sensitive_markers = (
        ("near_CROSSING", '/'),
        ("near_ST", ' ST'),
        ("near_AV", ' AV'),
        ("near_WY", ' WY'),
        ("near_DR", ' DR'),
        ("near_TR", ' TR'),
    )
    for column, marker in case_sensitive_markers:
        data_df[column] = address.str.contains(marker)

In [None]:
address_feats(train)
address_feats(test)

In [None]:
train['Category'].value_counts().index[:10]

In [None]:
top_10_crimes = train['Category'].value_counts().index[:10]
df_tmp = train.loc[(train['Category'].isin(top_10_crimes)) & train["Address"].str.contains("400 Block")]
sns.displot(data=df_tmp, x="HourOfDay", hue="Category", kind="kde")
df_tmp = train.loc[(train['Category'].isin(top_10_crimes)) & train["Address"].str.contains("TURNER TR")]
sns.displot(data=df_tmp, x="HourOfDay", hue="Category", kind="kde")

In [None]:
train[train["near_BLOCK"]]["Category"].value_counts()[:10]

In [None]:
train[train["near_AV"]]["Category"].value_counts()[:10]

## Weather Analysis

In [None]:
from meteostat import Daily, Hourly, Stations

In [None]:
start_date_train, end_date_train = train["Dates"].min(), train["Dates"].max()
start_date_test, end_date_test = test["Dates"].min(), test["Dates"].max()
start_date_train, end_date_train

In [None]:
mean_x, mean_y = train["X"].mean(), train["Y"].mean()
mean_x, mean_y

In [None]:
stats = Stations().region("US", "CA").fetch(1000)
stats.head()

In [None]:
# Absolute offset of every station from the mean incident location, in degrees.
stats["X_Diff"] = np.abs(stats["longitude"] - mean_x)
stats["Y_Diff"] = np.abs(stats["latitude"] - mean_y)
# Rank stations by straight-line distance in degree space. The product of the
# two offsets is not a distance: it is (close to) zero for any station that
# merely shares the longitude or the latitude, however far away it is.
# NOTE(review): this changes the ranking used further down to pick
# `sf_station`; re-check that pick after applying this fix.
stats["XY_Diff"] = np.hypot(stats["X_Diff"], stats["Y_Diff"])

In [None]:
stats.sort_values(by="XY_Diff")[:10]

In [None]:
# Take the station ranked third by XY_Diff (position 2), not the top-ranked one.
# NOTE(review): presumably the first two candidates lacked usable hourly
# data; confirm and document the actual reason.
sf_station = stats.sort_values(by="XY_Diff").index[2]

In [None]:
sf_station

In [None]:
Hourly.clear_cache()

In [None]:
# Hourly weather observations of the chosen station for the training period.
# NOTE(review): no timezone is passed here (nor in the matching test fetch),
# while `Dates` looks like local time; presumably meteostat then reports UTC
# timestamps. Confirm that the hours line up before merging.
temp_data_train = Hourly(sf_station, start_date_train, end_date_train)
# normalize() / interpolate() are meteostat post-processing steps applied
# before the data is fetched (presumably gap filling; see the meteostat docs).
temp_data_train = temp_data_train.normalize()
temp_data_train = temp_data_train.interpolate()
# fetch() returns the result that is used as a DataFrame in later cells.
temp_data_train = temp_data_train.fetch()

In [None]:
temp_data_test = Hourly(sf_station, start_date_test, end_date_test)
temp_data_test = temp_data_test.normalize()
temp_data_test = temp_data_test.interpolate()
temp_data_test = temp_data_test.fetch()

In [None]:
len(temp_data_train)

In [None]:
#temp_data_train = temp_data_train.dropna(how="all")
#temp_data_test = temp_data_test.dropna(how="all")

In [None]:
# Show the weather record closest in time to one sample incident.
# The sample is taken by position: `train` has gaps in its index labels after
# `drop_duplicates`, so a label lookup such as `[1000]` may not exist.
sample_time = train["Dates"].iloc[1000]
# `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
# `get_indexer` is its replacement and returns one position per target.
nearest_pos = temp_data_train.index.get_indexer([sample_time], method='nearest')[0]
temp_data_train.iloc[nearest_pos]

In [None]:
temp_fields = ["temp", "dwpt", "prcp", "rhum", "pres", "wspd", "wdir"]

In [None]:
# Truncate every timestamp to the start of its hour so it can be matched
# against the hourly weather index. The vectorized `dt.floor` yields the same
# values as building a new datetime row by row, without a Python-level call
# per row.
train['Dates_norm'] = train['Dates'].dt.floor('h')
test['Dates_norm'] = test['Dates'].dt.floor('h')

In [None]:
len(test)

In [None]:
len(pd.merge(test, temp_data_test[temp_fields], left_on="Dates_norm", how="left", right_index=True, ))

In [None]:
train = pd.merge(train, temp_data_train[temp_fields], left_on="Dates_norm", how="left", right_index=True)
test = pd.merge(test, temp_data_test[temp_fields], left_on="Dates_norm", how="left", right_index=True)

In [None]:
train.head()

In [None]:
df_tmp = train.loc[train['Category'].isin(top_10_crimes)]
sns.displot(data=df_tmp, x="wdir", hue="Category", kind="kde")

In [None]:
df_tmp = train.loc[(train['Category'].isin(["PROSTITUTION", "KIDNAPPING", "SUICIDE", "DRUNKENNESS"])) & (train["prcp"] > 0.1)]
sns.jointplot(data=df_tmp, x="temp", y="prcp", kind="kde", hue="Category")

## Sunlight Analysis

In [None]:
from astral.geocoder import database, lookup
from astral.sun import sun

In [None]:
# Reduce every timestamp to its calendar date (overwriting the hour-level
# `Dates_norm` used for the weather merge). The vectorized `dt.date` accessor
# returns the same `datetime.date` objects as calling `.date()` row by row.
train['Dates_norm'] = train['Dates'].dt.date
test['Dates_norm'] = test['Dates'].dt.date

In [None]:
min_date = min(train["Dates_norm"].min(), test["Dates_norm"].min())
max_date = max(train["Dates_norm"].max(), test["Dates_norm"].max())

In [None]:
sf = lookup("San Francisco", database())
sun_date_dict = {d.date(): sun(sf.observer, date=d, tzinfo=sf.timezone) for d in pd.date_range(min_date, max_date).to_pydatetime().tolist()}

In [None]:
def map_date_to_sun(a_date):
    """Classify a timestamp by the phase of the day it falls into.

    The sun events of the timestamp's calendar day are looked up in the
    module-level ``sun_date_dict`` (keyed by date). ``a_date`` is compared
    against each event with the event's timezone info stripped, so it is
    expected to be a naive datetime.

    Returns one of ``"PRE_DAWN"``, ``"DAWN_SUNRISE"``, ``"SUNRISE_NOON"``,
    ``"NOON_SUNSET"``, ``"SUNSET_DUSK"`` or ``"POST_DUSK"``. A timestamp that
    is exactly equal to an event belongs to the phase starting at that event.

    Raises ``KeyError`` if the day is not present in ``sun_date_dict``.
    """
    events_of_day = sun_date_dict[a_date.date()]

    # (event closing the phase, phase label), in chronological order.
    phases = (
        ("dawn", "PRE_DAWN"),
        ("sunrise", "DAWN_SUNRISE"),
        ("noon", "SUNRISE_NOON"),
        ("sunset", "NOON_SUNSET"),
        ("dusk", "SUNSET_DUSK"),
    )
    for closing_event, label in phases:
        if a_date < events_of_day[closing_event].replace(tzinfo=None):
            return label
    return "POST_DUSK"

In [None]:
print(train["Dates"][250])
print(map_date_to_sun(train["Dates"][250]))

In [None]:
train["sun_info"] = train["Dates"].apply(map_date_to_sun)
test["sun_info"] = test["Dates"].apply(map_date_to_sun)

In [None]:
train.head()

In [None]:
df_tmp = train.loc[train['Category'].isin(["PROSTITUTION"])]
sns.displot(data=df_tmp, x="MonthOfYear", hue="sun_info", kind="kde")

# Model Prediction

After engineering some new features, it is time to train our model and make some predictions.

In [None]:
test.drop(["Address", "Dates_norm", "Dates"], axis=1, inplace=True)
train.drop(["Address", "Dates_norm", "Dates"], axis=1, inplace=True)

In [None]:
test.fillna(0, inplace=True)
train.fillna(0, inplace=True)

In [None]:
train = pd.get_dummies(train, columns=["PdDistrict", "sun_info"])
test = pd.get_dummies(test, columns=["PdDistrict", "sun_info"])

In [None]:
train.info()

In [None]:
test.info()

In [None]:
le_category = LabelEncoder()
y = le_category.fit_transform(train['Category'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X = train.drop(["Category"], axis=1)

In [None]:
X = scaler.fit_transform(X)

In [None]:
X.shape

In [None]:
X[0]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1337)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
#model = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

#model = LGBMClassifier(objective='multiclass', num_class=39).fit(X_train, y_train)

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
mlp = MLPClassifier(random_state=1, hidden_layer_sizes=15, verbose=True)

In [None]:
mlp.fit(X_train, y_train)

In [None]:
model = mlp

In [None]:
hgbc = HistGradientBoostingClassifier() # 26%

In [None]:
hgbc.fit(X_train, y_train)

In [None]:
model = hgbc

In [None]:
#clf1 = HistGradientBoostingClassifier() # 26%
#clf2 = RandomForestClassifier() # 25%
#clf3 = LGBMClassifier(objective='multiclass', num_class=39) # 27%

In [None]:
abc = AdaBoostClassifier(random_state=0)

In [None]:
abc.fit(X_train, y_train)

In [None]:
model = abc

In [None]:
model = VotingClassifier(estimators=[('hgbc', hgbc), ('mlp', mlp), ("abc", abc)], voting='soft', )

In [None]:
model.fit(X_train, y_train)

In [None]:
# `train` and `test` were one-hot encoded separately, so their dummy columns
# are not guaranteed to match. Align `test` with the feature columns the
# scaler and the model were fitted on (same set, same order); a dummy column
# missing from `test` is filled with 0.
feature_cols = train.drop("Category", axis=1).columns
predict_y = model.predict_proba(
    scaler.transform(test.reindex(columns=feature_cols, fill_value=0)))

In [None]:
predict_y.shape

## Export Submissions File

In [None]:
df_sub = pd.DataFrame(predict_y, columns=le_category.classes_)
df_sub['Id'] = test.index
df_sub = df_sub.round(4)

In [None]:
df_sub.to_csv('submission_voting.csv', index=False)

## Eval Results

In [None]:
from sklearn.metrics import classification_report

# Evaluate on the hold-out split.
predict_y_t = model.predict(X_test)
# Take the class ids and names from the fitted encoder instead of hard-coding
# 39 classes; `classes_[i]` is the label that was encoded as `i`.
class_ids = np.arange(len(le_category.classes_))
print(classification_report(y_test, predict_y_t, target_names=le_category.classes_, labels=class_ids))

In [None]:
perm = PermutationImportance(model).fit(X_test[:10000], y_test[:10000])
eli5.show_weights(perm, feature_names=train.drop("Category", axis=1).columns.tolist())

In [None]:
eli5.show_weights(perm, feature_names=train.drop("Category", axis=1).columns.tolist(), top=100)

In [None]:
data_for_prediction = test.iloc[8262]
data_for_prediction

In [None]:
import shap

In [None]:
# Explain the prediction for the single row stored in `data_for_prediction`.
shap.initjs()

# Create object that can calculate shap values
# NOTE(review): `model` is the VotingClassifier built earlier in this
# notebook; TreeExplainer is presumably limited to tree-based models, so
# confirm it accepts this estimator (or explain a tree-based member instead).
explainer = shap.TreeExplainer(model)

# Calculate Shap values
# NOTE(review): `data_for_prediction` is taken from `test` without applying
# `scaler`, whereas the model was fitted on scaled features. Verify.
shap_values = explainer.shap_values(data_for_prediction)

# Force plot for the entry at index 4 of the expected values / shap values
# (presumably the class encoded as 4; confirm against the shap version used).
shap.force_plot(explainer.expected_value[4], shap_values[4], data_for_prediction, link='logit')