In [None]:
import os
import pandas as pd

In [None]:
# Row limit forwarded to pd.read_csv(nrows=...) in the load cell; None reads the whole file.
nRowsRead = None

## Import Data

In [None]:
# Column dtypes are declared up front, grouped by dtype, so pandas does not infer them while parsing.
string_columns = [
    'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'ADDR_PCT_CD', 'RPT_DT',
    'OFNS_DESC', 'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'BORO_NM', 'LOC_OF_OCCUR_DESC',
    'PREM_TYP_DESC', 'JURIS_DESC', 'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'SUSP_AGE_GROUP',
    'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT', 'Lat_Lon', 'PATROL_BORO', 'STATION_NAME',
    'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX',
]
integer_columns = ['CMPLNT_NUM', 'KY_CD']
float_columns = ['PD_CD', 'JURISDICTION_CODE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude']

column_dtypes = dict.fromkeys(string_columns, 'string')
column_dtypes.update(dict.fromkeys(integer_columns, 'int64'))
column_dtypes.update(dict.fromkeys(float_columns, 'float64'))

df = pd.read_csv(
    '../input/crimes-new-york-city/NYPD_Complaint_Data_Historic.csv',
    delimiter=',',
    dtype=column_dtypes,
    nrows=nRowsRead,
)

# Report the size of what was loaded.
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')


In [None]:
# Bind a second name to the freshly loaded frame. This is an alias, not a copy: it only
# keeps the raw data as long as later cells rebind `df` rather than mutate this object in place.
df_backup = df
# Uncomment to go back to the loaded frame without re-reading the CSV.
#df = df_backup

In [None]:
df.info()

In [None]:
df.count()

## Clean Data

In [None]:
# Columns discarded before cleaning; every column not listed here is kept.
columns_remove = [
    'PREM_TYP_DESC', 'LOC_OF_OCCUR_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'ADDR_PCT_CD',
    'PD_CD', 'PD_DESC', 'CMPLNT_NUM', 'OFNS_DESC', 'JURIS_DESC', 'JURISDICTION_CODE',
    'PATROL_BORO', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'HADEVELOPT', 'HOUSING_PSA', 'PARKS_NM',
    'RPT_DT', 'STATION_NAME', 'TRANSIT_DISTRICT', 'X_COORD_CD', 'Y_COORD_CD', 'Lat_Lon',
    'SUSP_AGE_GROUP', 'VIC_AGE_GROUP', 'SUSP_SEX', 'SUSP_RACE',
]
df = df.drop(columns=columns_remove)

In [None]:
df.info()

In [None]:
# A row is removed when either the complaint start date or the start time is missing.
df.dropna(subset=['CMPLNT_FR_DT', 'CMPLNT_FR_TM'], how='any', inplace=True)

In [None]:
import numpy as np

In [None]:
# Turn placeholder codes into missing values so the subsequent dropna() removes those rows.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): the replacement is applied to every column of the frame, not to specific ones.
for placeholder in ('UNKNOWN', 'E', 'D', 'U'):
    df.replace(placeholder, np.nan, inplace=True)

In [None]:
# Drop every row that still has a missing value in any column, reporting the row count around it.
rows_before = df.shape[0]
print(f'Number of rows before removing rows with missing values: {rows_before}')
df.dropna(axis=0, how='any', inplace=True)
rows_after = df.shape[0]
print(f'Number of rows after removing rows with missing values: {rows_after}')

In [None]:
import datetime

In [None]:
# CMPLNT_FR_DT is '/'-separated text read as month / day / year: split once, then convert
# each piece to an integer column.
date_parts = df['CMPLNT_FR_DT'].str.split('/', expand=True)
df['CMPLNT_FR_YEAR'] = date_parts[2].astype('int64')
df['CMPLNT_FR_MONTH'] = date_parts[0].astype('int64')
df['CMPLNT_FR_DAY'] = date_parts[1].astype('int64')

# The hour is the text before the first ':' of CMPLNT_FR_TM.
df['CMPLNT_FR_HOUR'] = df['CMPLNT_FR_TM'].str.split(':').str[0].astype('int64')

In [None]:
def _weekday_from_date(date_text):
    """Return the weekday index (Monday=0 ... Sunday=6) of a 'month/day/year' string."""
    fields = str(date_text).split('/')
    year, month, day = int(fields[2]), int(fields[0]), int(fields[1])
    return datetime.date(year, month, day).weekday()


# CMPLNT_FR_DAY is (re)assigned here as the day of the week of the complaint start date.
df['CMPLNT_FR_DAY'] = df['CMPLNT_FR_DT'].map(_weekday_from_date)

In [None]:
pd.unique(df.CMPLNT_FR_YEAR)

In [None]:
# The raw date/time strings are no longer needed once their numeric parts exist.
# CMPLNT_FR_YEAR must stay: the year filter and the per-year plot further down read it,
# and dropping it here makes those cells fail with a KeyError on a fresh run.
columns_remove = ['CMPLNT_FR_TM', 'CMPLNT_FR_DT']
df = df.drop(columns_remove, axis=1)

In [None]:
df.info()

### Filter by Year > 2015

In [None]:
# Keep only complaints whose start year is strictly greater than 2015.
is_recent = df['CMPLNT_FR_YEAR'] > 2015
df = df[is_recent]

In [None]:
pd.unique(df.CMPLNT_FR_YEAR)

In [None]:
pd.unique(df['KY_CD'])

In [None]:
# Offense key codes (KY_CD) grouped into the coarse crime categories used as the target.
_KY_CATEGORY_CODES = {
    "HOMICIDE": (101, 102, 103),
    "SEXCRIME": (104, 115, 116, 233, 234, 356, 460),
    "THEFTFRAUD": (105, 107, 109, 110, 111, 112, 113, 231, 238, 340, 341, 342, 343, 358),
    "OTHERVIOLENT": (106, 114, 124, 344),
    "DRUGS": (117, 118, 119, 232, 235, 236, 346, 347, 577),
    "OTHER": (120, 121, 125, 126, 345, 348, 349, 351, 352, 353, 354, 355, 357, 359, 360,
              361, 362, 363, 364, 365, 366, 455, 571, 572, 578, 672, 675, 676, 677, 678,
              685, 881),
}

# Flat code -> category lookup derived once from the grouping above.
_KY_CODE_TO_CATEGORY = {
    code: category
    for category, codes in _KY_CATEGORY_CODES.items()
    for code in codes
}


def ky_cat(ky_cd):
    """Map a numeric offense key code to its coarse crime category.

    Parameters
    ----------
    ky_cd : int
        Offense key code (a value of the KY_CD column).

    Returns
    -------
    str
        One of "HOMICIDE", "SEXCRIME", "THEFTFRAUD", "OTHERVIOLENT", "DRUGS" or "OTHER".
        A code that is not listed in any group is reported as "OTHER".
    """
    # Unlisted codes get an explicit category instead of an implicit None: a None label
    # is skipped by pd.get_dummies, leaving that row with no target class at all.
    return _KY_CODE_TO_CATEGORY.get(ky_cd, "OTHER")
# Replace each numeric offense code in KY_CD with the category label returned by ky_cat.
df['KY_CD'] = df['KY_CD'].map(ky_cat)

In [None]:
# One-hot encode the categorical columns; each dummy column is prefixed with its source column name.
categorical_columns = ["KY_CD", "BORO_NM", "VIC_SEX", "VIC_RACE"]
dum_df = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns)

In [None]:
dum_df

## Data Stats

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
sns.set_style("whitegrid")

### Victim/Suspect Correlation

In [None]:
#cross_table = df.groupby(['SUSP_RACE', 'VIC_RACE'])['KY_CD'].count()

In [None]:
#cross_table = cross_table.divide(cross_table.sum(), axis=0).unstack(level=0)

In [None]:
#fig, ax = plt.subplots(1, 1, figsize=(10, 10))
#sns.heatmap(cross_table, cmap='RdBu_r', ax=ax,square=True,vmin=0, vmax=0.666,annot=True,fmt='.3f',)

In [None]:
# One random colour value per row; the values only serve to tell points apart.
n_points = len(df['Longitude'])
colors = np.random.rand(n_points)

# Scatter of complaint coordinates: longitude on x, latitude on y.
plt.figure(figsize=(20, 20))
plt.scatter(x=df['Longitude'], y=df['Latitude'], c=colors, alpha=0.5)
plt.show()

### Crimes per Year

In [None]:
sns.countplot(x='CMPLNT_FR_YEAR', data=df)

In [None]:
#pd.unique(df.LAW_CAT_CD)

### Felonies per Year

In [None]:
#sns.countplot(x='CMPLNT_FR_YEAR', data=df[df['LAW_CAT_CD'] == 0])

### Violations per Year

In [None]:
#sns.countplot(x='CMPLNT_FR_YEAR', data=df[df['LAW_CAT_CD'] == 2])

### Misdemeanors per Year

In [None]:
#sns.countplot(x='CMPLNT_FR_YEAR', data=df[df['LAW_CAT_CD'] == 1])

### Correlation

In [None]:
# Integer-encode every column with pd.factorize so a correlation matrix can be computed
# over all columns, then draw it as an annotated heatmap on a fixed [-1, 1] colour scale.
encoded_columns = df.apply(lambda column: pd.factorize(column)[0])
corr = encoded_columns.corr()

plt.figure(figsize=(40, 40))
sns_plot = sns.heatmap(
    corr,
    cmap="coolwarm",
    linewidth=2,
    linecolor="white",
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
)

In [None]:
df.head()

In [None]:
dum_df.head()

In [None]:
# One-hot columns of the crime category; together they form the target.
target_columns = [
    'KY_CD_DRUGS',
    'KY_CD_HOMICIDE',
    'KY_CD_OTHER',
    'KY_CD_OTHERVIOLENT',
    'KY_CD_SEXCRIME',
    'KY_CD_THEFTFRAUD',
]

# Features: every column of dum_df except the target columns.
X = dum_df.drop(columns=target_columns)

# Target: the one-hot category columns only.
y = dum_df[target_columns]

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Randomly choose 30% of the data for testing, stratified on the one-hot target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Show how many samples of each class ended up in each split. Summing a one-hot column
# gives that class's count; np.unique over the whole matrix would only count zeros and ones.
print('y train class counts:\n' + str(y_train.sum(axis=0)))
print('y test class counts:\n' + str(y_test.sum(axis=0)))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Classifier acronym -> unfitted estimator. All estimators share random_state=0.
# Every estimator except the MLP is built with class_weight='balanced'
# (MLPClassifier has no class_weight parameter).
clfs = {}
clfs['lr'] = LogisticRegression(random_state=0, max_iter=4000, class_weight='balanced')
clfs['mlp'] = MLPClassifier(random_state=0)
clfs['dt'] = DecisionTreeClassifier(random_state=0, class_weight='balanced')
clfs['rf'] = RandomForestClassifier(random_state=0, class_weight='balanced')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Classifier acronym -> pipeline that standardises the features and then applies the classifier.
# The step names ('StandardScaler', 'clf') are what the 'clf__...' grid keys refer to.
pipe_clfs = {
    name: Pipeline(steps=[('StandardScaler', StandardScaler()), ('clf', clf)])
    for name, clf in clfs.items()
}

In [None]:
# The key is the classifier acronym and the value is the parameter grid of the classifier
param_grids = {}

In [None]:
# Set C range: powers of ten from 1e-4 to 1e4.
C_range = [10 ** i for i in range(-4, 5)]

# Create parameter grid for Logistic Regression.
# Hyperparameters being tuned are solver and C.
# `multi_class` is deliberately not part of the grid: both solvers below already fit a
# multinomial model for multiclass targets by default, and scikit-learn has deprecated
# the parameter, so passing it only triggers warnings on recent releases.
param_grid = [{'clf__solver': ['newton-cg', 'lbfgs'],
               'clf__C': C_range}]
param_grids['lr'] = param_grid

# Create parameter grid for Multi-Layer Perceptron.
# Hyperparameters being tuned are hidden_layer_sizes and activation.
param_grid = [{'clf__hidden_layer_sizes': [10],
               'clf__activation': ['tanh', 'relu']}]
param_grids['mlp'] = param_grid

# Create parameter grid for Decision Tree.
# Hyperparameters being tuned are min_samples_split and min_samples_leaf.
param_grid = [{'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]
param_grids['dt'] = param_grid

# Create parameter grid for Random Forest.
# Hyperparameters being tuned are n_estimators, min_samples_split and min_samples_leaf.
param_grid = [{'clf__n_estimators': [2, 10, 30],
               'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]
param_grids['rf'] = param_grid

In [None]:
# Show the dtype of each target column (the bare `astype` attribute only displayed a method object).
y_train.dtypes


In [None]:
from sklearn.utils.multiclass import type_of_target
print(type_of_target(y_train.idxmax(axis=1)))
print(type_of_target(y_train))

In [None]:
y_train.idxmax(axis=1)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Collected results, one [best_score_, best_params_, best_estimator_] entry per classifier.
best_score_param_estimators = []

# The searches are fitted on one label per row: the name of the hot column of the one-hot target.
train_labels = y_train.idxmax(axis=1)

# Run an accuracy-scored grid search for every pipeline, using its own parameter grid.
for name, pipeline in pipe_clfs.items():
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        scoring='accuracy',
        n_jobs=1,
        verbose=3,
    )
    gs = gs.fit(X_train, train_labels)

    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

In [None]:
# Order the results by best_score_, highest first (descending).
best_score_param_estimators.sort(key=lambda entry: entry[0], reverse=True)

# Print [best_score_, best_params_, classifier type] for each entry;
# best_estimator_ is a pipeline, so the classifier is looked up under its 'clf' step.
for best_score, best_params, best_pipeline in best_score_param_estimators:
    classifier_type = type(best_pipeline.named_steps['clf'])
    print([best_score, best_params, classifier_type], end='\n\n')

In [None]:
X_test

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# The first entry holds the best pipeline once the results have been sorted best-first.
best_pipeline = best_score_param_estimators[0][2]

# The pipelines were fitted on label names (idxmax of the one-hot target), so the test
# target has to be converted the same way before it can be compared with the predictions.
y_test_labels = y_test.idxmax(axis=1)
y_pred = best_pipeline.predict(X_test)

print('Classifier:', end=' ')
print(type(best_pipeline.named_steps['clf']), end='\n\n')
print('Accuracy:', end=' ')
# The metric's signature is (y_true, y_pred). With one label per sample, the
# micro-averaged precision it returns first is the same number as accuracy.
print(precision_recall_fscore_support(y_test_labels, y_pred, average='micro')[0])

In [None]:
df.info()


In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau,ModelCheckpoint

# Write the model to 'Best_points.h5' whenever validation accuracy reaches a new maximum.
MCP = ModelCheckpoint(
    'Best_points.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Stop after 3 epochs without a validation-accuracy improvement and restore the best weights.
ES = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 3 epochs without a validation-loss improvement,
# never going below 0.0001.
RLP = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
)

In [None]:
int(X_train.shape[0]/5)


In [None]:
y_train.shape

In [None]:
# NOTE: must pip install keras and tensorflow if using Anaconda distribution
# Dense and Dropout are imported from keras.layers: the keras.layers.core module is not
# available in current Keras releases.
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Reshape
import keras
LR = 0.0001  # NOTE(review): not passed to the optimizer below, which uses Adam's default rate

# Keras needs plain numeric arrays; the frames may mix bool dummy columns with float columns.
X_train_nn = X_train.to_numpy(dtype='float32')
X_test_nn = X_test.to_numpy(dtype='float32')
y_train_nn = y_train.to_numpy(dtype='float32')
y_test_nn = y_test.to_numpy(dtype='float32')

n_features, n_outputs = X_train_nn.shape[1], y_train_nn.shape[1]

# Create model
# Conv1D works on (steps, channels) per sample, so each 2-D feature row is reshaped inside
# the model into n_features steps of one channel. The number of training rows must not
# appear in the input shape. The model itself still accepts ordinary 2-D batches.
model = Sequential()
model.add(Reshape((n_features, 1), input_shape=(n_features,)))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))

# One softmax unit per target column instead of a hard-coded 6.
model.add(Dense(n_outputs, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model to training data
# NOTE(review): the test split doubles as validation data for the callbacks.
model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=64, callbacks=[MCP, ES, RLP],
          validation_data=(X_test_nn, y_test_nn))

# Evaluate model on test data
scores = model.evaluate(X_test_nn, y_test_nn)
print("\n%s: %.14f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print() adds a stray "None".
model.summary()

In [None]:
model.save('model')

In [None]:
X_test.dtypes

In [None]:
X_test.head(1).to_numpy()

In [None]:
# Pass the sample as a 2-D array (one row per sample); a bare nested Python list can be
# interpreted by Keras as a list of separate model inputs.
# NOTE(review): the values must follow the column order of X_train - confirm against X_test.head(1).
sample = np.array([[300, 0, 1, 1, 9, 0, 8, 20.6914731, -73.9277662, 2, 1, 2020, 2, 1, 22]])
model.predict(sample)

In [None]:
y_train.value_counts()