# Importing required libraries

In [4]:
# Data Wrangling
import pandas as pd
import numpy as np 
import time
import datetime

# Machine Learning - Model Development
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

------

## (A) Data Cleaning

### (A.1) Extract Dataset

In [5]:
# Extract dataset
raw_df = pd.read_csv(r"C:\Users\sci12\Documents\LSE Work\Year 3\NYPD_Complaint_Data_Historic_20231013.csv",low_memory=False)
data_dictionary_df = pd.read_csv(r"C:\Users\sci12\Documents\LSE Work\Year 3\NYPD_Complaint_Historic_DataDictionary.csv",encoding='ISO 8859-1',low_memory=False)

# Cleaning & Reformatting Data Dictionary
data_dictionary_df.columns = data_dictionary_df.iloc[0]
data_dictionary_df = data_dictionary_df[data_dictionary_df.columns[0:2]][1:]


### (A.2) Handling & dealing with Missing Data

In [6]:
# Replace all Nulls with NA
new_df = raw_df.replace(to_replace=["(null)"],value=np.NaN)

# View Missing Data Percentage by Column 
missing_column_df = new_df.isna().mean().sort_values(ascending=False).reset_index()
missing_column_df.columns = ["Column Name","Missing %"]
missing_column_df.merge(data_dictionary_df, on='Column Name', how='left')


Unnamed: 0,Column Name,Missing %,Column Description
0,HADEVELOPT,0.996427,Name of NYCHA housing development of occurrenc...
1,PARKS_NM,0.995902,"Name of NYC park, playground or greenspace of ..."
2,TRANSIT_DISTRICT,0.978055,Transit district in which the offense occurred.
3,STATION_NAME,0.978055,Transit station name
4,HOUSING_PSA,0.92415,Development Level Code
5,SUSP_AGE_GROUP,0.539357,Suspects Age Group
6,SUSP_SEX,0.448012,Suspects Sex Description
7,SUSP_RACE,0.432053,Suspects Race Description
8,CMPLNT_TO_DT,0.2129,Ending date of occurrence for the reported eve...
9,CMPLNT_TO_TM,0.212298,Ending time of occurrence for the reported eve...


### (A.3) Understanding Dataset

In [7]:
# Understanding Dataset
print(f"Observations: {new_df.shape[0]}")
print(f"Variables: {new_df.shape[1]}")

# View Cleaned Dataset
new_df.head() 

Observations: 8353049
Variables: 35


Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,10600119,01/01/2002,11:00:00,02/25/2006,11:00:00,,03/06/2006,104,RAPE,157.0,...,M,,,,,,,18-24,WHITE,F
1,11052575,04/22/2005,02:00:00,,,,04/22/2006,110,GRAND LARCENY OF MOTOR VEHICLE,441.0,...,,,,,,,,25-44,BLACK,M
2,10832306,07/11/2005,20:00:00,03/29/2006,12:00:00,,03/29/2006,353,UNAUTHORIZED USE OF A VEHICLE,462.0,...,M,,,,,,,45-64,WHITE,M
3,10107192,10/19/2005,20:30:00,,,,01/13/2006,126,MISCELLANEOUS PENAL LAW,461.0,...,F,,,,,,,25-44,BLACK,F
4,23893731,12/04/2005,09:00:00,08/09/2006,17:00:00,,08/09/2006,109,GRAND LARCENY,405.0,...,,,,,,,,25-44,WHITE,F


### (A.4) Cleaning all Rows


In [8]:
# Removing Unused Features
cleaned_df = new_df[["VIC_AGE_GROUP","VIC_SEX","VIC_RACE","CMPLNT_FR_TM","Latitude","Longitude","OFNS_DESC","LAW_CAT_CD"]]

# Filling in Missing Values
columns_to_fill_na = ["VIC_AGE_GROUP","VIC_SEX","VIC_RACE"]
for column in columns_to_fill_na:
    cleaned_df[f"{column}"] = cleaned_df[f"{column}"].fillna("UNKONWN")

# Removing Missing Rows in Labels
cleaned_df = cleaned_df.dropna(axis=0)

# Cleaning VIC_AGE_GROUP - Age
age_group_patterns = ['18-24', '25-44', '45-64', '65+', '<18', 'UNKNOWN']
cleaned_df = cleaned_df[cleaned_df["VIC_AGE_GROUP"].isin(age_group_patterns)]
age_mapping_df = pd.DataFrame({"VIC_AGE_GROUP": sorted(cleaned_df["VIC_AGE_GROUP"].unique().tolist()),
                               "VIC_AGE_GROUP_NUM":[i for i in range(cleaned_df["VIC_AGE_GROUP"].nunique())]})
cleaned_df = pd.merge(cleaned_df, age_mapping_df, on='VIC_AGE_GROUP', how='left')

# Cleaning CMPLNT_FR_TM & CMPLNT_HR - Time & Hour
cleaned_df["CMPLNT_FR_TM"] = pd.to_datetime(cleaned_df["CMPLNT_FR_TM"], format='%H:%M:%S')
cleaned_df["CMPLNT_HR"] = pd.to_datetime(cleaned_df["CMPLNT_FR_TM"], format='%H:%M:%S').dt.hour
cleaned_df["CMPLNT_MIN"] = pd.to_datetime(cleaned_df["CMPLNT_FR_TM"], format='%H:%M:%S').dt.minute

# Cleaning VIC_SEX - Gender
gender_map = {"F":"FEMALE", "M":"MALE", "D":"DECLINE TO STATE", "E":"NON-BINARY/OTHER","U":"UNKNOWN", "L":"LGBTQ+"}
cleaned_df["VIC_SEX"] = cleaned_df["VIC_SEX"].map(gender_map)

# Cleaning LAW_CAT_CD - Crime Levels
crime_level_map = {"FELONY": 3,"MISDEMEANOR": 2,"VIOLATION": 1}
cleaned_df["LAW_CAT_CD"] = cleaned_df["LAW_CAT_CD"].map(crime_level_map)

# Cleaning OFNS_DESC - Offenses

# Function to apply the replacements
def replace_text(text):
    for pattern in ['LARCENY','ASSAULT','MURDER','HARRASSMENT','FRAUD','OFFENSE', 'ESCAPE']:
        if pattern in text:
            text = pattern
    return text

# Replace Duplicated & Weird Names
cleaned_df["OFNS_DESC"] = cleaned_df["OFNS_DESC"].replace({"OFF. AGNST PUB ORD SENSBLTY &.":"OFFENSE",
                                 "INTOXICATED & IMPAIRED DRIVING":"INTOXICATED/IMPAIRED DRIVING",
                                 "ADMINISTRATIVE CODES":"ADMINISTRATIVE CODE"})

# Apply the function to the Series
cleaned_df["OFNS_DESC"] = cleaned_df["OFNS_DESC"].apply(replace_text)

# Cleaning OFNS_DESC - Numerical Conversion
offense_mapping_df = pd.DataFrame({"OFNS_DESC": sorted(cleaned_df["OFNS_DESC"].unique().tolist()),
                                   "OFNS_DESC_NUM":[i for i in range(cleaned_df["OFNS_DESC"].nunique())]})
cleaned_df = pd.merge(cleaned_df, offense_mapping_df, on='OFNS_DESC', how='left')

# Reset index
cleaned_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[f"{column}"] = cleaned_df[f"{column}"].fillna("UNKONWN")


### (A.5) View Final Cleaned Dataset

In [9]:
# Understanding Dataset
print(f"Remaining Observations: {cleaned_df.shape[0]}")
print(f"Remaining Variables: {cleaned_df.shape[1]}")

# View Cleaned Dataset
cleaned_df.head() 


Remaining Observations: 6721798
Remaining Variables: 12


Unnamed: 0,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,CMPLNT_FR_TM,Latitude,Longitude,OFNS_DESC,LAW_CAT_CD,VIC_AGE_GROUP_NUM,CMPLNT_HR,CMPLNT_MIN,OFNS_DESC_NUM
0,45-64,FEMALE,BLACK,1900-01-01 16:35:00,40.689001,-73.945027,ASSAULT,2,2,16,35,5
1,18-24,FEMALE,BLACK,1900-01-01 17:15:00,40.698474,-73.917769,HARRASSMENT,1,0,17,15,22
2,<18,FEMALE,BLACK,1900-01-01 00:01:00,40.86947,-73.879861,RAPE,3,4,0,1,44
3,<18,FEMALE,BLACK,1900-01-01 19:00:00,40.837842,-73.919628,RAPE,3,4,19,0,44
4,<18,FEMALE,BLACK,1900-01-01 05:00:00,40.810352,-73.924942,RAPE,3,4,5,0,44


## Count Plots

In [7]:
# offense_mean_df = cleaned_df[["Latitude","Longitude","CMPLNT_FR_TM","LAW_CAT_CD"]].groupby(["Latitude","Longitude","CMPLNT_FR_TM"])["LAW_CAT_CD"].mean().to_frame().reset_index()
# top_3_offense_df = cleaned_df[["Latitude","Longitude","CMPLNT_FR_TM","OFNS_DESC"]].groupby(["Latitude","Longitude"])['OFNS_DESC'].apply(lambda x: x.value_counts().index[:3].tolist()).reset_index()
# offense_size_df = cleaned_df[["Latitude","Longitude","CMPLNT_FR_TM","LAW_CAT_CD"]].groupby(["Latitude","Longitude","CMPLNT_FR_TM"])["LAW_CAT_CD"].mean().to_frame().reset_index()
# bubble_plot_df = 

## () Model Building

In [52]:
import pickle
# Specify the file path where you want to save the model
file_path = 'xgboost.pkl'

# Open the file in binary write mode and save the model
with open(file_path, 'wb') as file:
    pickle.dump(clf, file)

### fsdfsf

In [47]:
# Defining Variables and Labels
X = cleaned_df[["VIC_AGE_GROUP_NUM","VIC_SEX","VIC_RACE","CMPLNT_HR","CMPLNT_MIN","Latitude","Longitude"]]
y = cleaned_df["OFNS_DESC_NUM"]

# Encoding Categorical Features (e.g., using one-hot encoding)
X = pd.get_dummies(X, columns=["VIC_AGE_GROUP_NUM","VIC_SEX","VIC_RACE"])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train an XGBoost classifier
clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy and generate a classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the model's accuracy and classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.38148710166919575
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.20      0.00      0.00      2933
           2       1.00      0.01      0.01       187
           3       0.22      0.03      0.05       140
           4       0.20      0.00      0.00      1399
           5       0.34      0.53      0.41    232750
           6       0.00      0.00      0.00       492
           7       0.35      0.03      0.06     45506
           8       0.00      0.00      0.00        77
           9       0.42      0.07      0.12    131079
          10       0.00      0.00      0.00      6465
          11       0.37      0.78      0.50     21295
          12       0.25      0.09      0.14     12604
          13       0.00      0.00      0.00        69
          14       0.00      0.00      0.00         6
          15       0.00      0.00      0.00        60
          16       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


### sdf

In [None]:
# # Use the trained model to predict crime probabilities for a new input
# new_input = pd.DataFrame(data={'Age': [25], 'Sex_Male': [1], 'Location_Suburban': [1], 'Time_Night': [1]})
# crime_probabilities = clf.predict_proba(new_input)
# print("Predicted Crime Probabilities:")
# for crime, prob in zip(clf.classes_, crime_probabilities[0]):
#     print(f"{crime}: {prob * 100:.2f}%")

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# Define your features (X) and target variable (y)
X = cleaned_df[["VIC_AGE_GROUP","VIC_SEX","VIC_RACE","CMPLNT_HR","CMPLNT_MIN","Latitude","Longitude"]]
y = cleaned_df["OFNS_DESC"]

# Encode categorical features
label_encoders = {}
categorical_columns = ["VIC_AGE_GROUP","VIC_SEX","VIC_RACE"]
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

# One-hot encode the target variable
y = pd.get_dummies(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network model
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(y.columns), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Make predictions on the test set
y_pred = model.predict(X_test)

# # Get the top 3 crimes and their corresponding probabilities for a user input
# user_input = np.array([[25, 'Suburban', 'Night']])  # Replace with user's input
# user_input[:, 1] = label_encoders['Location'].transform(user_input[:, 1])
# user_pred = model.predict(user_input)

# Calculate accuracy and generate a classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the model's accuracy and classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

# # Sort and get the top 3 crimes and their probabilities
# top_crimes = np.argsort(user_pred)[:, -3:][0][::-1]
# top_probabilities = user_pred[0][top_crimes]

# # Decode the top crimes back to their original labels
# top_crimes = [y.columns[i] for i in top_crimes]

# # Print the top 3 crimes and their probabilities
# for crime, probability in zip(top_crimes, top_probabilities):
#     print(f'Crime: {crime}, Probability: {probability:.2f}')

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

In [12]:
cleaned_df.to_csv("cleaned_df.csv")