In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [57]:
#Load dataset from ../raw_data/
df = pd.read_csv("../raw_data/preprocessed_chicago.csv")
df.head(5)

Unnamed: 0,WARD,TIME ENCODED,MONTH_SIN,MONTH_COS,WEEKEND,DATE OF OCCURRENCE,OFFENSES,LATITUDE,LONGITUDE
0,35,6,0.5,0.8660254,0,2024-01-16 01:00:00,THEFT OVER $500,41.931844,-87.722951
1,42,4,-2.449294e-16,1.0,1,2023-12-31 16:30:00,BATTERY,41.888994,-87.626935
2,16,3,0.5,0.8660254,1,2024-01-06 12:50:00,DECEPTIVE PRACTICE,41.793299,-87.664566
3,1,3,0.8660254,-0.5,1,2024-04-07 13:56:00,THEFT OVER $500,41.906797,-87.671862
4,49,4,1.0,6.123234000000001e-17,0,2024-03-22 15:30:00,THEFT UNDER $500,42.007825,-87.670842


In [58]:
df['OFFENSES'].value_counts()

OFFENSES
BATTERY                45766
THEFT OVER $500        31671
CRIMINAL DAMAGE        28638
THEFT UNDER $500       28400
ASSAULT                23508
MOTOR VEHICLE THEFT    22331
OTHER OFFENSE          16773
DECEPTIVE PRACTICE     15059
Name: count, dtype: int64

In [59]:
# Preprocessing
# Encode the target variable
label_encoder = LabelEncoder()
df['OFFENSES'] = label_encoder.fit_transform(df['OFFENSES'])

# Convert date to datetime and extract features like day of week
df['DATE OF OCCURRENCE'] = pd.to_datetime(df['DATE OF OCCURRENCE'])
df['DAY_OF_WEEK'] = df['DATE OF OCCURRENCE'].dt.dayofweek

In [60]:
df['OFFENSES'].value_counts()

OFFENSES
1    45766
6    31671
2    28638
7    28400
0    23508
4    22331
5    16773
3    15059
Name: count, dtype: int64

SVM:

In [61]:
# Take a smaller sample for testing
df_sample = df.sample(n=10, random_state=42)  # Adjust n for the number of samples you want to test
X = df_sample[['WARD', 'TIME ENCODED', 'MONTH_SIN', 'MONTH_COS', 'WEEKEND', 'LATITUDE', 'LONGITUDE', 'DAY_OF_WEEK']]
y = df_sample['OFFENSES']

In [62]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

           2       0.00      0.00      0.00       2.0
           6       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision-Tree adapted:

In [63]:
# Select features and target
X = df[['WARD', 'TIME ENCODED', 'MONTH_SIN', 'MONTH_COS', 'WEEKEND', 'LATITUDE', 'LONGITUDE']]
y = df['OFFENSES']

In [64]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [65]:
# Train the Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.2208377851800641
Classification Report:
               precision    recall  f1-score   support

           0       0.15      0.16      0.15      7010
           1       0.28      0.28      0.28     13718
           2       0.19      0.19      0.19      8679
           3       0.15      0.15      0.15      4580
           4       0.16      0.16      0.16      6659
           5       0.13      0.14      0.13      5053
           6       0.37      0.37      0.37      9471
           7       0.19      0.18      0.18      8474

    accuracy                           0.22     63644
   macro avg       0.20      0.20      0.20     63644
weighted avg       0.22      0.22      0.22     63644



In [66]:
# # Set up the parameter grid for hyperparameter tuning
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 5, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Perform grid search
# clf = DecisionTreeClassifier(random_state=42)
# grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Best parameters and model evaluation
# best_clf = grid_search.best_estimator_
# y_pred = best_clf.predict(X_test)

# print("Best Parameters:", grid_search.best_params_)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))
