# TeamA Machine Learning Project
## Los Angeles Crime Data 2020-Present Desiscion Tree Approach
### Saulo Guzman and Alex Philipsen

---

Importing Libraries and Data

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
uneditedDF = pd.read_csv("Crime_Data_from_2020_to_Present.csv", nrows=10000)

---
## Cleaning Data
We are making use of the features of latitude, longitude, area name, time of day, and crime code.

In [3]:
crimeDF = uneditedDF[['AREA', 'TIME OCC', 'Crm Cd','LAT', 'LON', 'Vict Age']]
print(crimeDF.head(1))

   AREA  TIME OCC  Crm Cd      LAT       LON  Vict Age
0     7      2130     510  34.0375 -118.3506         0


---
## Implementing  Classifier Decision tree

In [4]:
# Splitting data between train, test, and valid sets
trainSet = crimeDF.sample(frac=0.4)
crimeDF = crimeDF.drop(trainSet.index)
testSet = crimeDF.sample(frac=0.3)
crimeDF = crimeDF.drop(testSet.index)
validSet = crimeDF.sample(frac=0.3)
crimeDF = crimeDF.drop(validSet.index)

# Splitting each set into X and y

X_train = trainSet[['AREA', 'TIME OCC', 'LAT', 'LON', 'Vict Age']]
y_train = trainSet['Crm Cd']
X_test = testSet[['AREA', 'TIME OCC', 'LAT', 'LON', 'Vict Age']]
y_test = testSet['Crm Cd']
X_valid = validSet[['AREA', 'TIME OCC', 'LAT', 'LON', 'Vict Age']]
y_valid = validSet['Crm Cd']

areaCodeToName = (uneditedDF[['AREA', 'AREA NAME']]
                    .drop_duplicates()
                    .sort_values(by='AREA')
                    )

In [5]:

# Initialize and train the decision tree classifier
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Predict on the test set
y_pred = dt.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Predict on the validation set
y_valid_pred = dt.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {valid_accuracy * 100:.2f}%')

Accuracy: 23.06%
Validation Accuracy: 22.78%


In [6]:

feature_importances = dt.feature_importances_
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})


importances_df = importances_df.sort_values(by='Importance', ascending=False)

print(importances_df)

    Feature  Importance
4  Vict Age    0.658242
1  TIME OCC    0.232405
2       LAT    0.055317
0      AREA    0.039328
3       LON    0.014708


---
## Implementing  Classifier Decision tree while binning feature

In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
area_encoded = encoder.fit_transform(X_train[['AREA']]).toarray()
area_feature_names = encoder.get_feature_names_out(['AREA'])

# Replace 'AREA' in X_train, X_test, X_valid
X_train = pd.concat([X_train.drop('AREA', axis=1), pd.DataFrame(area_encoded, columns=area_feature_names)], axis=1)
X_test = pd.concat([X_test.drop('AREA', axis=1), pd.DataFrame(encoder.transform(X_test[['AREA']]).toarray(), columns=area_feature_names)], axis=1)
X_valid = pd.concat([X_valid.drop('AREA', axis=1), pd.DataFrame(encoder.transform(X_valid[['AREA']]).toarray(), columns=area_feature_names)], axis=1)

In [8]:
bins = [0, 600, 1200, 1800, 2400]
labels = ['Late Night', 'Morning', 'Afternoon', 'Evening']

# Add 'Time Bin' column
X_train['Time Bin'] = pd.cut(X_train['TIME OCC'], bins=bins, labels=labels, right=False)
X_test['Time Bin'] = pd.cut(X_test['TIME OCC'], bins=bins, labels=labels, right=False)
X_valid['Time Bin'] = pd.cut(X_valid['TIME OCC'], bins=bins, labels=labels, right=False)

In [9]:
encoder_time_bin = OneHotEncoder()
time_bin_encoded = encoder_time_bin.fit_transform(X_train[['Time Bin']]).toarray()
time_bin_feature_names = encoder_time_bin.get_feature_names_out(['Time Bin'])

# Replace 'Time Bin' with its encoded features
X_train = pd.concat([X_train.drop('Time Bin', axis=1), pd.DataFrame(time_bin_encoded, columns=time_bin_feature_names, index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop('Time Bin', axis=1), pd.DataFrame(encoder_time_bin.transform(X_test[['Time Bin']]).toarray(), columns=time_bin_feature_names, index=X_test.index)], axis=1)
X_valid = pd.concat([X_valid.drop('Time Bin', axis=1), pd.DataFrame(encoder_time_bin.transform(X_valid[['Time Bin']]).toarray(), columns=time_bin_feature_names, index=X_valid.index)], axis=1)

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['LAT', 'LON']] = scaler.fit_transform(X_train[['LAT', 'LON']])
X_test[['LAT', 'LON']] = scaler.transform(X_test[['LAT', 'LON']])
X_valid[['LAT', 'LON']] = scaler.transform(X_valid[['LAT', 'LON']])

In [11]:
X_train = X_train.dropna()
X_test = X_test.dropna()
X_valid = X_valid.dropna()

In [12]:
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]
y_valid = y_valid.loc[X_valid.index]

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 46.18%


In [14]:
#After binning the data set.
# Initialize and train the decision tree classifier
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Predict on the test set
y_pred = dt.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Predict on the validation set
y_valid_pred = dt.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {valid_accuracy * 100:.2f}%')

Accuracy: 54.46%
Validation Accuracy: 56.33%
