In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sqlite3
from scipy import stats
from sklearn.cluster import KMeans
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
#Assign database file to a variable
db_file = "Data/crimedata.db"

#Connect to database file
conn = sqlite3.connect(db_file)



In [3]:
#Query the la crime prediction data table and put into a pandas dataframe
#To cut down on the number of rows, this analysis will be done without year and month
crime_df = pd.read_sql_query("select count(*) as cnt, hour, area_code, area_name, crime_code, crime_code_description \
                               from new_crime_predict group by hour, area_code, area_name, crime_code, \
                              crime_code_description;", conn)
crime_df.shape

(5082, 6)

In [4]:
crime_df.head()

Unnamed: 0,cnt,hour,area_code,area_name,crime_code,crime_code_description
0,27,0,1,Central,121,"RAPE, FORCIBLE"
1,71,0,1,Central,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT"
2,28,0,1,Central,310,BURGLARY
3,74,0,1,Central,330,BURGLARY FROM VEHICLE
4,3,0,1,Central,440,SHOPLIFTING - PETTY THEFT ($950 & UNDER)


In [5]:
#Let's try to predict the type of crime based on the area and hour
#First, let's drop some crime codes to start out with a lower number

#choosing 121-rape, 624-battery/simple assault, 510-vehicle stolen, 230-assault with a deadly weapon
crime_array = ['121', '624', '510', '230']

crime_df = crime_df.loc[crime_df['crime_code'].isin(crime_array)]

#Get rid of rows where the hour is 0.  We aren't sure what this represents
crime_df = crime_df.loc[crime_df['hour'] != "0"]

#Convert the hour into a number
crime_df['hour'] = crime_df['hour'].apply(pd.to_numeric, errors='coerce')
crime_df.head()


Unnamed: 0,cnt,hour,area_code,area_name,crime_code,crime_code_description
209,13,1,1,Central,121,"RAPE, FORCIBLE"
210,94,1,1,Central,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT"
217,17,1,1,Central,510,VEHICLE - STOLEN
218,175,1,1,Central,624,BATTERY - SIMPLE ASSAULT
220,7,1,10,West Valley,121,"RAPE, FORCIBLE"


In [21]:
#Now set the X and y values

#First convert the hour and crime code to numeric
crime_df['hour'] = crime_df['hour'].apply(pd.to_numeric, errors='coerce')
crime_df['crime_code'] = crime_df['crime_code'].apply(pd.to_numeric, errors='coerce')

X = crime_df[['hour', 'area_name']]

X = pd.get_dummies(X)

y = crime_df["crime_code"].values.reshape(-1, 1)


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
y_train


(1420, 22)
(474, 22)
(1420, 1)
(474, 1)


array([[230],
       [624],
       [121],
       ...,
       [230],
       [230],
       [624]], dtype=int64)

In [28]:
hour_array = X_train['hour'].tolist()

crime_train = [val for sublist in y_train for val in sublist]

print(X_train.shape)
print(y_train.shape)



(1420, 22)
(1420, 1)


In [29]:
#Fit data to a LogisticRegression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

#Plot the residuals
#plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c="blue", label="Training Data")
#plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c="orange", label="Testing Data", alpha=0.2, linewidths=1)
#plt.legend()
#plt.xlim(-0.15, .15)
#plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
#plt.title("Residual Plot")
#plt.show()

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
from sklearn.metrics import mean_squared_error

predictions = model.predict(X_test)
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 79883.77637130802, R2: 0.16033755274261605


In [None]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_true, y_predict)
score

In [None]:
fig1 = plt.figure(figsize=(12, 6))
axes1 = fig1.add_subplot(1, 2, 1)
axes2 = fig1.add_subplot(1, 2, 2)

axes1.set_title("Original Data")
axes2.set_title("Scaled Data")

maxx = X_train["hour"].max()
maxy = y_train.max()
#axes1.set_xlim(-maxx + 1, maxx + 1)
#axes1.set_ylim(-maxy + 1, maxy + 1)

maxy = y_train.max()

axes2.set_xlim(-2, 2)
axes2.set_ylim(-2, 2)

def set_axes(ax):
    ax.spines['left'].set_position('center')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_position('center')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    
set_axes(axes1)
set_axes(axes2)

axes1.scatter(hour_array, crime_train)
axes2.scatter(X_train_scaled[:,0], y_train_scaled[:])

In [None]:
#Convert data to numeric
crime_df['area_code'] = crime_df['area_code'].apply(pd.to_numeric, errors='coerce')
crime_df['victim_age'] = crime_df['victim_age'].apply(pd.to_numeric, errors='coerce')
crime_df['cnt'] = crime_df['cnt'].apply(pd.to_numeric, errors='coerce')
crime_df['crime_code'] = crime_df['crime_code'].apply(pd.to_numeric, errors='coerce')





In [None]:
#Standardize
clmns = ['cnt', 'area_code', 'crime_code', 'victim_age']
df_tr_std = stats.zscore(crime_df[clmns])



In [None]:
#Cluster the data
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_tr_std)
labels = kmeans.labels_

#Glue back to originaal data
crime_df['clusters'] = labels

#Add the column into our list
clmns.extend(['clusters'])

#Lets analyze the clusters
print(crime_df[clmns].groupby(['clusters']).mean())

In [None]:
#Scatter plot of Crime Code and Area Code
sns.lmplot('crime_code', 'victim_age', 
           data=crime_df, 
           fit_reg=False, 
           hue="clusters",  
           scatter_kws={"marker": "D", 
                        "s": 100})
plt.title('Area Code vs Crime Type')
plt.xlabel('Area Code')
plt.ylabel('Crime Code')

In [None]:
crime_df = crime_df.drop(['clusters'], axis=1)
crime_df.head()

In [None]:
#Lets try to predict the type of crime based on a victim's age

