In [3]:
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
from meteostat import Point, Daily
from pathlib import Path

In [4]:
crime_df = pd.read_csv('Crime_Data.csv')

In [5]:
oldest_date = datetime.strptime(crime_df['DateReported'].min(), '%Y/%m/%d %H:%M:%S+00')
newest_date = datetime.strptime(crime_df['DateReported'].max(), '%Y/%m/%d %H:%M:%S+00')

In [6]:
charlottesville = Point(38.03, -78.478889)
data = Daily(charlottesville, oldest_date, newest_date)
weather_data = data.fetch()

In [7]:
crime_df['DateReported'] = pd.to_datetime(crime_df['DateReported'])

In [8]:
crime_by_date = crime_df.groupby(crime_df['DateReported'].dt.date).size().reset_index(name='Crime Count')
crime_by_date.rename(columns={'DateReported': 'Date'}, inplace=True)

In [10]:
crime_by_date.tail()

Unnamed: 0,Date,Crime Count
1819,2025-07-04,12
1820,2025-07-05,9
1821,2025-07-06,9
1822,2025-07-07,8
1823,2025-07-08,14


In [11]:
weather_data = weather_data.reset_index()
weather_data['time'] = pd.to_datetime(weather_data['time'])

In [12]:
crime_by_date['Date'] = pd.to_datetime(crime_by_date['Date'])
weather_data['time'] = weather_data['time'].dt.date
weather_data['time'] = pd.to_datetime(weather_data['time'])

In [18]:
crime_df['Date'] = pd.to_datetime(crime_df['DateReported'].dt.date)


In [13]:
merged_data = pd.merge(crime_by_date, weather_data, left_on='Date', right_on='time', how='inner')


In [20]:
merged_data_all = pd.merge(crime_df, weather_data, left_on='Date', right_on='time', how='left')

In [22]:
merged_data.columns

Index(['Date', 'Crime Count', 'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow',
       'wdir', 'wspd', 'wpgt', 'pres', 'tsun'],
      dtype='object')

In [23]:
merged_data_all.columns

Index(['RecordID', 'Offense', 'IncidentID', 'BlockNumber', 'StreetName',
       'Agency', 'DateReported', 'HourReported', 'ReportingOfficer', 'Date',
       'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun'],
      dtype='object')

In [25]:
complete_data = pd.merge(merged_data_all, merged_data[["Date", "Crime Count"]], on='Date', how='left')

In [29]:
complete_data.columns

Index(['RecordID', 'Offense', 'IncidentID', 'BlockNumber', 'StreetName',
       'Agency', 'DateReported', 'HourReported', 'ReportingOfficer', 'Date',
       'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun', 'Crime Count'],
      dtype='object')

In [31]:
complete_data.head(20)

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,ReportingOfficer,Date,...,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,Crime Count
0,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,1723,"Crowley, Raeann",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
1,2,Shots Fired/Illegal Hunting,202500023699,600.0,10 1/2 ST NW,CPD,2025-07-08 20:48:27+00:00,1648,"Curry, Brian",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
2,3,Sex Offense - Forcible Sodomy,202500023694,200.0,2ND ST NW,CPD,2025-07-08 20:35:32+00:00,1635,"O'Briant, Landon",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
3,4,Vandalism,202500023691,1100.0,E MARKET ST,CPD,2025-07-08 20:22:29+00:00,1622,"Benbow, Lauren",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
4,5,Larceny - Shoplifitng,202500023686,500.0,W MAIN ST,CPD,2025-07-08 20:12:33+00:00,1612,"Benbow, Lauren",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
5,6,Assault Simple,202500023674,700.0,"MONTICELLO AVE, C",CPD,2025-07-08 17:52:45+00:00,1352,"Jones, Mark",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
6,7,Narcotics,202500023670,1200.0,AVON ST,CPD,2025-07-08 17:15:12+00:00,1315,"Lowry, Ryan",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
7,8,Larceny - Shoplifitng,202500023651,400.0,E MARKET ST,CPD,2025-07-08 14:47:12+00:00,1047,"O'Briant, Landon",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
8,9,Larceny - Theft from Building,202500023640,1200.0,EMMET ST N,CPD,2025-07-08 13:19:54+00:00,919,"Mian, Ghulam",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0
9,10,Larceny - From Motor Vehicle,202500023630,900.0,E MARKET ST,CPD,2025-07-08 12:41:05+00:00,841,"Mian, Ghulam",2025-07-08,...,22.8,33.3,0.8,,,7.2,,1016.3,,14.0


In [32]:
complete_data['HourReported'] = complete_data['HourReported'].astype(int) // 100


In [38]:
complete_data[['HourReported', 'tavg', 'prcp', 'wspd', 'Crime Count']].isna().sum()


HourReported    0
tavg            0
prcp            0
wspd            0
Crime Count     0
dtype: int64

In [37]:
# Drop rows with missing 'Crime Count'
complete_data = complete_data.dropna(subset=['Crime Count'])

# Fill missing precipitation (prcp) and wind speed (wspd) with 0
complete_data[['prcp', 'wspd']] = complete_data[['prcp', 'wspd']].fillna(0)

# (Optional) Drop any rows that still have NaN in tavg if they remain
complete_data = complete_data.dropna(subset=['tavg'])


In [44]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [57]:
cat_offense_df = pd.read_csv('Crime_Data_cats.csv')

In [61]:
cat_offense_df['DateReported'] = pd.to_datetime(cat_offense_df['DateReported'])
cat_offense_df['Date'] = pd.to_datetime(cat_offense_df['DateReported'].dt.date)


In [62]:
complete_data = pd.merge(complete_data, cat_offense_df[["Date", "Offense_cat"]], on='Date', how='left')

In [63]:
complete_data.columns

Index(['RecordID', 'Offense', 'IncidentID', 'BlockNumber', 'StreetName',
       'Agency', 'DateReported', 'HourReported', 'ReportingOfficer', 'Date',
       'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun', 'Crime Count', 'Offense_cat'],
      dtype='object')

In [69]:
complete_data = complete_data.dropna(subset=['Crime Count'])
complete_data[['prcp', 'wspd']] = complete_data[['prcp', 'wspd']].fillna(0)
complete_data = complete_data.dropna(subset=['tavg'])
complete_data['Crime Count'] = (
    complete_data['Crime Count']
    .astype(str)
    .str.replace(r'[^0-9.\-]', '', regex=True)  # remove any commas or stray symbols
    .replace('', pd.NA)
    .astype(float)
)

## Linear Model

In [54]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# --- Clean your data ---
complete_data = complete_data.dropna(subset=['Crime Count'])
complete_data[['prcp', 'wspd']] = complete_data[['prcp', 'wspd']].fillna(0)
complete_data = complete_data.dropna(subset=['tavg'])
complete_data['Crime Count'] = (
    complete_data['Crime Count']
    .astype(str)
    .str.replace(r'[^0-9.\-]', '', regex=True)  # remove any commas or stray symbols
    .replace('', pd.NA)
    .astype(float)
)


# --- Ensure numeric types ---
cols_to_convert = ['HourReported', 'tavg', 'prcp', 'wspd', 'Crime Count']
complete_data[cols_to_convert] = complete_data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop any rows that became NaN after conversion
complete_data = complete_data.dropna(subset=cols_to_convert)

# --- Define features and target ---
X = complete_data[['HourReported', 'tavg', 'prcp', 'wspd']]
y = complete_data['Crime Count']

# Add constant term for intercept
X = sm.add_constant(X)

# --- Split and fit ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.astype('float64')
y_train = y_train.astype('float64')

model = sm.OLS(y_train, X_train).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:            Crime Count   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     149.4
Date:                Mon, 10 Nov 2025   Prob (F-statistic):          3.41e-126
Time:                        11:23:43   Log-Likelihood:                -60152.
No. Observations:               20402   AIC:                         1.203e+05
Df Residuals:                   20397   BIC:                         1.204e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           14.0591      0.126    111.529   

## Linear Model dropping insignificant predictors

In [56]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# --- Clean your data ---
complete_data = complete_data.dropna(subset=['Crime Count'])
complete_data[['prcp', 'wspd']] = complete_data[['prcp', 'wspd']].fillna(0)
complete_data = complete_data.dropna(subset=['tavg'])
complete_data['Crime Count'] = (
    complete_data['Crime Count']
    .astype(str)
    .str.replace(r'[^0-9.\-]', '', regex=True)  # remove any commas or stray symbols
    .replace('', pd.NA)
    .astype(float)
)


# --- Ensure numeric types ---
cols_to_convert = [ 'tavg', 'prcp', 'wspd', 'Crime Count']
complete_data[cols_to_convert] = complete_data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop any rows that became NaN after conversion
complete_data = complete_data.dropna(subset=cols_to_convert)

# --- Define features and target ---
X = complete_data[['tavg', 'prcp', 'wspd']]
y = complete_data['Crime Count']

# Add constant term for intercept
X = sm.add_constant(X)

# --- Split and fit ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.astype('float64')
y_train = y_train.astype('float64')

model = sm.OLS(y_train, X_train).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:            Crime Count   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     198.9
Date:                Thu, 20 Nov 2025   Prob (F-statistic):          3.68e-127
Time:                        12:00:21   Log-Likelihood:                -60152.
No. Observations:               20402   AIC:                         1.203e+05
Df Residuals:                   20398   BIC:                         1.203e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.9842      0.102    137.442      0.0

## K Means Clustering

In [64]:
import pandas as pd
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [65]:
complete_data.columns

Index(['RecordID', 'Offense', 'IncidentID', 'BlockNumber', 'StreetName',
       'Agency', 'DateReported', 'HourReported', 'ReportingOfficer', 'Date',
       'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun', 'Crime Count', 'Offense_cat'],
      dtype='object')

In [70]:
X=complete_data[["Crime Count", "tavg","prcp","wspd","HourReported"]]

In [71]:
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=123))
])

pipe.fit(X)
labels = pipe["kmeans"].labels_

In [72]:
complete_data["cluster"] = labels.astype(str)
complete_data.head()

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,ReportingOfficer,Date,...,prcp,snow,wdir,wspd,wpgt,pres,tsun,Crime Count,Offense_cat,cluster
0,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,17,"Crowley, Raeann",2025-07-08,...,0.8,,,7.2,,1016.3,,14.0,Mental Health/Welfare,1
1,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,17,"Crowley, Raeann",2025-07-08,...,0.8,,,7.2,,1016.3,,14.0,Weapon Offenses,1
2,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,17,"Crowley, Raeann",2025-07-08,...,0.8,,,7.2,,1016.3,,14.0,Assault & Violent Crime,1
3,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,17,"Crowley, Raeann",2025-07-08,...,0.8,,,7.2,,1016.3,,14.0,Property Damage,1
4,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025-07-08 21:23:03+00:00,17,"Crowley, Raeann",2025-07-08,...,0.8,,,7.2,,1016.3,,14.0,Theft & Larceny,1


In [73]:
K_values = list(range(1, 11))
wcss = []

for k in K_values:
    pipe.set_params(kmeans__n_clusters=k)
    pipe.fit(X)
    inertia = pipe["kmeans"].inertia_
    wcss.append(inertia)


fig = px.line(
    x=K_values,
    y=wcss,
    markers=True,
    title="Elbow Plot (Pipeline: StandardScaler + KMeans)",
    labels={"x": "Number of Clusters (K)", "y": "WCSS / Inertia"}
)

fig.update_layout(height=500, width=700)
fig.show()

In [74]:
from sklearn.metrics import silhouette_score

In [75]:
sil_scores = []
K_values_sil = list(range(2, 11))

for k in K_values_sil:
    pipe.set_params(kmeans__n_clusters=k)
    pipe.fit(X)

    labels = pipe["kmeans"].labels_

    sil = silhouette_score(X, labels)

    sil_scores.append(sil)


fig = px.line(
    x=K_values_sil,
    y=sil_scores,
    markers=True,
    title="Silhouette Scores (Pipeline: StandardScaler + KMeans)",
    labels={"x": "Number of Clusters (K)", "y": "Silhouette Score"}
)

fig.update_layout(height=500, width=700)
fig.show()
