In [9]:
import pandas as pd
from zipfile import ZipFile
import seaborn as sns 
import matplotlib.pyplot as plt

In [10]:
#Opening the zip file and extracting the dataset
ZipFile("pd_nibrs_datasd.zip").extractall()

#Read the dataset into a DataFrame
df = pd.read_csv('pd_nibrs_datasd.csv', low_memory=False)

#Create a new DataFrame keeping only the relevant data
data = df[['case_number', 'occured_on', 'code_section', 'pd_offense_category',
                'neighborhood', 'division', 'block_addr', 'city', 'zip', 'latitude', 'longitude']]

In [11]:
data.head()

Unnamed: 0,case_number,occured_on,code_section,pd_offense_category,neighborhood,division,block_addr,city,zip,latitude,longitude
0,21032576,2021-07-11 20:45:00,148 (A)(1) PC OBSTRUCT/RESIST PEACE OFCR/EMER ...,All Other Offenses,Stockton,Central,100 31st ST,SAN DIEGO,92102.0,32.706474,-117.127455
1,22701933,2022-02-11 22:00:00,459 PC BURGLARY (VEHICLE) (F) ||,Theft From Motor Vehicle,Mira Mesa,Northeastern,10000 MAYA LINDA ROAD,SAN DIEGO,92126.0,32.901073,-117.12012
2,21020993,2021-05-02 06:00:00,273.5 (A) PC SPOUSAL/COHABITANT ABUSE WITH MIN...,Simple Assault,Midway District,Western,3800 Greenwood ST,SAN DIEGO,92110.0,32.754899,-117.206022
3,22041849,2022-09-21 12:46:21,273 A (A) PC WILLFUL CRUELTY TO CHILD: WITH IN...,Aggravated Assault,Unknown,Unknown,,,92254.0,,
4,21001415,2021-01-09 11:23:00,10851 (A) VC OTHER AGENCY VEHICLE THEFT/RECOVE...,Stolen Property Offenses,Egger Highlands,Southern,1100 Walnut AVE,CHULA VISTA,91911.0,32.605965,-117.088752


In [12]:
# Display basic info about the DataFrame
print("DataFrame Info:")
data.info()

# Checking for missing values in each column
print("\nMissing values in each column:")
print(data.isnull().sum())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432566 entries, 0 to 432565
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   case_number          432566 non-null  object 
 1   occured_on           432566 non-null  object 
 2   code_section         432560 non-null  object 
 3   pd_offense_category  432566 non-null  object 
 4   neighborhood         432562 non-null  object 
 5   division             432560 non-null  object 
 6   block_addr           401453 non-null  object 
 7   city                 423900 non-null  object 
 8   zip                  429326 non-null  float64
 9   latitude             422616 non-null  float64
 10  longitude            422616 non-null  float64
dtypes: float64(3), object(8)
memory usage: 36.3+ MB

Missing values in each column:
case_number                0
occured_on                 0
code_section               6
pd_offense_category        0
neighborh

In [13]:
# Convert 'occured_on' to datetime format
data['occured_on'] = pd.to_datetime(data['occured_on'], errors='coerce')

# Extract year, month, day of week, day, hour, and minute from 'occured_on'
data['year'] = data['occured_on'].dt.year
data['month'] = data['occured_on'].dt.month
data['day_of_week'] = data['occured_on'].dt.dayofweek  # Monday=0, Sunday=6
data['day'] = data['occured_on'].dt.day
data['hour'] = data['occured_on'].dt.hour
data['minute'] = data['occured_on'].dt.minute

data = data.drop(columns=['occured_on'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['occured_on'] = pd.to_datetime(data['occured_on'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['occured_on'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['occured_on'].dt.month
A value is trying to be set on a copy of a sli

In [14]:
# Fill NaN values with a placeholder, for example, '0'
data['zip'] = data['zip'].fillna(0).astype(int).astype(str)

In [16]:
# Drop rows that still have missing values in 'latitude' or 'longitude'
data = data.dropna(subset=['latitude', 'longitude'])

In [17]:
# Display basic info about the DataFrame
print("DataFrame Info:")
data.info()

# Checking for missing values in each column
print("\nMissing values in each column:")
print(data.isnull().sum())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 422616 entries, 0 to 432565
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   case_number          422616 non-null  object 
 1   code_section         422611 non-null  object 
 2   pd_offense_category  422616 non-null  object 
 3   neighborhood         422613 non-null  object 
 4   division             422611 non-null  object 
 5   block_addr           399617 non-null  object 
 6   city                 422604 non-null  object 
 7   zip                  422616 non-null  object 
 8   latitude             422616 non-null  float64
 9   longitude            422616 non-null  float64
 10  year                 422616 non-null  int32  
 11  month                422616 non-null  int32  
 12  day_of_week          422616 non-null  int32  
 13  day                  422616 non-null  int32  
 14  hour                 422616 non-null  int32  
 15  minute

In [18]:
# Export the cleaned data to a new CSV
data.to_csv('cleaned_crime_data.csv', index=False)

# Check the result
print("Data has been cleaned and saved as 'cleaned_crime_data.csv'.")

Data has been cleaned and saved as 'cleaned_crime_data.csv'.
