In [None]:
import pandas as pd

# Load the raw NYPD historic arrests export
arrests_df = pd.read_csv('NYPD_Arrests_Data_Historic.csv')

# Columns that are not needed anywhere in this notebook.
# NOTE: PD_CD, PD_DESC, KY_CD and LAW_CODE are deliberately NOT dropped here.
# The PCA cell further down combines them (together with OFNS_DESC and
# LAW_CAT_CD) into one composite feature and drops them itself afterwards;
# removing them at load time makes that cell fail with a KeyError on a fresh
# Restart & Run All.
columns_to_drop = [
    'ARREST_DATE',
    'ARREST_KEY',
    'X_COORD_CD',
    'Y_COORD_CD',
    'Lon_Lat',
    'JURISDICTION_CODE',
    'ARREST_BORO',
    'ARREST_PRECINCT',
]

# Drop the specified columns
arrests_df = arrests_df.drop(columns=columns_to_drop)

# Columns that must be populated for a row to be usable
offense_columns = ['LAW_CAT_CD', 'OFNS_DESC', 'Latitude', 'Longitude']

# Drop rows with NaN values in the specified columns - only very small % of rows contained NaN
arrests_df = arrests_df.dropna(subset=offense_columns)

# Verify that rows with NaN have been removed (head() alone cannot show this)
assert arrests_df[offense_columns].notna().all().all(), "NaN values remain in offense_columns"

arrests_df.head()

In [None]:
# Inspect which category codes actually occur in 'LAW_CAT_CD'
unique_law_cat_cd = arrests_df['LAW_CAT_CD'].unique()

print("Unique LAW_CAT_CD values:")
for value in unique_law_cat_cd:
    print(value)

# Count the rows that carry a placeholder / missing category so the size of
# the clean-up below is visible in the cell output.
count_condition_9 = int((arrests_df['LAW_CAT_CD'] == '9').sum())
count_condition_null = int(arrests_df['LAW_CAT_CD'].isnull().sum())
# Nulls may also be represented as the literal string '(null)'
count_condition_null_string = int((arrests_df['LAW_CAT_CD'] == '(null)').sum())

print(f"Rows with LAW_CAT_CD == '9': {count_condition_9}")
print(f"Rows with LAW_CAT_CD NaN: {count_condition_null}")
print(f"Rows with LAW_CAT_CD == '(null)': {count_condition_null_string}")

# Remove rows with bad data. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write to a
# slice of the previous frame (which raises SettingWithCopyWarning).
bad_law_categories = ['9', '(null)']
arrests_df = arrests_df[~arrests_df['LAW_CAT_CD'].isin(bad_law_categories)].copy()

# Ordinal encoding of the legal category: higher number = more serious.
law_category_mapping = {
    'F': 3,  # Felony
    'M': 2,  # Misdemeanor
    'V': 1,  # Violation/Infraction
    'I': 1,  # Infraction or a specific category (assumed as minor here)
    # The two entries below cannot match any row here because those rows
    # were filtered out just above; they are kept so the mapping stays complete.
    '9': 0,  # Placeholder or undefined (adjust based on dataset context)
    '(null)': 0  # Treat null as missing or undefined
}

# Any code not present in the mapping becomes NaN.
arrests_df['LAW_CAT_CD'] = arrests_df['LAW_CAT_CD'].map(law_category_mapping)

arrests_df.head()

In [None]:
# Hand-assigned severity score for each OFNS_DESC value, on a 0-10 scale
# (10 = most severe; 0 is used only for the '(null)' placeholder).
# Several keys are truncated spellings of another key (for example
# "CRIMINAL MISCHIEF & RELATED OF" vs. "CRIMINAL MISCHIEF & RELATED OFFENSES");
# each truncated spelling carries the same score as its full-length form.
# Keys must match the raw text exactly, including spelling quirks such as
# "HARRASSMENT 2".
severity_mapping = {
    "RAPE": 10,
    "SEX CRIMES": 8,
    "ARSON": 8,
    "JOSTLING": 2,
    "ESCAPE 3": 5,
    "ASSAULT 3 & RELATED OFFENSES": 6,
    "FORCIBLE TOUCHING": 7,
    "CRIMINAL TRESPASS": 3,
    "CRIMINAL MISCHIEF & RELATED OFFENSES": 4,
    "DANGEROUS DRUGS": 5,
    "FELONY ASSAULT": 9,
    "OTHER TRAFFIC INFRACTION": 1,
    "OTHER STATE LAWS (NON PENAL LAW)": 2,
    "PETIT LARCENY": 3,
    "OTHER STATE LAWS": 2,
    "OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV": 3,
    "POSSESSION OF STOLEN PROPERTY 5": 4,
    "INTOXICATED & IMPAIRED DRIVING": 6,
    "PROSTITUTION & RELATED OFFENSES": 4,
    "OFFENSES AGAINST PUBLIC ADMINISTRATION": 5,
    "BURGLARY": 7,
    "DANGEROUS WEAPONS": 8,
    "OTHER OFFENSES RELATED TO THEFT": 3,
    "BURGLAR'S TOOLS": 4,
    "GRAND LARCENY": 6,
    "ROBBERY": 7,
    "INTOXICATED/IMPAIRED DRIVING": 6,
    "MISCELLANEOUS PENAL LAW": 3,
    "OFFENSES INVOLVING FRAUD": 5,
    "THEFT-FRAUD": 5,
    "VEHICLE AND TRAFFIC LAWS": 2,
    "ANTICIPATORY OFFENSES": 4,
    "MOVING INFRACTIONS": 1,
    "POSSESSION OF STOLEN PROPERTY": 4,
    "DISORDERLY CONDUCT": 2,
    "GRAND LARCENY OF MOTOR VEHICLE": 7,
    "FORGERY": 5,
    "FRAUDS": 5,
    "HARRASSMENT 2": 3,
    "MURDER & NON-NEGL. MANSLAUGHTER": 10,
    "OFFENSES RELATED TO CHILDREN": 8,
    "HOMICIDE-NEGLIGENT-VEHICLE": 8,
    "UNAUTHORIZED USE OF A VEHICLE 3 (UUV)": 4,
    "ALCOHOLIC BEVERAGE CONTROL LAW": 2,
    "OFFENSES AGAINST THE PERSON": 7,
    "GAMBLING": 2,
    "UNLAWFUL POSS. WEAP. ON SCHOOL GROUNDS": 8,
    "FRAUDULENT ACCOSTING": 4,
    "HARASSMENT": 3,
    "KIDNAPPING & RELATED OFFENSES": 9,
    "OFFENSES AGAINST PUBLIC SAFETY": 7,
    "ADMINISTRATIVE CODE": 2,
    "LOITERING": 1,
    "LOITERING FOR DRUG PURPOSES": 2,
    "CHILD ABANDONMENT/NON SUPPORT 1": 6,
    "LOITERING/GAMBLING (CARDS, DICE, ETC)": 1,
    "HOMICIDE-NEGLIGENT,UNCLASSIFIED": 8,
    "DISRUPTION OF A RELIGIOUS SERVICE": 2,
    "PARKING OFFENSES": 1,
    "FORTUNE TELLING": 1,
    "OFFENSES AGAINST MARRIAGE UNCLASSIFIED": 2,
    "F.C.A. P.I.N.O.S.": 3,
    "ABORTION": 4,
    "CRIMINAL MISCHIEF & RELATED OF": 4,
    "OFFENSES AGAINST PUBLIC ADMINI": 5,
    "OTHER OFFENSES RELATED TO THEF": 3,
    "OFF. AGNST PUB ORD SENSBLTY &": 3,
    "UNAUTHORIZED USE OF A VEHICLE": 4,
    "AGRICULTURE & MRKTS LAW-UNCLASSIFIED": 2,
    "MURDER & NON-NEGL. MANSLAUGHTE": 10,
    "(null)": 0,  # Treating null as missing
    "NYS LAWS-UNCLASSIFIED FELONY": 6,
    "HOMICIDE-NEGLIGENT,UNCLASSIFIE": 8,
    "OTHER STATE LAWS (NON PENAL LA": 2,
    "ENDAN WELFARE INCOMP": 6,
    "THEFT OF SERVICES": 3,
    "CHILD ABANDONMENT/NON SUPPORT": 6,
    "CANNABIS RELATED OFFENSES": 3,
    "UNDER THE INFLUENCE, DRUGS": 5,
    "KIDNAPPING": 9,
    "NEW YORK CITY HEALTH CODE": 3,
    "LOITERING/GAMBLING (CARDS, DIC": 1,
    "DISRUPTION OF A RELIGIOUS SERV": 2,
    "UNLAWFUL POSS. WEAP. ON SCHOOL": 8,
    "NYS LAWS-UNCLASSIFIED VIOLATION": 2,
    "ADMINISTRATIVE CODES": 2,
    "FELONY SEX CRIMES": 9,
    "KIDNAPPING AND RELATED OFFENSES": 9,
    "LOITERING,BEGGING": 1
}

# Series.map() turns every value that is not a key of severity_mapping into
# NaN without any warning, so list such values before they are lost.
unmapped_offenses = sorted(
    set(arrests_df['OFNS_DESC'].dropna().unique()) - set(severity_mapping),
    key=str,
)
if unmapped_offenses:
    print(
        f"Warning: {len(unmapped_offenses)} OFNS_DESC value(s) have no severity "
        f"score and will become NaN: {unmapped_offenses}"
    )

# Replace each offense description with its severity score.
# NOTE: this overwrites OFNS_DESC in place (no new column is created), so from
# here on the column holds numbers rather than text.
arrests_df['OFNS_DESC'] = arrests_df['OFNS_DESC'].map(severity_mapping)

# Optional: Save the modified DataFrame to a new CSV file
arrests_df.to_csv('reduced_NYPD_Arrests_Data.csv', index=False)

arrests_df.head()

In [None]:
import matplotlib.pyplot as plt

# NOTE: OFNS_DESC no longer contains text descriptions at this point; the
# severity-mapping cell replaced every description with its numeric severity
# score. The chart therefore shows how often each severity score occurs, and
# the title and axis label say so.
offense_counts = arrests_df['OFNS_DESC'].value_counts()

# Plot the (up to) 10 most frequent severity scores
fig, ax = plt.subplots(figsize=(12, 8))
offense_counts.head(10).plot(kind='bar', ax=ax)
ax.set_title('Top 10 Most Common Offense Severity Scores')
ax.set_xlabel('Offense Severity Score')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Columns that are collapsed into a single composite offense feature
columns_to_combine = ['PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD']

# Fail early with an actionable message instead of an opaque KeyError from
# dropna() when an earlier cell has already removed one of these columns.
missing_columns = [col for col in columns_to_combine if col not in arrests_df.columns]
if missing_columns:
    raise KeyError(
        f"arrests_df is missing column(s) required for the composite feature: "
        f"{missing_columns}. Make sure they are not dropped in an earlier cell."
    )

# Drop rows with NaN values in the selected columns. The explicit .copy()
# yields an independent frame, so the column assignments below do not write
# to a slice of arrests_df (which raises SettingWithCopyWarning).
arrests_df_clean = arrests_df.dropna(subset=columns_to_combine).copy()

# Encode every non-numeric column as integer labels. Testing for "not
# numeric" (rather than dtype == 'object') also covers string/category dtypes.
label_encoders = {}
for col in columns_to_combine:
    if not pd.api.types.is_numeric_dtype(arrests_df_clean[col]):
        le = LabelEncoder()
        arrests_df_clean[col] = le.fit_transform(arrests_df_clean[col])
        label_encoders[col] = le  # kept so the encoding can be inverted later

# Extract the relevant data for PCA
X = arrests_df_clean[columns_to_combine]

# Standardize the features so each contributes on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce to 1 principal component
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_scaled)

# Add the PCA result to the DataFrame before dropping other columns.
# X_pca has one row per row of arrests_df_clean, in the same order.
arrests_df_clean['Composite_Offense'] = X_pca[:, 0]

# Print the explained variance ratio for the component
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance by Composite Feature: {explained_variance[0]:.2f}")

# Now drop the source columns; only the composite score is kept
arrests_df_clean = arrests_df_clean.drop(columns=columns_to_combine)

# Save the cleaned DataFrame to a CSV file with the PCA column included
arrests_df_clean.to_csv('cleaned_arrests_data_pca_applied.csv', index=False)

# Print the first few rows to confirm changes
arrests_df_clean.head()

In [None]:
# Interpretation: an explained variance of 44% means the single composite
# score accounts for nearly half of the variability across the features that
# were combined. That is a reasonable share for one principal component:
# the features are likely somewhat correlated, yet each may still carry a
# fair amount of information of its own.
# (The exact ratio is printed by the PCA cell; revise this note if it changes.)

# Read the saved CSV back in and preview it.
pca_output_path = 'cleaned_arrests_data_pca_applied.csv'
cleaned_arrests_data_pca_applied_df = pd.read_csv(pca_output_path)
cleaned_arrests_data_pca_applied_df.head()