In [2]:
import pandas as pd

# Absolute location of the raw gun-violence CSV on the author's machine
file_path = r"C:\Users\C SaiVishwanath\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Advanced Analytics & Dashboard Design\Raw Data\gun-violence.csv"

# Read the CSV into a DataFrame using pandas' default parsing options
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Check the structure and data types.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239677 entries, 0 to 239676
Data columns (total 29 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   incident_id                  239677 non-null  int64  
 1   date                         239677 non-null  object 
 2   state                        239677 non-null  object 
 3   city_or_county               239677 non-null  object 
 4   address                      223180 non-null  object 
 5   n_killed                     239677 non-null  int64  
 6   n_injured                    239677 non-null  int64  
 7   incident_url                 239677 non-null  object 
 8   source_url                   239209 non-null  object 
 9   incident_url_fields_missing  239677 non-null  bool   
 10  congressional_district       227733 non-null  float64
 11  gun_stolen                   140179 non-null  object 
 12  gun_type                     140226 non-null  object 
 13 

In [6]:
# Tally the null entries of every column and display the per-column totals
missing_values = data.isna().sum()
print(missing_values)

incident_id                         0
date                                0
state                               0
city_or_county                      0
address                         16497
n_killed                            0
n_injured                           0
incident_url                        0
source_url                        468
incident_url_fields_missing         0
congressional_district          11944
gun_stolen                      99498
gun_type                        99451
incident_characteristics          326
latitude                         7923
location_description           197588
longitude                        7923
n_guns_involved                 99451
notes                           81017
participant_age                 92298
participant_age_group           42119
participant_gender              36362
participant_name               122253
participant_relationship       223903
participant_status              27626
participant_type                24863
sources     

In [7]:
# Columns removed outright because they are sparsely populated.
# NOTE(review): per the null counts printed earlier, participant_relationship
# and location_description are more than 80% missing, but participant_name is
# only about 51% missing (122253 of 239677 rows) -- confirm it should be dropped.
columns_to_drop = ['participant_relationship', 'location_description', 'participant_name']

# Categorical columns whose gaps are replaced by the literal label "Unknown"
categorical_fill = ['gun_stolen', 'gun_type', 'participant_age_group', 'participant_gender']

# Numerical columns whose gaps are replaced by that column's own median
numerical_fill = ['latitude', 'longitude', 'n_guns_involved']

# Start from the raw frame minus the dropped columns, then swap in the filled
# versions of each column in one step (assign keeps the existing column order).
data_cleaned = data.drop(columns=columns_to_drop)
data_cleaned = data_cleaned.assign(
    **{name: data_cleaned[name].fillna('Unknown') for name in categorical_fill},
    **{name: data_cleaned[name].fillna(data_cleaned[name].median()) for name in numerical_fill},
)

# Show the per-column null counts that remain after this cleaning pass
print(data_cleaned.isna().sum())

incident_id                        0
date                               0
state                              0
city_or_county                     0
address                        16497
n_killed                           0
n_injured                          0
incident_url                       0
source_url                       468
incident_url_fields_missing        0
congressional_district         11944
gun_stolen                         0
gun_type                           0
incident_characteristics         326
latitude                           0
longitude                          0
n_guns_involved                    0
notes                          81017
participant_age                92298
participant_age_group              0
participant_gender                 0
participant_status             27626
participant_type               24863
sources                          609
state_house_district           38772
state_senate_district          32335
dtype: int64


In [8]:
# Parse the 'date' strings into pandas datetimes and store them back in the column
data_cleaned['date'] = data_cleaned['date'].pipe(pd.to_datetime)

# Print the first rows so the new dtype can be checked by eye
print(data_cleaned['date'].head())

0   2013-01-01
1   2013-01-01
2   2013-01-01
3   2013-01-05
4   2013-01-07
Name: date, dtype: datetime64[ns]


In [10]:
# Re-display the per-column null counts of the partially cleaned frame
print(data_cleaned.isna().sum())

incident_id                        0
date                               0
state                              0
city_or_county                     0
address                        16497
n_killed                           0
n_injured                          0
incident_url                       0
source_url                       468
incident_url_fields_missing        0
congressional_district         11944
gun_stolen                         0
gun_type                           0
incident_characteristics         326
latitude                           0
longitude                          0
n_guns_involved                    0
notes                          81017
participant_age                92298
participant_age_group              0
participant_gender                 0
participant_status             27626
participant_type               24863
sources                          609
state_house_district           38772
state_senate_district          32335
dtype: int64


In [11]:
# Text-like columns: gaps become the literal label "Unknown"
text_columns = ['address', 'source_url', 'incident_characteristics', 'notes', 'participant_status', 'participant_type']

# District-number columns: gaps become -1, used here as a placeholder value
numerical_columns = ['congressional_district', 'state_house_district', 'state_senate_district']

# Map every column to its placeholder (text columns first, then numerical ones)
# and fill them one after another in that order.
placeholders = {
    **dict.fromkeys(text_columns, 'Unknown'),
    **dict.fromkeys(numerical_columns, -1),
}
for col, placeholder in placeholders.items():
    data_cleaned[col] = data_cleaned[col].fillna(placeholder)

# Show the per-column null counts after filling
print(data_cleaned.isna().sum())

incident_id                        0
date                               0
state                              0
city_or_county                     0
address                            0
n_killed                           0
n_injured                          0
incident_url                       0
source_url                         0
incident_url_fields_missing        0
congressional_district             0
gun_stolen                         0
gun_type                           0
incident_characteristics           0
latitude                           0
longitude                          0
n_guns_involved                    0
notes                              0
participant_age                92298
participant_age_group              0
participant_gender                 0
participant_status                 0
participant_type                   0
sources                          609
state_house_district               0
state_senate_district              0
dtype: int64


In [12]:
# Count nulls per column, then show only the columns that still have some
remaining_missing = data_cleaned.isna().sum()
print(remaining_missing.loc[remaining_missing.gt(0)])


participant_age    92298
sources              609
dtype: int64


In [13]:
# Replace null entries of 'sources' with the "Unknown" placeholder label
data_cleaned['sources'] = data_cleaned['sources'].fillna(value='Unknown')

# Print how many nulls are left in 'sources'
print(data_cleaned['sources'].isna().sum())


0


In [14]:
# Inspect participant_age: print its dtype, then its first ten values
age_column = data_cleaned['participant_age']
print(age_column.dtype)
print(age_column.head(10))

object
0                                    0::20
1                                    0::20
2        0::25||1::31||2::33||3::34||4::33
3               0::29||1::33||2::56||3::33
4               0::18||1::46||2::14||3::47
5               0::23||1::23||2::33||3::55
6    0::51||1::40||2::9||3::5||4::2||5::15
7                                      NaN
8                                      NaN
9                                    0::15
Name: participant_age, dtype: object


In [15]:
# Remove participant_age from the frame. The sample printed earlier shows its
# values are packed strings such as "0::25||1::31", and it still contains nulls.
data_cleaned = data_cleaned.drop(columns='participant_age')

# List the remaining column names to confirm the column is gone
print(data_cleaned.columns)

Index(['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude', 'longitude',
       'n_guns_involved', 'notes', 'participant_age_group',
       'participant_gender', 'participant_status', 'participant_type',
       'sources', 'state_house_district', 'state_senate_district'],
      dtype='object')


In [18]:
import os

# Output folder for the cleaned data (absolute path on the author's machine)
directory_path = r"C:\Users\C SaiVishwanath\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Advanced Analytics & Dashboard Design\Cleaned Data"

# Make sure the folder exists; exist_ok avoids an error when it already does
os.makedirs(directory_path, exist_ok=True)

# Write the cleaned frame to CSV in that folder, leaving out the index column
cleaned_file_path = os.path.join(directory_path, "gun-violence-cleaned.csv")
data_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)

print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!
