In [27]:
# 📌 1. Import Libraries
# --------------------------------------------
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder

In [28]:
# 📌 2. Load Dataset
# --------------------------------------------
# Read the raw crime records from the CSV file (relative path).
df = pd.read_csv('../Data/dataset.csv')
print(f"Initial Shape: {df.shape}")
# Preview the first five rows as the cell's displayed result.
df.head(5)

Initial Shape: (3000, 5)


Unnamed: 0,Crime_Category,Crime_Severity_Level,Use_of_Weapon,Repeat_Offender,IPC_Section_and_Punishment
0,Murder,Minor,1,1,IPC 302 - Life Imprisonment
1,Assault,Moderate,0,1,IPC 324 - Up to 3 years
2,Assault,Moderate,1,1,IPC 324 - Up to 3 years
3,Fraud,Severe,0,1,IPC 420 - Up to 7 years
4,Assault,Severe,0,1,IPC 324 - Up to 3 years


In [29]:
# --------------------------------------------
# 📌 3. Handle Inconsistent Data
# --------------------------------------------

# Strip surrounding whitespace and title-case the label columns so that
# variants such as " murder" and "MURDER" collapse into a single category.
# Missing values are deliberately left as NaN: casting the whole column with
# astype(str) would turn them into the literal text "Nan", which the later
# isnull()/dropna() step could no longer detect.
for text_col in ['Crime_Category', 'Crime_Severity_Level']:
    raw_values = df[text_col]
    normalized = raw_values.astype(str).str.strip().str.title()
    # Keep the normalised text where a value is present; otherwise fall back
    # to the original (missing) entry.
    df[text_col] = normalized.where(raw_values.notna(), raw_values)

print("Unique Crime Categories:", df['Crime_Category'].unique())
print("Unique Severity Levels:", df['Crime_Severity_Level'].unique())


Unique Crime Categories: ['Murder' 'Assault' 'Fraud' 'Theft']
Unique Severity Levels: ['Minor' 'Moderate' 'Severe']


In [30]:
# --------------------------------------------
# 📌 4. DO NOT Drop Duplicates!
# --------------------------------------------

# ✅ Repeated rows are kept on purpose — the dataset should stay at 3000 rows.
# The de-duplication step is therefore intentionally left disabled:
# df = df.drop_duplicates()

print(f"Shape after NOT dropping duplicates: {df.shape}")


Shape after NOT dropping duplicates: (3000, 5)


In [31]:
# --------------------------------------------
# 📌 5. Handle Null Values
# --------------------------------------------

# Report the number of missing entries in each column
null_counts = df.isna().sum()
print(null_counts)

# Discard every row that contains a missing value (none are expected)
df = df.dropna(axis=0, how='any')
print(f"Shape after dropping nulls (if any): {df.shape}")


Crime_Category                0
Crime_Severity_Level          0
Use_of_Weapon                 0
Repeat_Offender               0
IPC_Section_and_Punishment    0
dtype: int64
Shape after dropping nulls (if any): (3000, 5)


In [32]:
# --------------------------------------------
# 📌 6. Remove Outliers (basic, for numeric binary fields)
# --------------------------------------------

# For 0/1 flag columns the only meaningful "outlier" is a value outside {0, 1}.
# A z-score threshold is not suitable for such columns:
#   * a constant column has zero standard deviation, so its z-scores are NaN,
#     `NaN < 3` is False and every row would be dropped;
#   * a perfectly valid value that occurs in fewer than ~10% of the rows has
#     |z| > 3 and would be removed as if it were an error.
# So the rows are validated against the allowed values instead.
numeric_cols = ['Use_of_Weapon', 'Repeat_Offender']
valid_binary_rows = df[numeric_cols].isin([0, 1]).all(axis=1)
# .copy() gives an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df = df[valid_binary_rows].copy()
print("Shape after outlier removal:", df.shape)


Shape after outlier removal: (3000, 5)


In [33]:
# --------------------------------------------
# 📌 7. Feature Engineering
# --------------------------------------------

# Example feature: 1 when the crime is rated 'Severe' AND a weapon was used,
# 0 in every other case.
is_severe = df['Crime_Severity_Level'].eq('Severe')
weapon_used = df['Use_of_Weapon'].eq(1)
df['Severity_Weapon_Flag'] = np.where(is_severe & weapon_used, 1, 0)

# Show the new flag next to the columns it is derived from
preview_cols = ['Crime_Category', 'Crime_Severity_Level', 'Use_of_Weapon', 'Severity_Weapon_Flag']
print(df[preview_cols].head())


  Crime_Category Crime_Severity_Level  Use_of_Weapon  Severity_Weapon_Flag
0         Murder                Minor              1                     0
1        Assault             Moderate              0                     0
2        Assault             Moderate              1                     0
3          Fraud               Severe              0                     0
4        Assault               Severe              0                     0


In [34]:
# --------------------------------------------
# 📌 8. Encode Categorical Variables
# --------------------------------------------

# One dedicated encoder per column, so each fitted mapping stays available
# under its own name after this cell has run.
le_category, le_severity, le_target = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each text column with the integer codes produced by its encoder
column_encoders = (
    ('Crime_Category', le_category),
    ('Crime_Severity_Level', le_severity),
    ('IPC_Section_and_Punishment', le_target),
)
for column_name, encoder in column_encoders:
    df[column_name] = encoder.fit_transform(df[column_name])

print(df.head())


   Crime_Category  Crime_Severity_Level  Use_of_Weapon  Repeat_Offender  \
0               2                     0              1                1   
1               0                     1              0                1   
2               0                     1              1                1   
3               1                     2              0                1   
4               0                     2              0                1   

   IPC_Section_and_Punishment  Severity_Weapon_Flag  
0                           0                     0  
1                           1                     0  
2                           1                     0  
3                           3                     0  
4                           1                     0  


In [35]:
# --------------------------------------------
# 📌 9. Data Reduction
# --------------------------------------------

# Every column is kept, so no drop is performed.
# Template for removing a column, should one become unnecessary:
# df = df.drop(['Unwanted_Column'], axis=1)

print(f"✅ Final shape: {df.shape}")


✅ Final shape: (3000, 6)


In [36]:
# --------------------------------------------
# 📌 10. Save Preprocessed Data to CSV
# --------------------------------------------

# Write the processed frame without the pandas row index
output_csv = '../Data/preprocessed_crime_dataset.csv'
df.to_csv(output_csv, index=False)
print(f"✅ Preprocessed dataset saved at {output_csv}")


✅ Preprocessed dataset saved at ../Data/preprocessed_crime_dataset.csv


In [39]:
# ================================
# 1️⃣  IMPORTS
# ================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# ================================
# 2️⃣  LOAD RAW DATA
# ================================
# The CSV location is relative; change it to match your folder layout
df = pd.read_csv('../Data/dataset.csv')
print("✅ Original shape: {}".format(df.shape))

# ================================
# 3️⃣  DATA CLEANING
# ================================

# 3.1 Remove exact duplicate rows
# NOTE(review): an earlier cell of this notebook deliberately keeps duplicates
# to preserve all 3000 rows, while this step leaves 48 rows in the recorded
# output — confirm that de-duplicating here is really intended.
df = df.drop_duplicates()
print("✅ After removing duplicates: {}".format(df.shape))

# 3.2 Remove rows that contain any missing value
df = df.dropna()
print("✅ After dropping nulls: {}".format(df.shape))

# 3.3 Optional: outlier handling
# Not applied: the features are categorical, so outliers are not typical here.
# For genuinely numeric columns a z-score or IQR based filter could be added.

# ================================
# 4️⃣  ENCODING TARGET VARIABLE
# ================================
le = LabelEncoder()
target_codes = le.fit_transform(df['IPC_Section_and_Punishment'])
df['IPC_Section_and_Punishment'] = target_codes

# ✅ Show the learned class order so the integer codes can be checked
print("✅ LabelEncoder classes_: {}".format(le.classes_))

# ✅ Persist the fitted encoder so the Flask app can load it
joblib.dump(le, '../App/model/label_encoder.pkl')
print("✅ LabelEncoder saved as: App/model/label_encoder.pkl")

# ================================
# 5️⃣  FEATURE ENGINEERING
# ================================
# Normalise the label text first (trim whitespace, title-case) so that the
# 'Severe' comparison and the dictionary lookups below do not miss variants
# such as ' severe' or 'MURDER'.
severity_labels = df['Crime_Severity_Level'].astype(str).str.strip().str.title()
category_labels = df['Crime_Category'].astype(str).str.strip().str.title()

# Example: Add Severity_Weapon_Flag (1 when a 'Severe' crime involved a weapon)
df['Severity_Weapon_Flag'] = np.where(
    (severity_labels == 'Severe') & (df['Use_of_Weapon'] == 1),
    1,
    0
)

# Fixed label -> integer codes for the two categorical features
severity_map = {'Minor': 0, 'Moderate': 1, 'Severe': 2}
category_map = {'Theft': 0, 'Murder': 1, 'Assault': 2, 'Fraud': 3}

# Series.map() turns every label that is missing from the dictionary into NaN
# without any warning, so fail loudly instead of saving a corrupted column.
unknown_severity = sorted(set(severity_labels) - set(severity_map))
unknown_category = sorted(set(category_labels) - set(category_map))
if unknown_severity or unknown_category:
    raise ValueError(
        f"Unmapped labels found - severity: {unknown_severity}, "
        f"category: {unknown_category}"
    )

df['Crime_Severity_Level'] = severity_labels.map(severity_map)
df['Crime_Category'] = category_labels.map(category_map)

# ================================
# 6️⃣  SAVE FINAL CLEANED DATA
# ================================
# Write the cleaned frame without the row index, then report its size and
# show a short preview.
df.to_csv('../Data/preprocessed_dataset.csv', index=False)
print("✅ Final preprocessed dataset saved: {}".format(df.shape))
print(df.head(5))


✅ Original shape: (3000, 5)
✅ After removing duplicates: (48, 5)
✅ After dropping nulls: (48, 5)
✅ LabelEncoder classes_: ['IPC 302 - Life Imprisonment' 'IPC 324 - Up to 3 years'
 'IPC 379 - Up to 3 years' 'IPC 420 - Up to 7 years']
✅ LabelEncoder saved as: App/model/label_encoder.pkl
✅ Final preprocessed dataset saved: (48, 6)
   Crime_Category  Crime_Severity_Level  Use_of_Weapon  Repeat_Offender  \
0               1                     0              1                1   
1               2                     1              0                1   
2               2                     1              1                1   
3               3                     2              0                1   
4               2                     2              0                1   

   IPC_Section_and_Punishment  Severity_Weapon_Flag  
0                           0                     0  
1                           1                     0  
2                           1                     0  
3  