In [3]:
# ==========================================
# 1️⃣ IMPORTS & DATA LOADING
# ==========================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

# Load preprocessed data.
# The path is relative to the notebook's working directory and points at the
# same "../dataset" folder that the final export cell writes to, so the
# notebook runs on any checkout of the project instead of only on one
# machine's Desktop.
df = pd.read_csv("../dataset/preprocessed_data.csv")

print("Data Loaded Successfully!")


Data Loaded Successfully!


In [4]:
# Preview the first five rows to confirm the expected columns were loaded.
df.head(n=5)


Unnamed: 0,nm_pol,area,rape,gangrape,sexual_harassment,totarea,lat,long,totalcrime,crime_per_area,hybrid_score,risk_score,risk_cluster,risk_level
0,CHITRANJAN PARK,2.65933,6,1,7,2659329.537,28.53632,77.2492,512,192.529731,22.426752,23.053469,0,Very Low
1,DABRI,3.401013,28,0,16,3401013.428,28.61268,77.086,397,116.729912,77.25816,79.417141,1,High
2,MALVIYA NAGAR,1.379854,28,1,15,1379853.572,28.52989,77.20418,837,606.586102,76.42685,78.5626,1,High
3,CHANDNI MAHAL,5.570696,8,1,7,5570696.132,28.64361,77.23608,588,105.552338,26.269507,27.003609,2,Low
4,MODEL TOWN,2.689157,4,1,14,2689157.085,28.70257,77.19369,466,173.288501,28.847573,29.653719,2,Low


In [5]:
# ==========================================
# 2️⃣ DEFINE WOMEN-CRIME FEATURES
# ==========================================

# Count columns for crimes against women; later cells use them both as
# model inputs and as the terms of the hybrid score.
women_features = [
    'rape',
    'gangrape',
    'sexual_harassment',
]

# Column the regression model is fitted against.
target = 'totalcrime'

# Echo the configuration so the run output records what was used.
print("Women-related features:", women_features)
print("Prediction baseline:", target)


Women-related features: ['rape', 'gangrape', 'sexual_harassment']
Prediction baseline: totalcrime


In [6]:
# ==========================================
# 3️⃣ ASSIGN DOMAIN SEVERITY WEIGHTS
# ==========================================

# Hand-assigned multiplier per crime type; a larger value makes each
# recorded incident of that type count more in the hybrid score.
severity_weights = dict(
    rape=5,
    gangrape=5,
    sexual_harassment=3,
)

print("Severity Weights Set:", severity_weights)


Severity Weights Set: {'rape': 5, 'gangrape': 5, 'sexual_harassment': 3}


In [7]:
# ==========================================
# 4️⃣ RANDOM FOREST FEATURE IMPORTANCE (DATA-DRIVEN)
# ==========================================

# Predictors are the women-crime count columns; the response is the column
# named by `target`.
X, y = df[women_features], df[target]

# Default hyper-parameters with a fixed random_state. `fit` returns the
# estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(random_state=42).fit(X, y)

# Pair every feature name with its importance, in `women_features` order.
importance_scores = rf_model.feature_importances_
importance_dict = {
    name: score for name, score in zip(women_features, importance_scores)
}

print("\nüîç ML-driven Feature Importances:")
for k, v in importance_dict.items():
    print(f"{k}: {v:.4f}")



üîç ML-driven Feature Importances:
rape: 0.3843
gangrape: 0.1270
sexual_harassment: 0.4887


In [8]:
# ==========================================
# 5️⃣ HYBRID RISK SCORE CALCULATION
# ==========================================

# hybrid_score = sum over features of
#     incident count * domain severity weight * random-forest importance
# Python's sum() starts from 0, so this matches initialising the column to 0
# and adding one feature term at a time.
df['hybrid_score'] = sum(
    df[col] * severity_weights[col] * importance_dict[col]
    for col in women_features
)

# Preview the score beside the station name and area.
df.loc[:, ['nm_pol', 'area', 'hybrid_score']].head()


Unnamed: 0,nm_pol,area,hybrid_score
0,CHITRANJAN PARK,2.65933,22.426752
1,DABRI,3.401013,77.25816
2,MALVIYA NAGAR,1.379854,76.42685
3,CHANDNI MAHAL,5.570696,26.269507
4,MODEL TOWN,2.689157,28.847573


In [9]:
# ==========================================
# 6️⃣ NORMALIZE SCORE (0–100) FOR RANKING
# ==========================================

# Rescale hybrid_score linearly so its minimum maps to 0 and its maximum to 100.
scaler = MinMaxScaler(feature_range=(0, 100))

# fit_transform returns an (n_rows, 1) array for the single input column;
# take that one column as a flat vector before storing it.
df['risk_score'] = scaler.fit_transform(df[['hybrid_score']])[:, 0]

# Show the first ten stations with their normalised score.
df.loc[:, ['nm_pol', 'area', 'risk_score']].head(10)


Unnamed: 0,nm_pol,area,risk_score
0,CHITRANJAN PARK,2.65933,23.053469
1,DABRI,3.401013,79.417141
2,MALVIYA NAGAR,1.379854,78.5626
3,CHANDNI MAHAL,5.570696,27.003609
4,MODEL TOWN,2.689157,29.653719
5,ANANDVIHAR,4.55897,50.258961
6,KASHMERE GATE,1.62791,37.819987
7,GOVIND PURI,8.992712,17.394245
8,BINDAPUR,2.56808,20.610527
9,NEW FRIENDS COLONY,4.045387,29.550135


Risk classification


In [12]:
# ==========================================
# 8️⃣ RISK CLASSIFICATION USING K-MEANS
# ==========================================

from sklearn.cluster import KMeans

# Risk labels ordered from lowest to highest risk. The number of clusters is
# taken from this list, so changing the number of risk bands only means
# editing the list (e.g. ['Low', 'Medium', 'High'] for 3 bands). The cluster
# count and the label mapping can no longer drift apart.
risk_labels = ['Very Low', 'Low', 'Medium', 'High']

# Prepare data for clustering: K-Means expects a 2-D input, hence the
# single-column frame (copied so the estimator never sees a view of df).
score_data = df[['risk_score']].copy()

# One cluster per risk label; fixed random_state for repeatable assignments.
kmeans = KMeans(n_clusters=len(risk_labels), random_state=42, n_init='auto')
df['risk_cluster'] = kmeans.fit_predict(score_data)

# K-Means cluster ids are arbitrary, so rank the clusters by their mean
# risk_score (ascending) so that labels follow severity.
cluster_order = (
    df.groupby('risk_cluster')['risk_score']
    .mean()
    .sort_values()
    .index
    .tolist()
)

# If fewer clusters were populated than there are labels, pairing them up
# would silently mislabel the top band. Stop instead.
if len(cluster_order) != len(risk_labels):
    raise ValueError(
        f"Expected {len(risk_labels)} populated clusters, "
        f"got {len(cluster_order)}; cannot assign risk labels."
    )

# Lowest-mean cluster gets the first label, highest-mean cluster the last.
label_map = dict(zip(cluster_order, risk_labels))

# Assign final risk level
df['risk_level'] = df['risk_cluster'].map(label_map)

df[['nm_pol', 'area', 'risk_score', 'risk_level']].head(10)


Unnamed: 0,nm_pol,area,risk_score,risk_level
0,CHITRANJAN PARK,2.65933,23.053469,Very Low
1,DABRI,3.401013,79.417141,High
2,MALVIYA NAGAR,1.379854,78.5626,High
3,CHANDNI MAHAL,5.570696,27.003609,Low
4,MODEL TOWN,2.689157,29.653719,Low
5,ANANDVIHAR,4.55897,50.258961,Medium
6,KASHMERE GATE,1.62791,37.819987,Low
7,GOVIND PURI,8.992712,17.394245,Very Low
8,BINDAPUR,2.56808,20.610527,Very Low
9,NEW FRIENDS COLONY,4.045387,29.550135,Low


In [13]:
# Write the fully scored and classified table to the dataset folder;
# index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf="../dataset/final_women_safety_data.csv",
    index=False,
)
print("‚úî final_women_safety_data.csv stored successfully!")


‚úî final_women_safety_data.csv stored successfully!
