finding the missing values:


In [70]:
import numpy as np
import pandas as pd

df=pd.read_csv("womencrimes.csv")
rows,cols=df.shape
print("No of Rows: ",rows)
print("No of Columns: ",cols)

print(df.isnull().sum())

No of Rows:  2765
No of Columns:  17
index                                                                                                      0
Area_Name                                                                                                  0
Year                                                                                                       0
Group_Name                                                                                                 0
Sub_Group_Name                                                                                             0
Persons_Acquitted                                                                                          0
Persons_against_whom_cases_Compounded_or_Withdrawn                                                         0
Persons_Arrested                                                                                           0
Persons_Chargesheeted                                                                      

Result analysed: no Missing values

step 2: analysing numerical and categorical values

In [71]:
numerical = df.select_dtypes(include=[np.number]).columns
print("numerical: ",numerical)
categorical = df.select_dtypes(exclude=[np.number]).columns
print("categorical: ",categorical)

numerical:  Index(['index', 'Year', 'Persons_Acquitted',
       'Persons_against_whom_cases_Compounded_or_Withdrawn',
       'Persons_Arrested', 'Persons_Chargesheeted', 'Persons_Convicted',
       'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning',
       'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_end',
       'Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End',
       'Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason',
       'Persons_Trial_Completed', 'Persons_under_Trial_at_Year_beginning',
       'Total_Persons_under_Trial'],
      dtype='object')
categorical:  Index(['Area_Name', 'Group_Name', 'Sub_Group_Name'], dtype='object')


In [72]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor



df = pd.read_csv('womencrimes.csv')

# Select numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns



# Label Encode categorical columns
le = LabelEncoder()
for col in categorical:
    df[col] = le.fit_transform(df[col])

# Scale numerical columns
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# ⚡ IMPORTANT CORRECTION ⚡
# Now correctly assign data not column names
X = df[numerical.tolist() + categorical.tolist()]   # Features are all numerical columns
y = df['Total_Persons_under_Trial']  # Target column (you can change to other target if needed)

# Split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = RandomForestRegressor()
model.fit(x_train, y_train)

# Get Feature Importances
importances = model.feature_importances_

# See feature importance
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance:.3f}")




index: 0.001
Year: 0.001
Persons_Acquitted: 0.002
Persons_against_whom_cases_Compounded_or_Withdrawn: 0.000
Persons_Arrested: 0.000
Persons_Chargesheeted: 0.000
Persons_Convicted: 0.000
Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning: 0.000
Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_end: 0.000
Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End: 0.018
Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason: 0.000
Persons_Trial_Completed: 0.000
Persons_under_Trial_at_Year_beginning: 0.041
Total_Persons_under_Trial: 0.936
Area_Name: 0.000
Group_Name: 0.000
Sub_Group_Name: 0.000


In [73]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(x_test)
# Evaluate
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 0.006817618793547741
RMSE: 0.005155368563523593
R² Score: 0.9962353547319237


evaluating the model again after removing the unwanted features based on feature importance

In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv('womencrimes.csv')

# Select numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns

# Label Encode categorical columns
le = LabelEncoder()
for col in categorical:
    df[col] = le.fit_transform(df[col])

# Scale numerical columns
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# Combine features
X = df[numerical.tolist() + categorical.tolist()]
y = df['Total_Persons_under_Trial']

# First Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train initial Random Forest
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

# Get feature importances
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\n🎯 Feature Importances:")
print(feature_importance_df)

# Keep only important features (Importance > 0.001 for example)
important_features = feature_importance_df[feature_importance_df['Importance'] > 0.001]['Feature'].tolist()

print("\n✅ Keeping these important features:")
print(important_features)

# Now only select important features
X_important = X[important_features]

# Train-Test Split again but on important features only
x_train_imp, x_test_imp, y_train_imp, y_test_imp = train_test_split(X_important, y, test_size=0.2, random_state=42)

# Retrain model
model_imp = RandomForestRegressor(random_state=42)
model_imp.fit(x_train_imp, y_train_imp)

# Predict
y_pred_imp = model_imp.predict(x_test_imp)

# Evaluate
print("\n📊 Model Performance After Removing Unimportant Features:")
print("MAE:", mean_absolute_error(y_test_imp, y_pred_imp))
print("RMSE:", mean_squared_error(y_test_imp, y_pred_imp))
print("R² Score:", r2_score(y_test_imp, y_pred_imp))



🎯 Feature Importances:
                                              Feature  Importance
13                          Total_Persons_under_Trial    0.938882
12              Persons_under_Trial_at_Year_beginning    0.033899
9   Persons_in_Custody_or_on_Bail_during_Trial_at_...    0.022130
8   Persons_in_Custody_or_on_Bail_during_Investiga...    0.001149
7   Persons_in_Custody_or_on_Bail_during_Investiga...    0.000661
2                                   Persons_Acquitted    0.000620
1                                                Year    0.000392
0                                               index    0.000359
6                                   Persons_Convicted    0.000344
5                               Persons_Chargesheeted    0.000341
3   Persons_against_whom_cases_Compounded_or_Withd...    0.000312
4                                    Persons_Arrested    0.000280
11                            Persons_Trial_Completed    0.000255
10  Persons_Released_or_Freed_by_Police_or_Magistr..

Your model became better at predicting.
✅ It became simpler (fewer features, less noise).
✅ It became more accurate (errors dropped, R² went up).

Optimization

In [75]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv('womencrimes.csv')

# Select numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns

# Label Encode categorical columns
le = LabelEncoder()
for col in categorical:
    df[col] = le.fit_transform(df[col])

# Scale numerical columns
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# Combine features
X = df[numerical.tolist() + categorical.tolist()]
y = df['Total_Persons_under_Trial']

# First Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train initial Random Forest
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

# Get feature importances
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\n🎯 Feature Importances:")
print(feature_importance_df)

# Keep only important features (Importance > 0.001 for example)
important_features = feature_importance_df[feature_importance_df['Importance'] > 0.005]['Feature'].tolist()

print("\n✅ Keeping these important features:")
print(important_features)

# Now only select important features
X_important = X[important_features]

# Train-Test Split again but on important features only
x_train_imp, x_test_imp, y_train_imp, y_test_imp = train_test_split(X_important, y, test_size=0.2, random_state=42)

# Retrain model
model_imp = RandomForestRegressor(random_state=42)
model_imp.fit(x_train_imp, y_train_imp)

# Predict
y_pred_imp = model_imp.predict(x_test_imp)

# Evaluate
print("\n📊 Model Performance After Removing Unimportant Features:")
print("MAE:", mean_absolute_error(y_test_imp, y_pred_imp))
print("RMSE:", mean_squared_error(y_test_imp, y_pred_imp))
print("R² Score:", r2_score(y_test_imp, y_pred_imp))



🎯 Feature Importances:
                                              Feature  Importance
13                          Total_Persons_under_Trial    0.938882
12              Persons_under_Trial_at_Year_beginning    0.033899
9   Persons_in_Custody_or_on_Bail_during_Trial_at_...    0.022130
8   Persons_in_Custody_or_on_Bail_during_Investiga...    0.001149
7   Persons_in_Custody_or_on_Bail_during_Investiga...    0.000661
2                                   Persons_Acquitted    0.000620
1                                                Year    0.000392
0                                               index    0.000359
6                                   Persons_Convicted    0.000344
5                               Persons_Chargesheeted    0.000341
3   Persons_against_whom_cases_Compounded_or_Withd...    0.000312
4                                    Persons_Arrested    0.000280
11                            Persons_Trial_Completed    0.000255
10  Persons_Released_or_Freed_by_Police_or_Magistr..

Increased threshold value for finding important features and no change in optimization

To Predict Conviction

In [76]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load data
df = pd.read_csv('womencrimes.csv')

# Select numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns

# Label Encode categorical columns
le = LabelEncoder()
for col in categorical:
    df[col] = le.fit_transform(df[col])

# Scale numerical columns
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# 🎯 Select Features related to arrest and trial
selected_features = [
    'Persons_Arrested',
    'Persons_Chargesheeted',
    'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning',
    'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_end',
    'Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End',
    'Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason',
    'Persons_Trial_Completed',
    'Persons_under_Trial_at_Year_beginning',
    'Total_Persons_under_Trial'
]

# Features and Target
X = df[selected_features]
y = df['Persons_Convicted']

# Split data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

# Evaluate
print("\n📊 Model Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# Optional: Feature Importances
importances = model.feature_importances_
print("\n🌟 Feature Importances:")
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance:.3f}")



📊 Model Performance:
MAE: 0.10249601644068261
RMSE: 0.38015812406927413
R² Score: 0.4770911910392851

🌟 Feature Importances:
Persons_Arrested: 0.022
Persons_Chargesheeted: 0.014
Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning: 0.024
Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_end: 0.020
Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End: 0.029
Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason: 0.047
Persons_Trial_Completed: 0.802
Persons_under_Trial_at_Year_beginning: 0.023
Total_Persons_under_Trial: 0.020


Dynamic Feature Selection for random Forest regressor to predict conviction

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load data
df = pd.read_csv('womencrimes.csv')

# Select numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns

# Label Encode categorical columns
final_le = LabelEncoder()
for col in categorical:
    df[col] = final_le.fit_transform(df[col])

# Scale numerical columns
final_scaler = StandardScaler()
df[numerical] = final_scaler.fit_transform(df[numerical])

# 🎯 Select Initial Features related to arrest and trial
initial_features = [
    'Persons_Arrested',
    'Persons_Chargesheeted',
    'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning',
    'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_end',
    'Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End',
    'Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason',
    'Persons_Trial_Completed',
    'Persons_under_Trial_at_Year_beginning',
    'Total_Persons_under_Trial'
]

X = df[initial_features]
y = df['Persons_Convicted']

# Step 1: Initial split and train
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

initial_model = RandomForestRegressor(random_state=42)
initial_model.fit(x_train, y_train)

# Feature importances
importances = initial_model.feature_importances_

# Create a dataframe to sort and view
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\n🌟 Initial Feature Importances:")
print(feature_importances)

# 🎯 Step 2: Select only important features (say Importance > 0.01)
important_features = feature_importances[feature_importances['Importance'] > 0.01]['Feature'].tolist()

print("\n🧠 Selected Important Features for Final Model:")
print(important_features)

# Prepare final data
X_final = df[important_features]

# Split again
x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Final Model Training
final_model = RandomForestRegressor(random_state=42)
final_model.fit(x_train_final, y_train_final)

# Final Prediction
y_pred_final = final_model.predict(x_test_final)

# Evaluation
print("\n📊 Final Model Performance (after feature selection):")
print("MAE:", mean_absolute_error(y_test_final, y_pred_final))
print("RMSE:", mean_squared_error(y_test_final, y_pred_final))
print("R² Score:", r2_score(y_test_final, y_pred_final))



🌟 Initial Feature Importances:
                                             Feature  Importance
6                            Persons_Trial_Completed    0.802167
5  Persons_Released_or_Freed_by_Police_or_Magistr...    0.046781
4  Persons_in_Custody_or_on_Bail_during_Trial_at_...    0.028560
2  Persons_in_Custody_or_on_Bail_during_Investiga...    0.024022
7              Persons_under_Trial_at_Year_beginning    0.023387
0                                   Persons_Arrested    0.021554
8                          Total_Persons_under_Trial    0.019797
3  Persons_in_Custody_or_on_Bail_during_Investiga...    0.019597
1                              Persons_Chargesheeted    0.014136

🧠 Selected Important Features for Final Model:
['Persons_Trial_Completed', 'Persons_Released_or_Freed_by_Police_or_Magistrate_before_Trial_for_want_of_evidence_or_any_other_reason', 'Persons_in_Custody_or_on_Bail_during_Trial_at_Year_End', 'Persons_in_Custody_or_on_Bail_during_Investigation_at_Year_beginning', 'Pers

In [85]:
import pickle as pkl

pkl.dump(final_model, open('models/final_model.pkl', 'wb'))
pkl.dump(final_scaler, open('models/final_scaler.pkl', 'wb'))
pkl.dump(final_le, open('models/final_encoder.pkl', 'wb'))
pkl.dump(important_features, open('models/final_important_features.pkl', 'wb'))
pkl.dump(feature_importances, open('models/final_feature_importances.pkl', 'wb'))

Inference:
 average error got slightly smaller
RMSE Decreased
→ This shows that big errors (outliers) reduced too.
R² Score Increased: Your model explains more of the real-world behavior after cleaning the features.

Multiple Linear regression to Predict the number of persons under trial after applying PCA

In [79]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load the Data
df = pd.read_csv('womencrimes.csv')

# 2. Preprocessing
# Separate numerical and categorical columns
numerical = df.select_dtypes(include=[np.number]).columns
categorical = df.select_dtypes(exclude=[np.number]).columns

# Label encode categorical features
le = LabelEncoder()
for col in categorical:
    df[col] = le.fit_transform(df[col])

# Scale the numerical features
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# 3. Set up Features (X) and Target (y)
X = df.drop('Total_Persons_under_Trial', axis=1)  # Drop target from features
y = df['Total_Persons_under_Trial']               # Target: Total persons under trial

# 4. Apply PCA to reduce dimensionality
# 👉 Keep 95% variance (you can also specify components manually)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

print(f"Original features: {X.shape[1]}")
print(f"Reduced features after PCA: {X_pca.shape[1]}")

# 5. Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# 6. Multiple Linear Regression
pca_model = LinearRegression()
pca_model.fit(x_train, y_train)

# 7. Prediction
pca_y_pred = pca_model.predict(x_test)

# 8. Evaluation
print("\n📊 Model Performance:")
print(f"MAE: {mean_absolute_error(y_test, pca_y_pred)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, pca_y_pred))}")
print(f"R² Score: {r2_score(y_test, pca_y_pred)}")


Original features: 16
Reduced features after PCA: 4

📊 Model Performance:
MAE: 0.12899603284023745
RMSE: 0.4232671165067254
R² Score: 0.8691741996451485


Inference:
✅ Random Forest with feature selection is performing MUCH better than
✅ Linear Regression after PCA.