In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import re
import warnings
warnings.filterwarnings('ignore')

print("🚀 Starting Crime Classification Pipeline...")
print("=" * 50)

🚀 Starting Crime Classification Pipeline...


In [2]:
def load_and_merge_csv_files(folder_path):
    """
    Read all CSV files from the specified folder and merge them into one DataFrame
    """
    print(f"📂 Loading CSV files from: {folder_path}")
    
    # Get all CSV files in the crime folder
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    print(f"📄 Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"   - {os.path.basename(file)}")
    
    # Read and combine all CSV files
    dataframes = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            print(f"   ✅ Loaded {file}: {len(df)} rows")
            dataframes.append(df)
        except Exception as e:
            print(f"   ❌ Error loading {file}: {e}")
    
    # Merge all dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"🔗 Combined dataset shape: {combined_df.shape}")
    
    return combined_df

# Load the data - Update this path to match your system
crime_folder = r"C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime"
df = load_and_merge_csv_files(crime_folder)

# Display basic information about the dataset
print("\n📊 Dataset Overview:")
print(f"Total samples: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Missing values:\n{df.isnull().sum()}")

# Check if required columns exist
required_columns = ['STATE/UT', 'DISTRICT']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Required column '{col}' not found in dataset")

# Display label distribution
print(f"\n🏷️ Crime Category Distribution:")
print(df['DISTRICT'].value_counts())

📂 Loading CSV files from: C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime
📄 Found 57 CSV files:
   - 01_District_wise_crimes_committed_IPC_2001_2012.csv
   - 01_District_wise_crimes_committed_IPC_2013.csv
   - 01_District_wise_crimes_committed_IPC_2014.csv
   - 02_01_District_wise_crimes_committed_against_SC_2001_2012.csv
   - 02_01_District_wise_crimes_committed_against_SC_2013.csv
   - 02_01_District_wise_crimes_committed_against_SC_2014.csv
   - 02_District_wise_crimes_committed_against_ST_2001_2012.csv
   - 02_District_wise_crimes_committed_against_ST_2013.csv
   - 02_District_wise_crimes_committed_against_ST_2014.csv
   - 03_District_wise_crimes_committed_against_children_2001_2012.csv
   - 03_District_wise_crimes_committed_against_children_2013.csv
   - 03_Persons_arrested_and_their_disposal_by_police_and_court_under_crime_against_children_2012.csv
   - 03_Persons_arrested_and_their_disposal_by_police_and_court_under_crime_against_children_2013.csv
   - 03_Persons_arres

In [3]:
def clean_text(text):
    """
    Clean and preprocess text data
    """
    if pd.isna(text):
        return ""
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove special characters and digits (keep only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

print("\n🧹 Cleaning and preprocessing text data...")

# Clean the text column
df['text_cleaned'] = df['STATE/UT'].apply(clean_text)

# Remove empty text entries
df = df[df['text_cleaned'].str.len() > 0].reset_index(drop=True)

print(f"✅ After cleaning: {len(df)} samples remaining")

# Display sample data
print("\n📝 Sample data after cleaning:")
print(df[['text_cleaned', 'DISTRICT']].head(3).to_string())




🧹 Cleaning and preprocessing text data...
✅ After cleaning: 83417 samples remaining

📝 Sample data after cleaning:
     text_cleaned   DISTRICT
0  andhra pradesh   ADILABAD
1  andhra pradesh  ANANTAPUR
2  andhra pradesh   CHITTOOR


In [4]:
# Crime Classification using TF-IDF and Logistic Regression
# Complete script for data.ipynb

import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import re
import warnings
warnings.filterwarnings('ignore')

print("🚀 Starting Crime Classification Pipeline...")
print("=" * 50)

# ============================================================================
# STEP 1: READ AND MERGE ALL CSV FILES FROM crime/ FOLDER
# ============================================================================

def load_and_merge_csv_files(folder_path):
    """
    Read all CSV files from the specified folder and merge them into one DataFrame
    """
    print(f"📂 Loading CSV files from: {folder_path}")
    
    # Get all CSV files in the crime folder
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    print(f"📄 Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"   - {os.path.basename(file)}")
    
    # Read and combine all CSV files
    dataframes = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            print(f"   ✅ Loaded {file}: {len(df)} rows")
            dataframes.append(df)
        except Exception as e:
            print(f"   ❌ Error loading {file}: {e}")
    
    # Merge all dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"🔗 Combined dataset shape: {combined_df.shape}")
    
    return combined_df

# Load the data - Update this path to match your system
crime_folder = r"C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime"
df = load_and_merge_csv_files(crime_folder)

# Display basic information about the dataset
print("\n📊 Dataset Overview:")
print(f"Total samples: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Missing values:\n{df.isnull().sum()}")

# Check if required columns exist
required_columns = ['STATE/UT', 'DISTRICT']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Required column '{col}' not found in dataset")

# Display label distribution
print(f"\n🏷️ Crime Category Distribution:")
print(df['DISTRICT'].value_counts())

# ============================================================================
# STEP 2: DATA PREPROCESSING AND CLEANING
# ============================================================================

def clean_text(text):
    """
    Clean and preprocess text data
    """
    if pd.isna(text):
        return ""
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove special characters and digits (keep only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

print("\n🧹 Cleaning and preprocessing text data...")

# Clean the text column
df['text_cleaned'] = df['STATE/UT'].apply(clean_text)

# Remove rows with missing values in key columns
print(f"🔍 Before cleaning - Total samples: {len(df)}")
print(f"Missing values in STATE/UT: {df['STATE/UT'].isnull().sum()}")
print(f"Missing values in DISTRICT: {df['DISTRICT'].isnull().sum()}")

# Drop rows where either STATE/UT or DISTRICT is missing
df = df.dropna(subset=['STATE/UT', 'DISTRICT']).reset_index(drop=True)

# Remove empty text entries after cleaning
df = df[df['text_cleaned'].str.len() > 0].reset_index(drop=True)

print(f"✅ After cleaning: {len(df)} samples remaining")

# Check if we still have enough data
if len(df) == 0:
    raise ValueError("No valid data remaining after cleaning!")

# Check class distribution after cleaning
print(f"📊 Districts after cleaning: {df['DISTRICT'].nunique()} unique districts")
district_counts = df['DISTRICT'].value_counts()
print(f"District distribution:\n{district_counts}")

# Filter out districts with very few samples (less than 2) to avoid stratify issues
min_samples = 2
districts_to_keep = district_counts[district_counts >= min_samples].index
df = df[df['DISTRICT'].isin(districts_to_keep)].reset_index(drop=True)

print(f"📊 After filtering rare districts: {len(df)} samples with {df['DISTRICT'].nunique()} districts")

# Display sample data
print("\n📝 Sample data after cleaning:")
print(df[['text_cleaned', 'DISTRICT']].head(3).to_string())

# ============================================================================
# STEP 3: SPLIT DATASET INTO TRAINING AND TESTING SETS
# ============================================================================

print("\n✂️ Splitting dataset into training and testing sets...")

# Prepare features (X) and target (y)
X = df['text_cleaned']  # Using STATE/UT as the text feature
y = df['DISTRICT']      # Using DISTRICT as the target label

# Verify no missing values remain
print(f"🔍 Final check - Missing values in X: {X.isnull().sum()}")
print(f"🔍 Final check - Missing values in y: {y.isnull().sum()}")

# Check if we have enough samples for each class for stratified split
y_counts = y.value_counts()
min_class_count = y_counts.min()
print(f"📊 Minimum samples per district: {min_class_count}")

# Adjust stratify parameter based on minimum class count
if min_class_count >= 2:
    stratify_param = y
    print("✅ Using stratified split")
else:
    stratify_param = None
    print("⚠️ Using random split (some districts have too few samples for stratified split)")

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=stratify_param  # Use stratified split only if possible
)

print(f"📈 Training set size: {len(X_train)} samples")
print(f"📉 Testing set size: {len(X_test)} samples")
print(f"🎯 Number of unique districts: {len(y.unique())}")

# ============================================================================
# STEP 4: TEXT VECTORIZATION USING TF-IDF
# ============================================================================

print("\n🔤 Vectorizing text data using TF-IDF...")

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=10000,     # Limit to top 10,000 features
    ngram_range=(1, 2),     # Use both unigrams and bigrams
    min_df=2,               # Ignore terms that appear in less than 2 documents
    max_df=0.95,            # Ignore terms that appear in more than 95% of documents
    stop_words='english'    # Remove common English stop words
)

# Fit and transform training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform test data (don't fit again!)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF vectorization completed")
print(f"📊 Training features shape: {X_train_tfidf.shape}")
print(f"📊 Testing features shape: {X_test_tfidf.shape}")
print(f"📚 Vocabulary size: {len(tfidf.vocabulary_)}")

# ============================================================================
# STEP 5: TRAIN LOGISTIC REGRESSION CLASSIFIER
# ============================================================================

print("\n🤖 Training Logistic Regression classifier...")

# Initialize and train the classifier
lr_classifier = LogisticRegression(
    random_state=42,
    max_iter=1000,          # Increase max iterations for convergence
    C=1.0                   # Regularization strength
)

# Train the model
lr_classifier.fit(X_train_tfidf, y_train)

print("✅ Model training completed!")

# ============================================================================
# STEP 6: MODEL EVALUATION
# ============================================================================

print("\n📈 Evaluating model performance...")

# Make predictions on test set
y_pred = lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Generate detailed classification report
print(f"\n📋 Detailed Classification Report:")
print("=" * 60)
print(classification_report(y_test, y_pred))

# Display confusion matrix
print(f"\n🔍 Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Show prediction probabilities for a few test samples
print(f"\n🎲 Sample Predictions with Confidence:")
sample_indices = np.random.choice(len(X_test), 3, replace=False)
for idx in sample_indices:
    sample_text = X_test.iloc[idx]
    true_label = y_test.iloc[idx]
    predicted_label = y_pred[idx]
    
    # Get prediction probabilities
    probabilities = lr_classifier.predict_proba(X_test_tfidf[idx])
    max_prob = np.max(probabilities)
    
    print(f"\nSample {idx + 1}:")
    print(f"Text: '{sample_text[:100]}...'")
    print(f"True Label: {true_label}")
    print(f"Predicted: {predicted_label} (Confidence: {max_prob:.3f})")
    print("-" * 50)

# ============================================================================
# STEP 7: TEST MODEL ON CUSTOM CASE DESCRIPTIONS
# ============================================================================

def predict_crime_category(text_input, model, vectorizer):
    """
    Predict crime category for a custom text input
    """
    # Clean the input text
    cleaned_text = clean_text(text_input)
    
    # Vectorize the text
    text_tfidf = vectorizer.transform([cleaned_text])
    
    # Make prediction
    prediction = model.predict(text_tfidf)[0]
    probabilities = model.predict_proba(text_tfidf)[0]
    confidence = np.max(probabilities)
    
    # Get all class probabilities
    classes = model.classes_
    prob_dict = dict(zip(classes, probabilities))
    
    return prediction, confidence, prob_dict

print("\n🔮 Testing model on custom case descriptions:")
print("=" * 60)

# Sample custom case descriptions for testing
test_cases = [
    "A person broke into a house at night and stole valuable items including jewelry and electronics.",
    "The defendant was found driving under the influence of alcohol with a blood alcohol level of 0.12.",
    "An individual was caught selling illegal drugs to minors near a school playground.",
    "The accused physically attacked another person during an argument, causing serious injuries.",
    "Someone created fake documents to claim insurance money for a car accident that never happened."
]

for i, test_case in enumerate(test_cases, 1):
    print(f"\n🔍 Test Case {i}:")
    print(f"Input: '{test_case}'")
    
    prediction, confidence, probabilities = predict_crime_category(
        test_case, lr_classifier, tfidf
    )
    
    print(f"Predicted Crime: {prediction}")
    print(f"Confidence: {confidence:.3f}")
    
    # Show top 3 most likely categories
    sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
    print("Top 3 predictions:")
    for j, (crime_type, prob) in enumerate(sorted_probs[:3], 1):
        print(f"  {j}. {crime_type}: {prob:.3f}")
    print("-" * 50)

# ============================================================================
# STEP 8: INTERACTIVE PREDICTION FUNCTION
# ============================================================================

def interactive_crime_prediction():
    """
    Interactive function to test custom inputs
    """
    print("\n🎯 Interactive Crime Prediction")
    print("Enter a case description (or 'quit' to exit):")
    
    while True:
        user_input = input("\n📝 Case Description: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        
        if len(user_input.strip()) == 0:
            print("❌ Please enter a valid case description.")
            continue
        
        try:
            prediction, confidence, probabilities = predict_crime_category(
                user_input, lr_classifier, tfidf
            )
            
            print(f"\n🎯 Prediction Results:")
            print(f"Predicted Crime Category: {prediction}")
            print(f"Confidence Score: {confidence:.3f}")
            
            # Show all probabilities
            print(f"\nAll Category Probabilities:")
            sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
            for crime_type, prob in sorted_probs:
                print(f"  {crime_type}: {prob:.3f}")
        
        except Exception as e:
            print(f"❌ Error making prediction: {e}")

print(f"\n✅ Crime Classification Pipeline Completed!")
print(f"📊 Final Model Summary:")
print(f"   - Dataset Size: {len(df)} cases")
print(f"   - Crime Categories: {len(y.unique())}")
print(f"   - Model Accuracy: {accuracy:.4f}")
print(f"   - TF-IDF Features: {len(tfidf.vocabulary_)}")

# Uncomment the line below to run interactive predictions
# interactive_crime_prediction()

🚀 Starting Crime Classification Pipeline...
📂 Loading CSV files from: C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime
📄 Found 57 CSV files:
   - 01_District_wise_crimes_committed_IPC_2001_2012.csv
   - 01_District_wise_crimes_committed_IPC_2013.csv
   - 01_District_wise_crimes_committed_IPC_2014.csv
   - 02_01_District_wise_crimes_committed_against_SC_2001_2012.csv
   - 02_01_District_wise_crimes_committed_against_SC_2013.csv
   - 02_01_District_wise_crimes_committed_against_SC_2014.csv
   - 02_District_wise_crimes_committed_against_ST_2001_2012.csv
   - 02_District_wise_crimes_committed_against_ST_2013.csv
   - 02_District_wise_crimes_committed_against_ST_2014.csv
   - 03_District_wise_crimes_committed_against_children_2001_2012.csv
   - 03_District_wise_crimes_committed_against_children_2013.csv
   - 03_Persons_arrested_and_their_disposal_by_police_and_court_under_crime_against_children_2012.csv
   - 03_Persons_arrested_and_their_disposal_by_police_and_court_under_crime_aga

In [5]:
# Crime Classification using TF-IDF and Logistic Regression
# Complete script for data.ipynb - Updated for Crime Statistics Data

import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import re
import warnings
warnings.filterwarnings('ignore')

print("🚀 Starting Crime Classification Pipeline...")
print("📊 This script will predict STATE/UT based on district crime patterns")
print("=" * 70)

# ============================================================================
# STEP 1: READ AND MERGE ALL CSV FILES FROM crime/ FOLDER
# ============================================================================

def load_and_merge_csv_files(folder_path):
    """
    Read all CSV files from the specified folder and merge them into one DataFrame
    """
    print(f"📂 Loading CSV files from: {folder_path}")
    
    # Get all CSV files in the crime folder
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    print(f"📄 Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"   - {os.path.basename(file)}")
    
    # Read and combine all CSV files
    dataframes = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            print(f"   ✅ Loaded {file}: {len(df)} rows")
            dataframes.append(df)
        except Exception as e:
            print(f"   ❌ Error loading {file}: {e}")
    
    # Merge all dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"🔗 Combined dataset shape: {combined_df.shape}")
    
    return combined_df

# Load the data - Update this path to match your system
crime_folder = r"C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime"
df = load_and_merge_csv_files(crime_folder)

# Display basic information about the dataset
print("\n📊 Dataset Overview:")
print(f"Total samples: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Missing values:\n{df.isnull().sum()}")

# Check if required columns exist
required_columns = ['STATE/UT', 'DISTRICT']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Required column '{col}' not found in dataset")

# Display label distribution
print(f"\n🏷️ State Distribution:")
print(df['STATE/UT'].value_counts())

print(f"\n🏷️ District Distribution (Top 10):")
print(df['DISTRICT'].value_counts().head(10))

# Get all crime-related columns (exclude STATE/UT, DISTRICT, YEAR)
crime_columns = [col for col in df.columns if col not in ['STATE/UT', 'DISTRICT', 'YEAR']]
print(f"\n📊 Crime Categories Found: {len(crime_columns)}")
print("Sample crime columns:", crime_columns[:5])

# ============================================================================
# STEP 2: DATA PREPROCESSING AND FEATURE ENGINEERING
# ============================================================================

print("\n🧹 Preprocessing and preparing features...")

# Remove rows with missing values in key columns
print(f"🔍 Before cleaning - Total samples: {len(df)}")
print(f"Missing values in STATE/UT: {df['STATE/UT'].isnull().sum()}")
print(f"Missing values in DISTRICT: {df['DISTRICT'].isnull().sum()}")

# Drop rows where either STATE/UT or DISTRICT is missing
df = df.dropna(subset=['STATE/UT', 'DISTRICT']).reset_index(drop=True)

# Fill missing values in crime columns with 0 (assuming missing = no crimes reported)
for col in crime_columns:
    df[col] = df[col].fillna(0)

print(f"✅ After cleaning: {len(df)} samples remaining")

# Check if we still have enough data
if len(df) == 0:
    raise ValueError("No valid data remaining after cleaning!")

# Create a text representation of crime data for each district
def create_crime_description(row):
    """
    Create a text description of crime patterns for TF-IDF processing
    """
    descriptions = []
    
    # Add district name
    descriptions.append(f"district_{row['DISTRICT'].lower().replace(' ', '_')}")
    
    # Add crime patterns - mention each crime type multiple times based on frequency
    for col in crime_columns:
        crime_count = int(row[col]) if not pd.isna(row[col]) else 0
        if crime_count > 0:
            # Normalize crime name for text processing
            crime_name = col.lower().replace(' ', '_').replace('/', '_').replace('&', 'and')
            
            # Add crime type multiple times based on scaled frequency
            # Scale down large numbers to avoid overwhelming the text
            scaled_count = min(max(1, int(crime_count / 10)), 50)  # Scale and cap
            descriptions.extend([crime_name] * scaled_count)
    
    return ' '.join(descriptions)

print("🔤 Creating text representations of crime patterns...")
df['crime_text'] = df.apply(create_crime_description, axis=1)

# Check class distribution
print(f"📊 States after cleaning: {df['STATE/UT'].nunique()} unique states")
state_counts = df['STATE/UT'].value_counts()
print(f"State distribution:\n{state_counts}")

# Filter out states with very few samples (less than 2) to avoid stratify issues
min_samples = 2
states_to_keep = state_counts[state_counts >= min_samples].index
df = df[df['STATE/UT'].isin(states_to_keep)].reset_index(drop=True)

print(f"📊 After filtering: {len(df)} samples with {df['STATE/UT'].nunique()} states")

# Display sample data
print("\n📝 Sample crime text representations:")
for i in range(min(3, len(df))):
    print(f"State: {df.iloc[i]['STATE/UT']}")
    print(f"District: {df.iloc[i]['DISTRICT']}")
    print(f"Crime Text: {df.iloc[i]['crime_text'][:200]}...")
    print("-" * 50)

# ============================================================================
# STEP 3: SPLIT DATASET INTO TRAINING AND TESTING SETS
# ============================================================================

print("\n✂️ Splitting dataset into training and testing sets...")

# Prepare features (X) and target (y)
X = df['crime_text']    # Using crime pattern text as features
y = df['STATE/UT']      # Using STATE/UT as the target label

# Verify no missing values remain
print(f"🔍 Final check - Missing values in X: {X.isnull().sum()}")
print(f"🔍 Final check - Missing values in y: {y.isnull().sum()}")

# Check if we have enough samples for each class for stratified split
y_counts = y.value_counts()
min_class_count = y_counts.min()
print(f"📊 Minimum samples per state: {min_class_count}")

# Adjust stratify parameter based on minimum class count
if min_class_count >= 2:
    stratify_param = y
    print("✅ Using stratified split")
else:
    stratify_param = None
    print("⚠️ Using random split (some states have too few samples for stratified split)")

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=stratify_param  # Use stratified split only if possible
)

print(f"📈 Training set size: {len(X_train)} samples")
print(f"📉 Testing set size: {len(X_test)} samples")
print(f"🎯 Number of unique states: {len(y.unique())}")

# ============================================================================
# STEP 4: TEXT VECTORIZATION USING TF-IDF
# ============================================================================

print("\n🔤 Vectorizing text data using TF-IDF...")

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=10000,     # Limit to top 10,000 features
    ngram_range=(1, 2),     # Use both unigrams and bigrams
    min_df=2,               # Ignore terms that appear in less than 2 documents
    max_df=0.95,            # Ignore terms that appear in more than 95% of documents
    stop_words='english'    # Remove common English stop words
)

# Fit and transform training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform test data (don't fit again!)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF vectorization completed")
print(f"📊 Training features shape: {X_train_tfidf.shape}")
print(f"📊 Testing features shape: {X_test_tfidf.shape}")
print(f"📚 Vocabulary size: {len(tfidf.vocabulary_)}")

# ============================================================================
# STEP 5: TRAIN LOGISTIC REGRESSION CLASSIFIER
# ============================================================================

print("\n🤖 Training Logistic Regression classifier...")

# Initialize and train the classifier
lr_classifier = LogisticRegression(
    random_state=42,
    max_iter=1000,          # Increase max iterations for convergence
    C=1.0                   # Regularization strength
)

# Train the model
lr_classifier.fit(X_train_tfidf, y_train)

print("✅ Model training completed!")

# ============================================================================
# STEP 6: MODEL EVALUATION
# ============================================================================

print("\n📈 Evaluating model performance...")

# Make predictions on test set
y_pred = lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Generate detailed classification report
print(f"\n📋 Detailed Classification Report:")
print("=" * 60)
print(classification_report(y_test, y_pred))

# Display confusion matrix
print(f"\n🔍 Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Show prediction probabilities for a few test samples
print(f"\n🎲 Sample Predictions with Confidence:")
sample_indices = np.random.choice(len(X_test), 3, replace=False)
for idx in sample_indices:
    sample_text = X_test.iloc[idx]
    true_label = y_test.iloc[idx]
    predicted_label = y_pred[idx]
    
    # Get prediction probabilities
    probabilities = lr_classifier.predict_proba(X_test_tfidf[idx])
    max_prob = np.max(probabilities)
    
    print(f"\nSample {idx + 1}:")
    print(f"Text: '{sample_text[:100]}...'")
    print(f"True Label: {true_label}")
    print(f"Predicted: {predicted_label} (Confidence: {max_prob:.3f})")
    print("-" * 50)

# ============================================================================
# STEP 7: TEST MODEL ON CUSTOM CRIME PATTERNS
# ============================================================================

def predict_state_from_crimes(crime_pattern_text, model, vectorizer):
    """
    Predict state from a crime pattern description
    """
    # Vectorize the text
    text_tfidf = vectorizer.transform([crime_pattern_text])
    
    # Make prediction
    prediction = model.predict(text_tfidf)[0]
    probabilities = model.predict_proba(text_tfidf)[0]
    confidence = np.max(probabilities)
    
    # Get all class probabilities
    classes = model.classes_
    prob_dict = dict(zip(classes, probabilities))
    
    return prediction, confidence, prob_dict

print("\n🔮 Testing model on custom crime patterns:")
print("=" * 60)

# Sample custom crime patterns for testing
test_cases = [
    "district_mumbai murder murder robbery theft theft burglary cheating fraud",
    "district_delhi rape kidnapping_abduction assault theft auto_theft riots",
    "district_chennai dowry_deaths cruelty_by_husband murder hurt_grevious_hurt theft",
    "district_kolkata theft burglary robbery cheating counterfieting arson",
    "district_bangalore cyber_crimes cheating fraud theft auto_theft riots"
]

for i, test_case in enumerate(test_cases, 1):
    print(f"\n🔍 Test Case {i}:")
    print(f"Crime Pattern: '{test_case}'")
    
    prediction, confidence, probabilities = predict_state_from_crimes(
        test_case, lr_classifier, tfidf
    )
    
    print(f"Predicted State: {prediction}")
    print(f"Confidence: {confidence:.3f}")
    
    # Show top 3 most likely states
    sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
    print("Top 3 predictions:")
    for j, (state, prob) in enumerate(sorted_probs[:3], 1):
        print(f"  {j}. {state}: {prob:.3f}")
    print("-" * 50)

# ============================================================================
# STEP 8: INTERACTIVE PREDICTION FUNCTION
# ============================================================================

def interactive_state_prediction():
    """
    Interactive function to test custom crime patterns
    """
    print("\n🎯 Interactive State Prediction from Crime Patterns")
    print("Enter crime types and counts (e.g., 'murder theft robbery burglary') or 'quit' to exit:")
    
    while True:
        user_input = input("\n📝 Crime Pattern: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        
        if len(user_input.strip()) == 0:
            print("❌ Please enter a valid crime pattern.")
            continue
        
        try:
            prediction, confidence, probabilities = predict_state_from_crimes(
                user_input, lr_classifier, tfidf
            )
            
            print(f"\n🎯 Prediction Results:")
            print(f"Predicted State: {prediction}")
            print(f"Confidence Score: {confidence:.3f}")
            
            # Show all probabilities
            print(f"\nAll State Probabilities:")
            sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
            for state, prob in sorted_probs[:10]:  # Show top 10
                print(f"  {state}: {prob:.3f}")
        
        except Exception as e:
            print(f"❌ Error making prediction: {e}")

print(f"\n✅ Crime Classification Pipeline Completed!")
print(f"📊 Final Model Summary:")
print(f"   - Dataset Size: {len(df)} records")
print(f"   - States/UTs: {len(y.unique())}")
print(f"   - Model Accuracy: {accuracy:.4f}")
print(f"   - TF-IDF Features: {len(tfidf.vocabulary_)}")
print(f"   - Task: Predict State/UT from crime pattern data")

# Uncomment the line below to run interactive predictions
# interactive_state_prediction()

🚀 Starting Crime Classification Pipeline...
📊 This script will predict STATE/UT based on district crime patterns
📂 Loading CSV files from: C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime
📄 Found 57 CSV files:
   - 01_District_wise_crimes_committed_IPC_2001_2012.csv
   - 01_District_wise_crimes_committed_IPC_2013.csv
   - 01_District_wise_crimes_committed_IPC_2014.csv
   - 02_01_District_wise_crimes_committed_against_SC_2001_2012.csv
   - 02_01_District_wise_crimes_committed_against_SC_2013.csv
   - 02_01_District_wise_crimes_committed_against_SC_2014.csv
   - 02_District_wise_crimes_committed_against_ST_2001_2012.csv
   - 02_District_wise_crimes_committed_against_ST_2013.csv
   - 02_District_wise_crimes_committed_against_ST_2014.csv
   - 03_District_wise_crimes_committed_against_children_2001_2012.csv
   - 03_District_wise_crimes_committed_against_children_2013.csv
   - 03_Persons_arrested_and_their_disposal_by_police_and_court_under_crime_against_children_2012.csv
   - 03_Pe

In [6]:
# Add this after loading data to standardize state names
df['STATE/UT'] = df['STATE/UT'].str.upper().str.strip()
df = df[~df['STATE/UT'].str.contains('TOTAL|Total', na=False)]

In [7]:
# Focus on major crime categories instead of all 656 columns
major_crimes = ['MURDER', 'RAPE', 'KIDNAPPING & ABDUCTION', 'DACOITY', 
                'ROBBERY', 'BURGLARY', 'THEFT', 'RIOTS', 'CHEATING']

In [8]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
lr_classifier = LogisticRegression(class_weight='balanced')

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')

In [None]:
interactive_state_prediction(
)
# Then test with: "murder theft robbery burglary riots"


🎯 Interactive State Prediction from Crime Patterns
Enter crime types and counts (e.g., 'murder theft robbery burglary') or 'quit' to exit:
❌ Error making prediction: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.


In [1]:
# Enhanced Crime Classification using TF-IDF and Multiple Models
# Improved version addressing data quality and performance issues

import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

print("🚀 Enhanced Crime Classification Pipeline...")
print("📊 Improvements: Data cleaning, Feature selection, Model comparison")
print("=" * 70)

# ============================================================================
# STEP 1: IMPROVED DATA LOADING AND CLEANING
# ============================================================================

def load_and_merge_csv_files(folder_path):
    """Enhanced CSV loading with better error handling"""
    print(f"📂 Loading CSV files from: {folder_path}")
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    print(f"📄 Found {len(csv_files)} CSV files")
    dataframes = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file, low_memory=False)
            if len(df) > 0:  # Only add non-empty dataframes
                dataframes.append(df)
                print(f"   ✅ Loaded {os.path.basename(file)}: {len(df)} rows")
        except Exception as e:
            print(f"   ⚠️ Skipped {os.path.basename(file)}: {e}")
    
    if not dataframes:
        raise ValueError("No valid dataframes loaded")
    
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"🔗 Combined dataset: {combined_df.shape}")
    return combined_df

def clean_and_standardize_data(df):
    """Enhanced data cleaning with standardization"""
    print("\n🧹 Enhanced data cleaning and standardization...")
    
    # Standardize state names
    if 'STATE/UT' in df.columns:
        df['STATE/UT'] = df['STATE/UT'].astype(str).str.upper().str.strip()
        # Remove totals and summary rows
        df = df[~df['STATE/UT'].str.contains('TOTAL|Total|ALL-INDIA|All-India', na=False)]
    
    # Standardize district names
    if 'DISTRICT' in df.columns:
        df['DISTRICT'] = df['DISTRICT'].astype(str).str.upper().str.strip()
        df = df[~df['DISTRICT'].str.contains('TOTAL|Total', na=False)]
    
    # Remove rows with missing key columns
    before_count = len(df)
    df = df.dropna(subset=['STATE/UT', 'DISTRICT']).reset_index(drop=True)
    after_count = len(df)
    
    print(f"📊 Removed {before_count - after_count} rows with missing STATE/DISTRICT")
    print(f"📊 Remaining samples: {after_count}")
    
    return df

def select_major_crime_features(df):
    """Select only major crime categories for better feature quality"""
    print("\n🎯 Selecting major crime features...")
    
    # Define major crime categories (most common and interpretable)
    major_crime_patterns = [
        'MURDER', 'RAPE', 'KIDNAPPING', 'DACOITY', 'ROBBERY', 
        'BURGLARY', 'THEFT', 'RIOTS', 'CHEATING', 'ARSON',
        'HURT', 'DOWRY', 'ASSAULT', 'FRAUD', 'EXTORTION'
    ]
    
    # Find columns that match major crime patterns
    selected_columns = []
    for col in df.columns:
        col_upper = col.upper()
        if any(crime in col_upper for crime in major_crime_patterns):
            # Skip very specific subcategories to reduce noise
            if not any(skip in col_upper for skip in ['TOTAL', 'GRAND', 'SECTION']):
                selected_columns.append(col)
    
    print(f"📋 Selected {len(selected_columns)} major crime columns")
    print(f"Sample columns: {selected_columns[:5]}")
    
    return selected_columns

# Load and clean data
crime_folder = r"C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime"
df = load_and_merge_csv_files(crime_folder)
df = clean_and_standardize_data(df)

# Check data quality
print(f"\n📊 Data Quality Check:")
print(f"Unique states: {df['STATE/UT'].nunique()}")
print(f"Unique districts: {df['DISTRICT'].nunique()}")

# Filter out states with too few samples for reliable training
min_samples_per_state = 50  # Increased threshold
state_counts = df['STATE/UT'].value_counts()
valid_states = state_counts[state_counts >= min_samples_per_state].index
df = df[df['STATE/UT'].isin(valid_states)].reset_index(drop=True)

print(f"📊 After filtering (min {min_samples_per_state} samples): {len(df)} samples, {df['STATE/UT'].nunique()} states")
print(f"Top states: {list(df['STATE/UT'].value_counts().head().index)}")

# ============================================================================
# STEP 2: ENHANCED FEATURE ENGINEERING
# ============================================================================

print("\n🔧 Enhanced feature engineering...")

# Select major crime features
crime_columns = select_major_crime_features(df)

# Fill missing values with 0 for crime counts
for col in crime_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

def create_enhanced_crime_description(row):
    """Create more sophisticated crime pattern descriptions"""
    descriptions = []
    
    # Add district identifier
    district = str(row['DISTRICT']).lower().replace(' ', '_').replace('.', '').replace('(', '').replace(')', '')
    descriptions.append(f"district_{district}")
    
    # Add year if available
    if 'YEAR' in row and not pd.isna(row['YEAR']):
        year = int(row['YEAR'])
        if year < 2005:
            descriptions.append("period_early_2000s")
        elif year < 2010:
            descriptions.append("period_mid_2000s")
        else:
            descriptions.append("period_recent")
    
    # Process crime counts with better scaling
    crime_totals = []
    for col in crime_columns:
        if col in row and not pd.isna(row[col]):
            count = max(0, int(row[col]))
            if count > 0:
                crime_totals.append((col, count))
    
    # Sort crimes by frequency to emphasize major patterns
    crime_totals.sort(key=lambda x: x[1], reverse=True)
    
    # Add crime patterns with intelligent scaling
    total_crimes = sum(count for _, count in crime_totals) or 1
    
    for crime_type, count in crime_totals:
        if count > 0:
            # Normalize crime name
            crime_name = crime_type.lower().replace(' ', '_').replace('/', '_').replace('&', 'and')
            
            # Calculate relative frequency
            frequency_ratio = count / total_crimes
            
            # Scale based on both absolute count and relative frequency
            if frequency_ratio > 0.3:  # Very dominant crime
                repetitions = min(20, max(5, int(count / 50)))
            elif frequency_ratio > 0.1:  # Significant crime
                repetitions = min(10, max(2, int(count / 100)))
            elif count > 10:  # Notable crime
                repetitions = min(5, max(1, int(count / 200)))
            else:  # Minor crime
                repetitions = 1
            
            descriptions.extend([f"crime_{crime_name}"] * repetitions)
    
    return ' '.join(descriptions)

print("🔤 Creating enhanced text representations...")
df['crime_text'] = df.apply(create_enhanced_crime_description, axis=1)

# Remove empty crime descriptions
df = df[df['crime_text'].str.len() > 10].reset_index(drop=True)
print(f"📊 Final dataset: {len(df)} samples")

# Display sample enhanced descriptions
print(f"\n📝 Sample enhanced crime descriptions:")
for i in range(min(3, len(df))):
    print(f"State: {df.iloc[i]['STATE/UT']}")
    print(f"District: {df.iloc[i]['DISTRICT']}")
    print(f"Enhanced Text: {df.iloc[i]['crime_text'][:150]}...")
    print("-" * 50)

# ============================================================================
# STEP 3: BALANCED DATA SPLITTING
# ============================================================================

print(f"\n✂️ Enhanced data splitting...")

X = df['crime_text']
y = df['STATE/UT']

# Check class distribution
y_counts = y.value_counts()
print(f"📊 Class distribution stats:")
print(f"   Most samples: {y_counts.iloc[0]} ({y_counts.index[0]})")
print(f"   Least samples: {y_counts.iloc[-1]} ({y_counts.index[-1]})")
print(f"   Mean samples per state: {y_counts.mean():.1f}")

# Split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"📈 Training set: {len(X_train)} samples")
print(f"📉 Test set: {len(X_test)} samples")
print(f"🎯 States to predict: {len(y.unique())}")

# ============================================================================
# STEP 4: OPTIMIZED TF-IDF VECTORIZATION
# ============================================================================

print(f"\n🔤 Optimized TF-IDF vectorization...")

# Optimized TF-IDF parameters
tfidf = TfidfVectorizer(
    max_features=5000,      # Increased feature limit
    ngram_range=(1, 3),     # Include trigrams for better patterns
    min_df=3,               # Require terms in at least 3 documents
    max_df=0.8,             # Remove very common terms
    stop_words='english',
    sublinear_tf=True,      # Use sublinear term frequency
    norm='l2'               # L2 normalization
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF completed")
print(f"📊 Feature matrix: {X_train_tfidf.shape}")
print(f"📚 Vocabulary size: {len(tfidf.vocabulary_)}")

# ============================================================================
# STEP 5: MODEL COMPARISON WITH HYPERPARAMETER TUNING
# ============================================================================

print(f"\n🤖 Training and comparing multiple models...")

# Calculate class weights for handling imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, max_iter=1000, class_weight='balanced', C=1.0
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42, n_estimators=100, class_weight='balanced', max_depth=20
    ),
    'SVM (Linear)': SVC(
        random_state=42, kernel='linear', class_weight='balanced', probability=True
    )
}

model_results = {}

for model_name, model in models.items():
    print(f"\n🔄 Training {model_name}...")
    
    # Train model
    model.fit(X_train_tfidf, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store results
    model_results[model_name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }
    
    print(f"✅ {model_name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Find best model
best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['accuracy'])
best_model = model_results[best_model_name]['model']
best_accuracy = model_results[best_model_name]['accuracy']
best_predictions = model_results[best_model_name]['predictions']

print(f"\n🏆 Best Model: {best_model_name} with {best_accuracy:.4f} accuracy")

# ============================================================================
# STEP 6: ENHANCED EVALUATION
# ============================================================================

print(f"\n📈 Detailed evaluation of best model ({best_model_name})...")

# Classification report
print(f"\n📋 Classification Report:")
print("=" * 60)
print(classification_report(y_test, best_predictions))

# Top performing states analysis
print(f"\n🔍 Model Performance Analysis:")
report_dict = classification_report(y_test, best_predictions, output_dict=True)

# Get F1 scores for each state
state_f1_scores = [(state, scores['f1-score']) for state, scores in report_dict.items() 
                   if isinstance(scores, dict) and 'f1-score' in scores]
state_f1_scores.sort(key=lambda x: x[1], reverse=True)

print(f"📊 Top 5 Best Predicted States:")
for i, (state, f1) in enumerate(state_f1_scores[:5], 1):
    print(f"   {i}. {state}: F1-Score = {f1:.3f}")

print(f"\n📊 Bottom 5 States (Need Improvement):")
for i, (state, f1) in enumerate(state_f1_scores[-5:], 1):
    print(f"   {i}. {state}: F1-Score = {f1:.3f}")

# ============================================================================
# STEP 7: ENHANCED TESTING WITH REALISTIC EXAMPLES
# ============================================================================

def predict_state_from_crime_pattern(crime_text, model, vectorizer):
    """Enhanced prediction function"""
    text_tfidf = vectorizer.transform([crime_text])
    prediction = model.predict(text_tfidf)[0]
    
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(text_tfidf)[0]
        confidence = np.max(probabilities)
        classes = model.classes_
        prob_dict = dict(zip(classes, probabilities))
    else:
        confidence = 1.0  # For models without probability support
        prob_dict = {prediction: 1.0}
    
    return prediction, confidence, prob_dict

print(f"\n🔮 Testing enhanced model with realistic crime patterns:")
print("=" * 60)

# More realistic test cases based on actual data patterns
enhanced_test_cases = [
    "district_mumbai crime_theft crime_theft crime_burglary crime_cheating crime_fraud period_recent",
    "district_delhi crime_murder crime_rape crime_kidnapping crime_theft crime_riots period_recent", 
    "district_chennai crime_dowry_deaths crime_assault crime_theft period_recent",
    "district_kolkata crime_theft crime_burglary crime_hurt period_mid_2000s",
    "district_patna crime_murder crime_kidnapping crime_dacoity crime_riots period_recent",
    "district_jaipur crime_theft crime_cheating crime_burglary period_recent"
]

for i, test_case in enumerate(enhanced_test_cases, 1):
    print(f"\n🔍 Enhanced Test Case {i}:")
    print(f"Pattern: '{test_case}'")
    
    prediction, confidence, probabilities = predict_state_from_crime_pattern(
        test_case, best_model, tfidf
    )
    
    print(f"Predicted State: {prediction}")
    print(f"Confidence: {confidence:.3f}")
    
    # Show top 3 predictions
    if len(probabilities) > 1:
        sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
        print("Top 3 predictions:")
        for j, (state, prob) in enumerate(sorted_probs[:3], 1):
            print(f"  {j}. {state}: {prob:.3f}")
    print("-" * 50)

# ============================================================================
# STEP 8: INTERACTIVE ENHANCED PREDICTION
# ============================================================================

def interactive_enhanced_prediction():
    """Enhanced interactive prediction with better guidance"""
    print(f"\n🎯 Interactive Enhanced Crime Pattern Prediction")
    print("Tips for better predictions:")
    print("- Include district name: district_[name]")
    print("- Add crime types: crime_murder crime_theft crime_robbery")
    print("- Optionally add time period: period_recent period_early_2000s")
    print("- Example: 'district_bangalore crime_theft crime_cheating crime_fraud period_recent'")
    print("\nEnter pattern or 'quit':")
    
    while True:
        user_input = input("\n📝 Enhanced Pattern: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        
        if len(user_input.strip()) == 0:
            print("❌ Please enter a valid pattern.")
            continue
        
        try:
            prediction, confidence, probabilities = predict_state_from_crime_pattern(
                user_input, best_model, tfidf
            )
            
            print(f"\n🎯 Enhanced Prediction Results:")
            print(f"Predicted State: {prediction}")
            print(f"Confidence: {confidence:.3f}")
            
            if len(probabilities) > 1:
                print(f"\nTop 5 State Probabilities:")
                sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
                for state, prob in sorted_probs[:5]:
                    print(f"  {state}: {prob:.3f}")
        
        except Exception as e:
            print(f"❌ Error: {e}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print(f"\n✅ Enhanced Crime Classification Pipeline Completed!")
print(f"📊 Enhanced Model Summary:")
print(f"   - Dataset Size: {len(df)} records")
print(f"   - States/UTs: {len(y.unique())}")
print(f"   - Best Model: {best_model_name}")
print(f"   - Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"   - TF-IDF Features: {len(tfidf.vocabulary_)}")
print(f"   - Crime Features Used: {len(crime_columns)}")

print(f"\n🔧 Key Improvements Made:")
print(f"   ✅ Enhanced data cleaning and standardization")
print(f"   ✅ Focus on major crime categories only")
print(f"   ✅ Balanced class weights for better performance")
print(f"   ✅ Multiple model comparison")
print(f"   ✅ Optimized TF-IDF parameters")
print(f"   ✅ Better feature engineering with time periods")

# Uncomment to run interactive predictions
# interactive_enhanced_prediction()

🚀 Enhanced Crime Classification Pipeline...
📊 Improvements: Data cleaning, Feature selection, Model comparison
📂 Loading CSV files from: C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime
📄 Found 57 CSV files
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2001_2012.csv: 9017 rows
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2013.csv: 823 rows
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2014.csv: 838 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2001_2012.csv: 9018 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2013.csv: 823 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2014.csv: 837 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2001_2012.csv: 9018 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2013.csv: 823 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2014.csv: 837 rows
   ✅ Loaded 03_District_wise_crimes_committed_against_children_2001_2012.csv: 9015 rows
   ✅ L

In [1]:
# Enhanced Crime Classification using TF-IDF and Multiple Models
# Improved version addressing data quality and performance issues
# Added model saving and loading functionality

import pandas as pd
import numpy as np
import os
import glob
import joblib
import pickle
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

print("🚀 Enhanced Crime Classification Pipeline...")
print("📊 Improvements: Data cleaning, Feature selection, Model comparison, Model Saving")
print("=" * 70)

# ============================================================================
# STEP 1: IMPROVED DATA LOADING AND CLEANING
# ============================================================================

def load_and_merge_csv_files(folder_path):
    """Enhanced CSV loading with better error handling"""
    print(f"📂 Loading CSV files from: {folder_path}")
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    print(f"📄 Found {len(csv_files)} CSV files")
    dataframes = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file, low_memory=False)
            if len(df) > 0:  # Only add non-empty dataframes
                dataframes.append(df)
                print(f"   ✅ Loaded {os.path.basename(file)}: {len(df)} rows")
        except Exception as e:
            print(f"   ⚠️ Skipped {os.path.basename(file)}: {e}")
    
    if not dataframes:
        raise ValueError("No valid dataframes loaded")
    
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"🔗 Combined dataset: {combined_df.shape}")
    return combined_df

def clean_and_standardize_data(df):
    """Enhanced data cleaning with standardization"""
    print("\n🧹 Enhanced data cleaning and standardization...")
    
    # Standardize state names
    if 'STATE/UT' in df.columns:
        df['STATE/UT'] = df['STATE/UT'].astype(str).str.upper().str.strip()
        # Remove totals and summary rows
        df = df[~df['STATE/UT'].str.contains('TOTAL|Total|ALL-INDIA|All-India', na=False)]
    
    # Standardize district names
    if 'DISTRICT' in df.columns:
        df['DISTRICT'] = df['DISTRICT'].astype(str).str.upper().str.strip()
        df = df[~df['DISTRICT'].str.contains('TOTAL|Total', na=False)]
    
    # Remove rows with missing key columns
    before_count = len(df)
    df = df.dropna(subset=['STATE/UT', 'DISTRICT']).reset_index(drop=True)
    after_count = len(df)
    
    print(f"📊 Removed {before_count - after_count} rows with missing STATE/DISTRICT")
    print(f"📊 Remaining samples: {after_count}")
    
    return df

def select_major_crime_features(df):
    """Select only major crime categories for better feature quality"""
    print("\n🎯 Selecting major crime features...")
    
    # Define major crime categories (most common and interpretable)
    major_crime_patterns = [
        'MURDER', 'RAPE', 'KIDNAPPING', 'DACOITY', 'ROBBERY', 
        'BURGLARY', 'THEFT', 'RIOTS', 'CHEATING', 'ARSON',
        'HURT', 'DOWRY', 'ASSAULT', 'FRAUD', 'EXTORTION'
    ]
    
    # Find columns that match major crime patterns
    selected_columns = []
    for col in df.columns:
        col_upper = col.upper()
        if any(crime in col_upper for crime in major_crime_patterns):
            # Skip very specific subcategories to reduce noise
            if not any(skip in col_upper for skip in ['TOTAL', 'GRAND', 'SECTION']):
                selected_columns.append(col)
    
    print(f"📋 Selected {len(selected_columns)} major crime columns")
    print(f"Sample columns: {selected_columns[:5]}")
    
    return selected_columns

# Load and clean data
crime_folder = r"C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime"
df = load_and_merge_csv_files(crime_folder)
df = clean_and_standardize_data(df)

# Check data quality
print(f"\n📊 Data Quality Check:")
print(f"Unique states: {df['STATE/UT'].nunique()}")
print(f"Unique districts: {df['DISTRICT'].nunique()}")

# Filter out states with too few samples for reliable training
min_samples_per_state = 50  # Increased threshold
state_counts = df['STATE/UT'].value_counts()
valid_states = state_counts[state_counts >= min_samples_per_state].index
df = df[df['STATE/UT'].isin(valid_states)].reset_index(drop=True)

print(f"📊 After filtering (min {min_samples_per_state} samples): {len(df)} samples, {df['STATE/UT'].nunique()} states")
print(f"Top states: {list(df['STATE/UT'].value_counts().head().index)}")

# ============================================================================
# STEP 2: ENHANCED FEATURE ENGINEERING
# ============================================================================

print("\n🔧 Enhanced feature engineering...")

# Select major crime features
crime_columns = select_major_crime_features(df)

# Fill missing values with 0 for crime counts
for col in crime_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

def create_enhanced_crime_description(row):
    """Create more sophisticated crime pattern descriptions"""
    descriptions = []
    
    # Add district identifier
    district = str(row['DISTRICT']).lower().replace(' ', '_').replace('.', '').replace('(', '').replace(')', '')
    descriptions.append(f"district_{district}")
    
    # Add year if available
    if 'YEAR' in row and not pd.isna(row['YEAR']):
        year = int(row['YEAR'])
        if year < 2005:
            descriptions.append("period_early_2000s")
        elif year < 2010:
            descriptions.append("period_mid_2000s")
        else:
            descriptions.append("period_recent")
    
    # Process crime counts with better scaling
    crime_totals = []
    for col in crime_columns:
        if col in row and not pd.isna(row[col]):
            count = max(0, int(row[col]))
            if count > 0:
                crime_totals.append((col, count))
    
    # Sort crimes by frequency to emphasize major patterns
    crime_totals.sort(key=lambda x: x[1], reverse=True)
    
    # Add crime patterns with intelligent scaling
    total_crimes = sum(count for _, count in crime_totals) or 1
    
    for crime_type, count in crime_totals:
        if count > 0:
            # Normalize crime name
            crime_name = crime_type.lower().replace(' ', '_').replace('/', '_').replace('&', 'and')
            
            # Calculate relative frequency
            frequency_ratio = count / total_crimes
            
            # Scale based on both absolute count and relative frequency
            if frequency_ratio > 0.3:  # Very dominant crime
                repetitions = min(20, max(5, int(count / 50)))
            elif frequency_ratio > 0.1:  # Significant crime
                repetitions = min(10, max(2, int(count / 100)))
            elif count > 10:  # Notable crime
                repetitions = min(5, max(1, int(count / 200)))
            else:  # Minor crime
                repetitions = 1
            
            descriptions.extend([f"crime_{crime_name}"] * repetitions)
    
    return ' '.join(descriptions)

print("🔤 Creating enhanced text representations...")
df['crime_text'] = df.apply(create_enhanced_crime_description, axis=1)

# Remove empty crime descriptions
df = df[df['crime_text'].str.len() > 10].reset_index(drop=True)
print(f"📊 Final dataset: {len(df)} samples")

# Display sample enhanced descriptions
print(f"\n📝 Sample enhanced crime descriptions:")
for i in range(min(3, len(df))):
    print(f"State: {df.iloc[i]['STATE/UT']}")
    print(f"District: {df.iloc[i]['DISTRICT']}")
    print(f"Enhanced Text: {df.iloc[i]['crime_text'][:150]}...")
    print("-" * 50)

# ============================================================================
# STEP 3: BALANCED DATA SPLITTING
# ============================================================================

print(f"\n✂️ Enhanced data splitting...")

X = df['crime_text']
y = df['STATE/UT']

# Check class distribution
y_counts = y.value_counts()
print(f"📊 Class distribution stats:")
print(f"   Most samples: {y_counts.iloc[0]} ({y_counts.index[0]})")
print(f"   Least samples: {y_counts.iloc[-1]} ({y_counts.index[-1]})")
print(f"   Mean samples per state: {y_counts.mean():.1f}")

# Split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"📈 Training set: {len(X_train)} samples")
print(f"📉 Test set: {len(X_test)} samples")
print(f"🎯 States to predict: {len(y.unique())}")

# ============================================================================
# STEP 4: OPTIMIZED TF-IDF VECTORIZATION
# ============================================================================

print(f"\n🔤 Optimized TF-IDF vectorization...")

# Optimized TF-IDF parameters
tfidf = TfidfVectorizer(
    max_features=5000,      # Increased feature limit
    ngram_range=(1, 3),     # Include trigrams for better patterns
    min_df=3,               # Require terms in at least 3 documents
    max_df=0.8,             # Remove very common terms
    stop_words='english',
    sublinear_tf=True,      # Use sublinear term frequency
    norm='l2'               # L2 normalization
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF completed")
print(f"📊 Feature matrix: {X_train_tfidf.shape}")
print(f"📚 Vocabulary size: {len(tfidf.vocabulary_)}")

# ============================================================================
# STEP 5: MODEL COMPARISON WITH HYPERPARAMETER TUNING
# ============================================================================

print(f"\n🤖 Training and comparing multiple models...")

# Calculate class weights for handling imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, max_iter=1000, class_weight='balanced', C=1.0
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42, n_estimators=100, class_weight='balanced', max_depth=20
    ),
    'SVM (Linear)': SVC(
        random_state=42, kernel='linear', class_weight='balanced', probability=True
    )
}

model_results = {}

for model_name, model in models.items():
    print(f"\n🔄 Training {model_name}...")
    
    # Train model
    model.fit(X_train_tfidf, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store results
    model_results[model_name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }
    
    print(f"✅ {model_name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Find best model
best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['accuracy'])
best_model = model_results[best_model_name]['model']
best_accuracy = model_results[best_model_name]['accuracy']
best_predictions = model_results[best_model_name]['predictions']

print(f"\n🏆 Best Model: {best_model_name} with {best_accuracy:.4f} accuracy")

# ============================================================================
# STEP 6: MODEL SAVING FUNCTIONALITY
# ============================================================================

def save_model_components(model, vectorizer, model_name, accuracy, metadata=None):
    """Save all model components with metadata"""
    print(f"\n💾 Saving model components...")
    
    # Create models directory if it doesn't exist
    models_dir = "saved_models"
    os.makedirs(models_dir, exist_ok=True)
    
    # Create timestamp for versioning
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Create model-specific directory
    model_dir = os.path.join(models_dir, f"{model_name.replace(' ', '_').lower()}_{timestamp}")
    os.makedirs(model_dir, exist_ok=True)
    
    # Save model
    model_path = os.path.join(model_dir, "model.pkl")
    joblib.dump(model, model_path)
    print(f"✅ Model saved: {model_path}")
    
    # Save vectorizer
    vectorizer_path = os.path.join(model_dir, "vectorizer.pkl")
    joblib.dump(vectorizer, vectorizer_path)
    print(f"✅ Vectorizer saved: {vectorizer_path}")
    
    # Save metadata
    model_metadata = {
        'model_name': model_name,
        'accuracy': accuracy,
        'training_date': datetime.now().isoformat(),
        'n_features': len(vectorizer.vocabulary_),
        'n_classes': len(model.classes_) if hasattr(model, 'classes_') else 'unknown',
        'classes': list(model.classes_) if hasattr(model, 'classes_') else [],
        'vectorizer_params': vectorizer.get_params(),
        'model_params': model.get_params(),
        'dataset_size': len(X_train) + len(X_test),
        'train_size': len(X_train),
        'test_size': len(X_test)
    }
    
    if metadata:
        model_metadata.update(metadata)
    
    metadata_path = os.path.join(model_dir, "metadata.pkl")
    with open(metadata_path, 'wb') as f:
        pickle.dump(model_metadata, f)
    print(f"✅ Metadata saved: {metadata_path}")
    
    # Save human-readable info
    info_path = os.path.join(model_dir, "model_info.txt")
    with open(info_path, 'w') as f:
        f.write(f"Crime Classification Model Information\n")
        f.write(f"====================================\n\n")
        f.write(f"Model: {model_name}\n")
        f.write(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\n")
        f.write(f"Training Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Features: {len(vectorizer.vocabulary_)}\n")
        f.write(f"Classes: {len(model.classes_) if hasattr(model, 'classes_') else 'unknown'}\n")
        f.write(f"Dataset Size: {len(X_train) + len(X_test)}\n\n")
        f.write(f"Model Parameters:\n")
        for param, value in model.get_params().items():
            f.write(f"  {param}: {value}\n")
        f.write(f"\nVectorizer Parameters:\n")
        for param, value in vectorizer.get_params().items():
            f.write(f"  {param}: {value}\n")
    print(f"✅ Model info saved: {info_path}")
    
    return model_dir

def load_model_components(model_dir):
    """Load saved model components"""
    print(f"\n📥 Loading model from: {model_dir}")
    
    # Load model
    model_path = os.path.join(model_dir, "model.pkl")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")
    model = joblib.load(model_path)
    print(f"✅ Model loaded: {model_path}")
    
    # Load vectorizer
    vectorizer_path = os.path.join(model_dir, "vectorizer.pkl")
    if not os.path.exists(vectorizer_path):
        raise FileNotFoundError(f"Vectorizer file not found: {vectorizer_path}")
    vectorizer = joblib.load(vectorizer_path)
    print(f"✅ Vectorizer loaded: {vectorizer_path}")
    
    # Load metadata
    metadata_path = os.path.join(model_dir, "metadata.pkl")
    metadata = None
    if os.path.exists(metadata_path):
        with open(metadata_path, 'rb') as f:
            metadata = pickle.load(f)
        print(f"✅ Metadata loaded: {metadata_path}")
    
    return model, vectorizer, metadata

# Save the best model
additional_metadata = {
    'crime_columns': crime_columns,
    'states_included': list(y.unique()),
    'min_samples_per_state': min_samples_per_state,
    'test_accuracy_details': classification_report(y_test, best_predictions, output_dict=True)
}

saved_model_dir = save_model_components(
    best_model, 
    tfidf, 
    best_model_name, 
    best_accuracy,
    additional_metadata
)

print(f"\n🎉 Model successfully saved to: {saved_model_dir}")

# ============================================================================
# STEP 7: MODEL LOADING AND TESTING FUNCTIONALITY
# ============================================================================

class CrimeClassificationPredictor:
    """A class for making predictions with saved models"""
    
    def __init__(self, model_dir):
        """Initialize predictor with saved model"""
        self.model, self.vectorizer, self.metadata = load_model_components(model_dir)
        self.model_dir = model_dir
        
    def predict_state_from_crime_pattern(self, crime_text):
        """Make prediction from crime pattern text"""
        text_tfidf = self.vectorizer.transform([crime_text])
        prediction = self.model.predict(text_tfidf)[0]
        
        if hasattr(self.model, 'predict_proba'):
            probabilities = self.model.predict_proba(text_tfidf)[0]
            confidence = np.max(probabilities)
            classes = self.model.classes_
            prob_dict = dict(zip(classes, probabilities))
        else:
            confidence = 1.0
            prob_dict = {prediction: 1.0}
        
        return prediction, confidence, prob_dict
    
    def get_model_info(self):
        """Get model information"""
        return self.metadata
    
    def predict_batch(self, crime_texts):
        """Make batch predictions"""
        texts_tfidf = self.vectorizer.transform(crime_texts)
        predictions = self.model.predict(texts_tfidf)
        
        if hasattr(self.model, 'predict_proba'):
            probabilities = self.model.predict_proba(texts_tfidf)
            confidences = np.max(probabilities, axis=1)
        else:
            confidences = np.ones(len(predictions))
        
        return predictions, confidences

# Demonstrate model loading and prediction
print(f"\n🔄 Testing model loading and prediction...")

# Load the saved model
predictor = CrimeClassificationPredictor(saved_model_dir)

# Test with sample crime patterns
test_cases = [
    "district_mumbai crime_theft crime_theft crime_burglary crime_cheating crime_fraud period_recent",
    "district_delhi crime_murder crime_rape crime_kidnapping crime_theft crime_riots period_recent", 
    "district_chennai crime_dowry_deaths crime_assault crime_theft period_recent",
]

print(f"\n🧪 Testing loaded model with sample cases:")
for i, test_case in enumerate(test_cases, 1):
    prediction, confidence, probabilities = predictor.predict_state_from_crime_pattern(test_case)
    print(f"\nTest Case {i}: {test_case[:60]}...")
    print(f"Prediction: {prediction} (Confidence: {confidence:.3f})")

# ============================================================================
# STEP 8: ENHANCED EVALUATION
# ============================================================================

print(f"\n📈 Detailed evaluation of best model ({best_model_name})...")

# Classification report
print(f"\n📋 Classification Report:")
print("=" * 60)
print(classification_report(y_test, best_predictions))

# Top performing states analysis
print(f"\n🔍 Model Performance Analysis:")
report_dict = classification_report(y_test, best_predictions, output_dict=True)

# Get F1 scores for each state
state_f1_scores = [(state, scores['f1-score']) for state, scores in report_dict.items() 
                   if isinstance(scores, dict) and 'f1-score' in scores]
state_f1_scores.sort(key=lambda x: x[1], reverse=True)

print(f"📊 Top 5 Best Predicted States:")
for i, (state, f1) in enumerate(state_f1_scores[:5], 1):
    print(f"   {i}. {state}: F1-Score = {f1:.3f}")

print(f"\n📊 Bottom 5 States (Need Improvement):")
for i, (state, f1) in enumerate(state_f1_scores[-5:], 1):
    print(f"   {i}. {state}: F1-Score = {f1:.3f}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print(f"\n✅ Enhanced Crime Classification Pipeline with Model Saving Completed!")
print(f"📊 Enhanced Model Summary:")
print(f"   - Dataset Size: {len(df)} records")
print(f"   - States/UTs: {len(y.unique())}")
print(f"   - Best Model: {best_model_name}")
print(f"   - Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"   - TF-IDF Features: {len(tfidf.vocabulary_)}")
print(f"   - Crime Features Used: {len(crime_columns)}")
print(f"   - Model Saved To: {saved_model_dir}")

print(f"\n🔧 Key Features Added:")
print(f"   ✅ Complete model saving with metadata")
print(f"   ✅ Model loading and prediction class")
print(f"   ✅ Batch prediction capability")
print(f"   ✅ Human-readable model information")
print(f"   ✅ Versioned model storage")
print(f"   ✅ Error handling for model I/O")

print(f"\n📁 Saved Files:")
print(f"   - model.pkl (trained model)")
print(f"   - vectorizer.pkl (TF-IDF vectorizer)")
print(f"   - metadata.pkl (model metadata)")
print(f"   - model_info.txt (human-readable info)")

print(f"\n🚀 To use the saved model later:")
print(f"   predictor = CrimeClassificationPredictor('{saved_model_dir}')")
print(f"   prediction = predictor.predict_state_from_crime_pattern('your_crime_text')")

🚀 Enhanced Crime Classification Pipeline...
📊 Improvements: Data cleaning, Feature selection, Model comparison, Model Saving
📂 Loading CSV files from: C:\Users\ssk08\OneDrive\Desktop\NLP Project\Model 2\crime
📄 Found 57 CSV files
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2001_2012.csv: 9017 rows
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2013.csv: 823 rows
   ✅ Loaded 01_District_wise_crimes_committed_IPC_2014.csv: 838 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2001_2012.csv: 9018 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2013.csv: 823 rows
   ✅ Loaded 02_01_District_wise_crimes_committed_against_SC_2014.csv: 837 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2001_2012.csv: 9018 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2013.csv: 823 rows
   ✅ Loaded 02_District_wise_crimes_committed_against_ST_2014.csv: 837 rows
   ✅ Loaded 03_District_wise_crimes_committed_against_children_2001_2012.csv: 90