In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Collects the cleaned frame produced for every chunk read from the file
processed_data = []

# Where the raw accident records live and how many rows each read pulls in
file_path = '/content/drive/MyDrive/US_Accidents_March23.csv'  # Replace with your dataset file path
chunk_size = 1_000

# Data cleaning and feature engineering for a single chunk of the raw file
def process_chunk(chunk):
    """Clean one raw chunk and return it as a one-hot encoded feature frame.

    The steps are: drop the ID/coordinate columns, drop every row that has a
    missing value, replace ``Start_Time`` with ``Day_of_Week`` and ``Hour``,
    and one-hot encode the remaining non-numeric columns.

    Args:
        chunk: DataFrame holding a slice of the raw data. It must contain a
            ``Start_Time`` column. The caller's frame is not modified.

    Returns:
        A new DataFrame without ``Start_Time`` and with the added
        ``Day_of_Week`` and ``Hour`` columns. Rows with missing values or an
        unparseable ``Start_Time`` are omitted.

    Raises:
        KeyError: If ``chunk`` has no ``Start_Time`` column.
    """
    # Drop irrelevant columns (customize as needed), then rows with gaps
    columns_to_drop = ['ID', 'Start_Lat', 'Start_Lng']  # Example columns
    chunk = chunk.drop(columns=columns_to_drop, errors='ignore').dropna()

    raw_start = chunk['Start_Time']
    start_time = pd.to_datetime(raw_start, format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # errors='coerce' never raises, so a try/except fallback cannot fire.
    # Instead, re-parse only the values the strict format rejected (for
    # example timestamps carrying fractional seconds) without a fixed format.
    rejected = start_time.isna()
    if rejected.any():
        lenient = pd.to_datetime(raw_start.where(rejected), errors='coerce')
        start_time = start_time.where(~rejected, lenient)

    # Keep only rows whose timestamp could be parsed by either attempt
    parsed = start_time.notna()
    start_time = start_time[parsed]

    # assign() builds a new frame, so no value is ever written into a frame
    # derived from filtering (the source of SettingWithCopyWarning).
    chunk = chunk.loc[parsed].drop(columns=['Start_Time']).assign(
        Day_of_Week=start_time.dt.dayofweek,
        Hour=start_time.dt.hour,
    )

    # NOTE: encoding is done per chunk, so the dummy columns produced depend
    # on which categories occur in this particular chunk.
    return pd.get_dummies(chunk, drop_first=True)


# Read the dataset in chunks so only `chunk_size` raw rows are parsed at a time
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    cleaned_chunk = process_chunk(chunk)
    # A chunk whose rows were all dropped contributes nothing to training
    if not cleaned_chunk.empty:
        processed_data.append(cleaned_chunk)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not processed_data:
    raise ValueError(f"No usable rows were found in {file_path!r}")

# Concatenate all processed chunks
data = pd.concat(processed_data, ignore_index=True)

# Define target and features
target = 'Severity'  # Replace with your target column
# process_chunk one-hot encodes every chunk on its own, so a dummy column only
# exists in chunks where that category occurred and concat leaves NaN in all
# the other rows. Rows with genuinely missing values were already removed, so
# any remaining NaN means "category absent" and is encoded as 0.
# float32 is the dtype the forest's fit() converts X to internally.
X = data.drop(columns=[target]).astype('float32').fillna(0)
y = data[target]

# Split the data into train (75%), validation (15%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_pred_val = model.predict(X_val)
print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))

# Test the model
y_pred_test = model.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_pred_test))

# Save the model
joblib.dump(model, 'accident_severity_model.pkl')

# Print a fixed list of road-safety recommendations
def generate_recommendations():
    """Print the recommendations header followed by three fixed suggestions."""
    lines = (
        "\nRecommendations:",
        "1. Implement stricter traffic rules during high-risk hours.",
        "2. Improve infrastructure in accident-prone areas.",
        "3. Educate drivers about weather-related risks.",
    )
    for line in lines:
        print(line)

generate_recommendations()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Start_Time'] = pd.to_datetime(chunk['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Start_Time'] = pd.to_datetime(chunk['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Holds one cleaned frame per chunk until they are combined
processed_data = []

# Source CSV and the number of rows parsed per read
file_path = '/content/drive/MyDrive/US_Accidents_March23.csv'  # Replace with your dataset file path
chunk_size = 1_000

# Data cleaning and feature engineering for a single chunk of the raw file
def process_chunk(chunk):
    """Clean one raw chunk and return it as a one-hot encoded feature frame.

    Drops the ID/coordinate columns and every row with a missing value,
    replaces ``Start_Time`` with ``Day_of_Week`` and ``Hour``, then one-hot
    encodes the remaining non-numeric columns.

    Args:
        chunk: DataFrame holding a slice of the raw data. It must contain a
            ``Start_Time`` column. The caller's frame is not modified.

    Returns:
        A new DataFrame without ``Start_Time`` and with the added
        ``Day_of_Week`` and ``Hour`` columns. Rows with missing values or an
        unparseable ``Start_Time`` are omitted.

    Raises:
        KeyError: If ``chunk`` has no ``Start_Time`` column.
    """
    # Drop irrelevant columns (customize as needed), then rows with gaps
    columns_to_drop = ['ID', 'Start_Lat', 'Start_Lng']  # Example columns
    chunk = chunk.drop(columns=columns_to_drop, errors='ignore').dropna()

    raw_start = chunk['Start_Time']
    start_time = pd.to_datetime(raw_start, format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # The strict format turns every differently written timestamp (for
    # example one with fractional seconds) into NaT. Give just those values a
    # second, format-free parse before treating them as invalid.
    rejected = start_time.isna()
    if rejected.any():
        lenient = pd.to_datetime(raw_start.where(rejected), errors='coerce')
        start_time = start_time.where(~rejected, lenient)

    # Drop rows where datetime conversion failed in both attempts
    parsed = start_time.notna()
    start_time = start_time[parsed]

    # Extract day of the week and hour. assign() returns a new frame, so
    # nothing is written into a filtered frame and pandas has no reason to
    # emit SettingWithCopyWarning.
    chunk = chunk.loc[parsed].drop(columns=['Start_Time']).assign(
        Day_of_Week=start_time.dt.dayofweek,
        Hour=start_time.dt.hour,
    )

    # NOTE: encoding is done per chunk, so the dummy columns produced depend
    # on which categories occur in this particular chunk.
    return pd.get_dummies(chunk, drop_first=True)


# Read the dataset in chunks so only `chunk_size` raw rows are parsed at a time
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    cleaned_chunk = process_chunk(chunk)
    # A chunk whose rows were all dropped contributes nothing to training
    if not cleaned_chunk.empty:
        processed_data.append(cleaned_chunk)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not processed_data:
    raise ValueError(f"No usable rows were found in {file_path!r}")

# Concatenate all processed chunks
data = pd.concat(processed_data, ignore_index=True)

# Define target and features
target = 'Severity'  # Replace with your target column
# process_chunk one-hot encodes every chunk on its own, so a dummy column only
# exists in chunks where that category occurred and concat leaves NaN in all
# the other rows. Rows with genuinely missing values were already removed, so
# any remaining NaN means "category absent" and is encoded as 0.
# float32 is the dtype the forest's fit() converts X to internally.
X = data.drop(columns=[target]).astype('float32').fillna(0)
y = data[target]

# Split the data into train (75%), validation (15%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_pred_val = model.predict(X_val)
print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))

# Test the model
y_pred_test = model.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_pred_test))

# Save the model
joblib.dump(model, 'accident_severity_model.pkl')

# Emit the static recommendation text shown after the model reports
def generate_recommendations():
    """Write the recommendations header and three fixed suggestions to stdout."""
    suggestions = [
        "1. Implement stricter traffic rules during high-risk hours.",
        "2. Improve infrastructure in accident-prone areas.",
        "3. Educate drivers about weather-related risks.",
    ]
    print("\nRecommendations:", *suggestions, sep="\n")

generate_recommendations()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Start_Time'] = pd.to_datetime(chunk['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Start_Time'] = pd.to_datetime(chunk['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 