In [9]:
import pandas as pd

# Path to the combined FARS extract, relative to the notebook's working directory.
file_path = 'FARS_all_years_combined.csv'

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output echoes the
# read_csv line the way a warning does, which looks like pandas' mixed-dtype
# DtypeWarning - confirm the warning is gone after this change.
accident_data = pd.read_csv(file_path, encoding='utf-8', low_memory=False)

# Display basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Information:")
accident_data.info()

# Display the first few rows
print("First 5 rows of the dataset:")
print(accident_data.head())


  accident_data = pd.read_csv(file_path, encoding='utf-8')


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375457 entries, 0 to 375456
Data columns (total 92 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   STATE         375457 non-null  int64  
 1   ST_CASE       375457 non-null  int64  
 2   VE_TOTAL      375457 non-null  int64  
 3   VE_FORMS      375457 non-null  int64  
 4   PVH_INVL      375457 non-null  int64  
 5   PEDS          375457 non-null  int64  
 6   PERNOTMVIT    375457 non-null  int64  
 7   PERMVIT       375457 non-null  int64  
 8   PERSONS       375457 non-null  int64  
 9   COUNTY        375457 non-null  int64  
 10  CITY          375457 non-null  int64  
 11  DAY           375457 non-null  int64  
 12  MONTH         375457 non-null  int64  
 13  YEAR          375457 non-null  int64  
 14  DAY_WEEK      375457 non-null  int64  
 15  HOUR          375457 non-null  int64  
 16  MINUTE        375457 non-null  int64  
 17  NHS           375457 non-nu

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Fit a linear regression that predicts FATALS from location, hour and weather,
# report its test-set error, and persist both the cleaned data and the model.

# Step 2: Preprocessing - drop rows with missing values in the relevant columns.
# dropna(subset=...) keeps every column but removes rows that lack any of the
# modelling fields. The result gets its own name so the raw `accident_data`
# frame is left untouched and this cell can be re-run safely.
columns_to_use = ['LATITUDE', 'LONGITUD', 'HOUR', 'WEATHERNAME', 'FATALS']
fatality_data = accident_data.dropna(subset=columns_to_use)
# NOTE(review): this only removes NaN. If the dataset encodes "unknown"
# coordinates or hours with reserved numeric codes, those rows are still
# present here - confirm against the data dictionary.

# Step 3: Define features (X) and target variable (y)
categorical_features = ['WEATHERNAME']
numerical_features = ['LATITUDE', 'LONGITUD', 'HOUR']
X = fatality_data[numerical_features + categorical_features]
y = fatality_data['FATALS']

# Step 4: Preprocessing - scale numeric columns, one-hot encode the weather label.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Step 5: Save the cleaned dataset (the frame with incomplete rows removed)
fatality_data.to_csv("cleaned_data", index=False)

# Step 6: Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create a pipeline with preprocessing and regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Step 8: Train the model
pipeline.fit(X_train, y_train)

# Step 9: Make predictions on the held-out rows
y_pred = pipeline.predict(X_test)

# Step 10: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Predicted Number of Fatalities:")
print(y_pred)
print("Model Evaluation:")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Step 11: Save the trained model
import joblib
model_file_path = 'regression_model.pkl'
joblib.dump(pipeline, model_file_path)
print(f"Trained model saved to {model_file_path}")


Predicted Number of Fatalities:
[1.08555444 1.08691475 1.0899397  ... 1.08500317 1.08874924 1.09597315]
Model Evaluation:
Mean Squared Error: 0.12565506690348813
R-squared: 0.0005103918652263895
Trained model saved to regression_model.pkl


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Fit two linear regressions that predict LATITUDE and LONGITUD from hour,
# weather and fatality count, evaluate them, save them, and run one example
# prediction.

# Step 2: Preprocessing - keep the modelling columns and drop incomplete rows.
# The subset is stored under a new name: overwriting `accident_data` would
# discard the raw frame and change what earlier cells see when re-run.
columns_to_use = ['LATITUDE', 'LONGITUD', 'HOUR', 'WEATHERNAME', 'FATALS']
location_data = accident_data[columns_to_use].dropna()
# NOTE(review): only NaN rows are removed. If unknown coordinates/hours are
# stored as reserved numeric codes they remain in the targets and would
# inflate the error (the recorded longitude MSE is very large) - confirm
# against the data dictionary.

# Step 3: Define features (X) and target variables (y_latitude, y_longitude)
categorical_features = ['WEATHERNAME']
numerical_features = ['HOUR', 'FATALS']
X = location_data[['HOUR', 'WEATHERNAME', 'FATALS']]
y_latitude = location_data['LATITUDE']
y_longitude = location_data['LONGITUD']


def build_location_pipeline():
    """Return a new, unfitted preprocessing + LinearRegression pipeline.

    Each call creates its own ColumnTransformer. Pipeline does not copy the
    steps it is given, so passing one transformer object to two pipelines
    would make them share fitted state; building a fresh one per pipeline
    keeps the latitude and longitude models independent.
    """
    # Step 4: scale the numeric columns and one-hot encode the weather label;
    # categories unseen during fit are encoded as all zeros instead of raising.
    column_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ]
    )
    return Pipeline(steps=[
        ('preprocessor', column_preprocessor),
        ('regressor', LinearRegression())
    ])


# Step 5: Split data into training and testing sets.
# One call splits the features and both targets with the same row assignment,
# so the two targets cannot drift out of alignment with X_train / X_test.
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(
    X, y_latitude, y_longitude, test_size=0.2, random_state=42
)

# Step 6: Create pipelines for both latitude and longitude
latitude_pipeline = build_location_pipeline()
longitude_pipeline = build_location_pipeline()

# Step 7: Train the models
latitude_pipeline.fit(X_train, y_lat_train)
longitude_pipeline.fit(X_train, y_lon_train)

# Step 8: Make predictions on the held-out rows
y_lat_pred = latitude_pipeline.predict(X_test)
y_lon_pred = longitude_pipeline.predict(X_test)

# Step 9: Evaluate the models
lat_mse = mean_squared_error(y_lat_test, y_lat_pred)
lat_r2 = r2_score(y_lat_test, y_lat_pred)
lon_mse = mean_squared_error(y_lon_test, y_lon_pred)
lon_r2 = r2_score(y_lon_test, y_lon_pred)

print("Latitude Model Evaluation:")
print(f"Mean Squared Error: {lat_mse}")
print(f"R-squared: {lat_r2}")

print("\nLongitude Model Evaluation:")
print(f"Mean Squared Error: {lon_mse}")
print(f"R-squared: {lon_r2}")

# Step 10: Save the trained models (optional)
import joblib
joblib.dump(latitude_pipeline, 'latitude_model.pkl')
joblib.dump(longitude_pipeline, 'longitude_model.pkl')
print("Trained models saved.")

# Step 11: Predict latitude and longitude for new data.
# The frame must carry the same column names the pipelines were fitted on.
new_data = pd.DataFrame({
    'HOUR': [15],         # Example hour
    'WEATHERNAME': ['Clear'],  # Example weather
    'FATALS': [2]         # Example fatalities
})

predicted_latitude = latitude_pipeline.predict(new_data)
predicted_longitude = longitude_pipeline.predict(new_data)

print("\nPredicted Coordinates:")
print(f"Latitude: {predicted_latitude[0]}")
print(f"Longitude: {predicted_longitude[0]}")


Latitude Model Evaluation:
Mean Squared Error: 39.02481202100808
R-squared: 0.03366108061857176

Longitude Model Evaluation:
Mean Squared Error: 4561.010494950631
R-squared: 0.023418425953329214
Trained models saved.

Predicted Coordinates:
Latitude: 36.3331917801191
Longitude: -90.46239571762254
