In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier

In [2]:
csv_file_path = "/home/pbhati12/New_kaggle_dataset.csv"
df = pd.read_csv(csv_file_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 21 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   ID                 object 
 2   Source             object 
 3   Severity           int64  
 4   Start_Time         object 
 5   End_Time           object 
 6   Distance(mi)       float64
 7   Description        object 
 8   City               object 
 9   State              object 
 10  Temperature(F)     float64
 11  Visibility(mi)     float64
 12  Wind_Speed(mph)    float64
 13  Precipitation(in)  float64
 14  Weather_Condition  object 
 15  Crossing           bool   
 16  Junction           bool   
 17  Station            bool   
 18  Stop               bool   
 19  Traffic_Signal     bool   
 20  Sunrise_Sunset     object 
dtypes: bool(5), float64(5), int64(2), object(9)
memory usage: 980.3+ MB


In [4]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.shape

(7728394, 20)

In [6]:
df.drop(columns=['Source'], inplace=True)

In [7]:
df = df.dropna(subset=['City'])

In [8]:
# mode imputation for precipitation

df['Precipitation(in)'].fillna(df['Precipitation(in)'].mode()[0], inplace=True)

In [9]:
## median imputation for wind speed

df['Wind_Speed(mph)'].fillna(df['Wind_Speed(mph)'].median(), inplace=True)

In [10]:
#median imputation for visibility

df.loc[:, 'Visibility(mi)'] = df['Visibility(mi)'].fillna(df['Visibility(mi)'].median())

In [11]:
# mean imputation for temperature
df.loc[:, 'Temperature(F)'] = df['Temperature(F)'].fillna(df['Temperature(F)'].mean())

In [12]:
## imputing Weather_condition with 'Unknown' values fo NA

df.loc[:, 'Weather_Condition'] = df['Weather_Condition'].fillna('Unknown')

In [13]:
# Removing rows with null values of sunrise_sunset

df = df.dropna(subset=['Sunrise_Sunset'])

In [14]:
# Removing the nano-seconds that are present in some of the rows
df['Start_Time'] = df['Start_Time'].str.split('.').str[0]

# Convert the 'time_column' to datetime format
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors="raise")

In [15]:
# Adding new time features
df['Year'] = df['Start_Time'].dt.year
df['Month'] = df['Start_Time'].dt.month
df['Day'] = df['Start_Time'].dt.day
df['Day_Name'] = df['Start_Time'].dt.day_name()
df['Is_Weekend'] = df['Start_Time'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)
df['Is_Rush_Hour'] = df['Start_Time'].dt.hour.apply(lambda x: 1 if ((6 <= x < 9) or (15 <= x < 18)) else 0)

In [16]:
# Creating 3 new, bad weather features
df['is_snow'] = df['Weather_Condition'].fillna('').apply(lambda x: 1 if 'snow' in x.lower() else 0)
df['is_rain'] = df['Weather_Condition'].fillna('').apply(lambda x: 1 if 'rain' in x.lower() else 0)
df['is_fog'] = df['Weather_Condition'].fillna('').apply(lambda x: 1 if 'fog' in x.lower() else 0)

In [17]:
# Add "isNight" column based on "time_of_day" column
df['isNight'] = df['Sunrise_Sunset'].apply(lambda x: 1 if x == 'Night' else 0)

In [18]:
# Create a dictionary mapping day names to numbers
day_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}

# Add a new column "weekday" based on the "Day_Name" mapping
df['weekday'] = df['Day_Name'].map(day_mapping)

In [19]:
df['Severity'] = df['Severity'].astype('int32')
df['Distance(mi)'] = df['Distance(mi)'].astype('float32')
df['Temperature(F)'] = df['Temperature(F)'].astype('float32')
df['Visibility(mi)'] = df['Visibility(mi)'].astype('float32')
df['Wind_Speed(mph)'] = df['Wind_Speed(mph)'].astype('float32')
df['Precipitation(in)'] = df['Precipitation(in)'].astype('float32')
df['weekday'] = df['weekday'].astype('int32')
df['isNight'] = df['isNight'].astype('int32')
df['is_fog'] = df['is_fog'].astype('int32')
df['is_rain'] = df['is_rain'].astype('int32')
df['is_snow'] = df['is_snow'].astype('int32')
df['Is_Rush_Hour'] = df['Is_Rush_Hour'].astype('int32')
df['Is_Weekend'] = df['Is_Weekend'].astype('int32')

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Fit and transform the 'Category' column
df['State_encoded'] = le.fit_transform(df['State'])

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7705148 entries, 0 to 7728393
Data columns (total 31 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ID                 object        
 1   Severity           int32         
 2   Start_Time         datetime64[ns]
 3   End_Time           object        
 4   Distance(mi)       float32       
 5   Description        object        
 6   City               object        
 7   State              object        
 8   Temperature(F)     float32       
 9   Visibility(mi)     float32       
 10  Wind_Speed(mph)    float32       
 11  Precipitation(in)  float32       
 12  Weather_Condition  object        
 13  Crossing           bool          
 14  Junction           bool          
 15  Station            bool          
 16  Stop               bool          
 17  Traffic_Signal     bool          
 18  Sunrise_Sunset     object        
 19  Year               int32         
 20  Month              int32     

In [22]:
# random forest

X = df.drop(columns=['ID' , 'Severity', 'Start_Time', 'End_Time', 'Description', 'City', 'State', 
                     'Day_Name', 'Weather_Condition', 'Sunrise_Sunset'])
y = df['Severity']

In [23]:
y = y-1

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
model = XGBClassifier(n_estimators=200,              # Number of boosting rounds
                      max_depth=6,                   # Maximum tree depth
                      learning_rate=0.1,             # Learning rate (shrinkage)
                      scale_pos_weight=1,            # Adjusts for imbalanced classes
                      eval_metric='logloss',         # Loss function for binary classification
                      use_label_encoder=False,       # Avoids unnecessary label encoding warning
                      verbosity=1)                   

In [26]:
model.fit(X_train, y_train)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



In [27]:
y_pred = model.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, classification_report


In [29]:
accuracy_score = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [30]:
print(accuracy_score)
print(report)

0.8287392198724229
              precision    recall  f1-score   support

           0       0.64      0.44      0.52     13495
           1       0.86      0.95      0.90   1226877
           2       0.61      0.38      0.47    259955
           3       0.54      0.08      0.14     40703

    accuracy                           0.83   1541030
   macro avg       0.66      0.46      0.51   1541030
weighted avg       0.81      0.83      0.81   1541030

