In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the raw dataset CSV on the local machine.
csv_file_path = "/home/pbhati12/New_kaggle_dataset.csv"

# Read the whole file into a single in-memory DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [3]:
# Print the column names, dtypes and memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 21 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   ID                 object 
 2   Source             object 
 3   Severity           int64  
 4   Start_Time         object 
 5   End_Time           object 
 6   Distance(mi)       float64
 7   Description        object 
 8   City               object 
 9   State              object 
 10  Temperature(F)     float64
 11  Visibility(mi)     float64
 12  Wind_Speed(mph)    float64
 13  Precipitation(in)  float64
 14  Weather_Condition  object 
 15  Crossing           bool   
 16  Junction           bool   
 17  Station            bool   
 18  Stop               bool   
 19  Traffic_Signal     bool   
 20  Sunrise_Sunset     object 
dtypes: bool(5), float64(5), int64(2), object(9)
memory usage: 980.3+ MB


In [4]:
# Remove every column whose name begins with "Unnamed"
# (presumably an index column written out by an earlier export -- verify).
unnamed_columns = [name for name in df.columns if name.startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

In [5]:
# Show (rows, columns) after the "Unnamed" column has been removed.
df.shape

(7728394, 20)

In [6]:
# Drop the 'Source' column.
# Rebinding `df` (rather than mutating with inplace=True) and tolerating an
# already-missing column keeps this cell safe to re-run: the original raised
# KeyError on a second execution because 'Source' was already gone.
df = df.drop(columns=['Source'], errors='ignore')

In [7]:
# Keep only the rows that have a City value.
city_present = df['City'].notna()
df = df[city_present]

In [8]:
# Mode imputation for precipitation.
# The filled column is assigned back through df.loc (the same form the
# visibility/temperature/weather cells use) instead of calling
# fillna(inplace=True) on the `df[...]` selection: that chained in-place call
# acts on an intermediate object and leaves `df` unchanged under pandas
# Copy-on-Write.
# NOTE: the mode is computed over all rows, i.e. before the train/test split.
precipitation_mode = df['Precipitation(in)'].mode()
if not precipitation_mode.empty:  # mode() is empty when every value is NaN
    df.loc[:, 'Precipitation(in)'] = df['Precipitation(in)'].fillna(precipitation_mode.iloc[0])

In [9]:
# Median imputation for wind speed.
# The filled column is assigned back through df.loc (the same form the
# visibility/temperature/weather cells use) instead of calling
# fillna(inplace=True) on the `df[...]` selection: that chained in-place call
# acts on an intermediate object and leaves `df` unchanged under pandas
# Copy-on-Write.
# NOTE: the median is computed over all rows, i.e. before the train/test split.
wind_speed_median = df['Wind_Speed(mph)'].median()
df.loc[:, 'Wind_Speed(mph)'] = df['Wind_Speed(mph)'].fillna(wind_speed_median)

In [10]:
# Median imputation for visibility: missing readings take the column median.
visibility_median = df['Visibility(mi)'].median()
df.loc[:, 'Visibility(mi)'] = df['Visibility(mi)'].fillna(visibility_median)

In [11]:
# Mean imputation for temperature: missing readings take the column mean.
temperature_mean = df['Temperature(F)'].mean()
df.loc[:, 'Temperature(F)'] = df['Temperature(F)'].fillna(temperature_mean)

In [12]:
# Missing Weather_Condition entries are replaced with the label 'Unknown'.
weather_filled = df['Weather_Condition'].fillna(value='Unknown')
df.loc[:, 'Weather_Condition'] = weather_filled

In [13]:
# Discard the rows that have no Sunrise_Sunset value.
has_sunrise_sunset = df['Sunrise_Sunset'].notna()
df = df[has_sunrise_sunset]

In [14]:
# Keep only the text before the first '.', which discards the fractional-second
# suffix that some rows carry.
df['Start_Time'] = df['Start_Time'].str.split('.', n=1).str.get(0)

# Parse the cleaned strings into datetimes; an unparseable value raises.
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors="raise")

In [15]:
# Calendar features derived from Start_Time.
start_time = df['Start_Time'].dt
df['Year'] = start_time.year
df['Month'] = start_time.month
df['Day'] = start_time.day
df['Day_Name'] = start_time.day_name()

# Binary flags.  Vectorized comparisons replace the original per-row Python
# lambdas (same 0/1 values, int64 dtype) and avoid millions of Python calls.
day_of_week = start_time.dayofweek  # Monday=0 ... Sunday=6
hour = start_time.hour

# Saturday (5) and Sunday (6) count as weekend.
df['Is_Weekend'] = (day_of_week >= 5).astype('int64')

# Rush hour is defined as 06:00-08:59 or 15:00-17:59.
morning_rush = (hour >= 6) & (hour < 9)
evening_rush = (hour >= 15) & (hour < 18)
df['Is_Rush_Hour'] = (morning_rush | evening_rush).astype('int64')

In [16]:
# Three bad-weather indicator features: is_snow, is_rain, is_fog.
# Each is 1 when the keyword occurs (case-insensitively) anywhere in
# Weather_Condition and 0 otherwise.  One loop over a vectorized substring
# search replaces three copy-pasted per-row lambdas.
weather_text = df['Weather_Condition'].fillna('')
for keyword in ('snow', 'rain', 'fog'):
    # regex=False: the keyword is matched as a literal substring.
    keyword_present = weather_text.str.contains(keyword, case=False, regex=False)
    df[f'is_{keyword}'] = keyword_present.astype('int64')

In [17]:
# Add the "isNight" column from "Sunrise_Sunset": 1 where the value is exactly
# 'Night', 0 for anything else.  A vectorized comparison replaces the per-row
# lambda and yields the same 0/1 values.
df['isNight'] = (df['Sunrise_Sunset'] == 'Night').astype('int64')

In [18]:
# Day name -> number lookup, Monday=0 through Sunday=6.
day_mapping = {
    day_name: day_number
    for day_number, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

# "weekday" holds the numeric code that corresponds to each row's "Day_Name".
df['weekday'] = df['Day_Name'].map(day_mapping)

In [19]:
# Downcast numeric columns to 32-bit types, one column at a time, in the order
# listed here.
dtype_targets = {
    'Severity': 'int32',
    'Distance(mi)': 'float32',
    'Temperature(F)': 'float32',
    'Visibility(mi)': 'float32',
    'Wind_Speed(mph)': 'float32',
    'Precipitation(in)': 'float32',
    'weekday': 'int32',
    'isNight': 'int32',
    'is_fog': 'int32',
    'is_rain': 'int32',
    'is_snow': 'int32',
    'Is_Rush_Hour': 'int32',
    'Is_Weekend': 'int32',
}
for column_name, target_dtype in dtype_targets.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [20]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer code for each distinct 'State' value ...
le = LabelEncoder().fit(df['State'])

# ... and store the encoded values alongside the original column.
df['State_encoded'] = le.transform(df['State'])

In [21]:
# Re-inspect the frame after cleaning and feature engineering (new columns, narrower dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7705148 entries, 0 to 7728393
Data columns (total 31 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ID                 object        
 1   Severity           int32         
 2   Start_Time         datetime64[ns]
 3   End_Time           object        
 4   Distance(mi)       float32       
 5   Description        object        
 6   City               object        
 7   State              object        
 8   Temperature(F)     float32       
 9   Visibility(mi)     float32       
 10  Wind_Speed(mph)    float32       
 11  Precipitation(in)  float32       
 12  Weather_Condition  object        
 13  Crossing           bool          
 14  Junction           bool          
 15  Station            bool          
 16  Stop               bool          
 17  Traffic_Signal     bool          
 18  Sunrise_Sunset     object        
 19  Year               int32         
 20  Month              int32     

In [22]:
# Random forest inputs.
# X is every column except the target, the timestamps and the text columns
# listed below; y is the Severity label.
excluded_columns = [
    'ID', 'Severity', 'Start_Time', 'End_Time', 'Description', 'City', 'State',
    'Day_Name', 'Weather_Condition', 'Sunrise_Sunset',
]
X = df.drop(columns=excluded_columns)
y = df['Severity']

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.  The split is not stratified on the label.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# Reduce the scaled features with PCA before fitting the classifier.
pca = PCA(n_components=0.95)  # Adjust this to control the amount of variance retained
# Fit the projection on the scaled training data only ...
X_train_pca = pca.fit_transform(X_train_scaled)
# ... and apply that same fitted projection to the scaled test data.
X_test_pca = pca.transform(X_test_scaled)


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
# 200-tree random forest trained on the PCA-projected training data,
# using 18 parallel workers and a fixed seed; verbose=1 prints progress.
rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=18,
    random_state=42,
    verbose=1,
)
rf.fit(X_train_pca, y_train)

[Parallel(n_jobs=18)]: Using backend ThreadingBackend with 18 concurrent workers.
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed: 17.6min
[Parallel(n_jobs=18)]: Done 200 out of 200 | elapsed: 21.7min finished


In [30]:
# Predict Severity for the held-out rows (already projected by the fitted PCA).
y_pred = rf.predict(X=X_test_pca)

[Parallel(n_jobs=18)]: Using backend ThreadingBackend with 18 concurrent workers.
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:   20.9s
[Parallel(n_jobs=18)]: Done 200 out of 200 | elapsed:   24.9s finished


In [31]:
# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1 on the test split.
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

Accuracy: 0.8146771964205759
              precision    recall  f1-score   support

           1       0.67      0.29      0.41     13495
           2       0.85      0.94      0.89   1226877
           3       0.57      0.34      0.42    259955
           4       0.47      0.20      0.28     40703

    accuracy                           0.81   1541030
   macro avg       0.64      0.44      0.50   1541030
weighted avg       0.79      0.81      0.79   1541030



In [32]:
# Tabulate the fitted PCA components: one row per principal component,
# one column per original feature of X.
loadings = pca.components_
loading_df = pd.DataFrame(data=loadings, columns=X.columns)


In [33]:
# Display the component-by-feature loading table.
loading_df

Unnamed: 0,Distance(mi),Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),Crossing,Junction,Station,Stop,Traffic_Signal,...,Month,Day,Is_Weekend,Is_Rush_Hour,is_snow,is_rain,is_fog,isNight,weekday,State_encoded
0,0.174824,-0.328243,-0.302858,-0.081973,0.043674,-0.303863,0.086782,-0.140713,-0.048621,-0.313697,...,-0.077713,0.008039,0.377809,-0.182126,0.258207,0.109291,0.149147,0.283574,0.357284,0.01055
1,0.023434,-0.317444,-0.458821,0.050982,0.110364,0.034564,-0.01356,-0.006434,-0.007644,0.06478,...,-0.077939,-0.0072,-0.445681,0.164866,0.326269,0.218168,0.222534,0.01864,-0.446251,0.170173
2,0.168025,0.116636,0.182754,-0.002596,-0.061737,-0.536499,0.202212,-0.281317,-0.097729,-0.513458,...,0.022412,-0.008178,-0.286131,0.119585,-0.111298,-0.095082,-0.10951,-0.126527,-0.29232,-0.081339
3,0.062581,0.194275,-0.152294,0.464122,0.182582,-0.036616,0.016012,-0.013164,-0.03087,-0.042479,...,-0.223941,0.03574,0.198344,0.329931,0.098706,0.268767,-0.077191,-0.580079,0.224421,0.051593
4,-0.190235,0.271795,-0.251466,-0.169728,0.295116,-0.106239,0.105953,-0.0866,-0.034699,-0.06199,...,0.477551,-0.021286,0.060431,-0.046138,-0.29798,0.393326,0.268558,-0.004287,0.065495,-0.051829
5,-0.071284,-0.172616,0.169035,0.351555,0.176221,-0.04559,0.158248,-0.102649,-0.107592,0.039497,...,0.08507,0.08346,-0.013198,-0.311281,0.139408,0.207513,-0.565838,0.199269,-0.012281,0.252595
6,0.129968,0.0968,0.032706,-0.053325,0.421954,0.075299,-0.059859,0.219876,0.159073,-0.067683,...,-0.278296,0.030475,-0.135532,-0.249038,-0.229877,0.383574,-0.189688,0.154722,-0.160048,-0.323064
7,0.163874,-0.038982,0.120734,-0.166429,0.136116,-0.042885,-0.57106,-0.244149,0.13484,-0.020234,...,0.143419,-0.143081,0.014267,0.118476,-0.181738,0.116153,-0.1026,-0.002563,0.018338,0.594531
8,0.196593,0.022854,0.030958,-0.135591,0.069084,0.019886,0.039019,-0.062308,-0.893128,0.295504,...,-0.074669,-0.043382,-0.007202,0.01881,-0.117394,0.056651,0.017093,0.016079,-0.006735,0.027086
9,0.134079,0.009176,-0.013531,0.048001,0.038137,0.037152,0.11969,0.124077,0.030092,-0.005707,...,0.119051,-0.945835,0.003761,-0.000508,0.126099,-0.014263,-0.106847,-0.011028,0.01492,-0.093639


In [34]:
# How many of the largest-magnitude loadings to report per component.
n_top_features = 5

# For each principal component (row of loading_df, labelled PC1, PC2, ...),
# list the feature names with the largest absolute loadings.
top_features = {
    f'PC{component_number}': component_loadings.abs().nlargest(n_top_features).index.tolist()
    for component_number, (_, component_loadings) in enumerate(loading_df.iterrows(), start=1)
}

print("Top contributing features for each principal component:")
print(top_features)


Top contributing features for each principal component:
{'PC1': ['Is_Weekend', 'weekday', 'Temperature(F)', 'Traffic_Signal', 'Crossing'], 'PC2': ['Visibility(mi)', 'weekday', 'Is_Weekend', 'is_snow', 'Temperature(F)'], 'PC3': ['Crossing', 'Traffic_Signal', 'weekday', 'Is_Weekend', 'Station'], 'PC4': ['isNight', 'Wind_Speed(mph)', 'Is_Rush_Hour', 'is_rain', 'weekday'], 'PC5': ['Month', 'is_rain', 'Year', 'is_snow', 'Precipitation(in)'], 'PC6': ['is_fog', 'Year', 'Wind_Speed(mph)', 'Is_Rush_Hour', 'State_encoded'], 'PC7': ['Precipitation(in)', 'Year', 'is_rain', 'State_encoded', 'Month'], 'PC8': ['State_encoded', 'Junction', 'Station', 'Year', 'is_snow'], 'PC9': ['Stop', 'Traffic_Signal', 'Distance(mi)', 'Year', 'Wind_Speed(mph)'], 'PC10': ['Day', 'Distance(mi)', 'is_snow', 'Station', 'Junction'], 'PC11': ['Distance(mi)', 'Month', 'is_snow', 'Day', 'Station'], 'PC12': ['Junction', 'Is_Rush_Hour', 'State_encoded', 'Distance(mi)', 'Wind_Speed(mph)'], 'PC13': ['Is_Rush_Hour', 'Precipitatio