In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV
import pandas as pd
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
df=pd.read_csv('https://raw.githubusercontent.com/santhoshkumaroff/MachineLearningDatasets/refs/heads/main/dataset_traffic_accident_prediction1.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Weather               798 non-null    object 
 1   Road_Type             798 non-null    object 
 2   Time_of_Day           798 non-null    object 
 3   Traffic_Density       798 non-null    float64
 4   Speed_Limit           798 non-null    float64
 5   Number_of_Vehicles    798 non-null    float64
 6   Driver_Alcohol        798 non-null    float64
 7   Accident_Severity     798 non-null    object 
 8   Road_Condition        798 non-null    object 
 9   Vehicle_Type          798 non-null    object 
 10  Driver_Age            798 non-null    float64
 11  Driver_Experience     798 non-null    float64
 12  Road_Light_Condition  798 non-null    object 
 13  Accident              798 non-null    float64
dtypes: float64(7), object(7)
memory usage: 92.0+ KB


In [4]:
df.head(20)

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0
5,Clear,Rural Road,Night,0.0,120.0,3.0,0.0,Moderate,,Truck,49.0,43.0,Artificial Light,0.0
6,Foggy,Highway,Afternoon,0.0,60.0,4.0,0.0,Low,Dry,Truck,27.0,26.0,Daylight,1.0
7,Rainy,City Road,Afternoon,0.0,60.0,4.0,0.0,Low,Dry,Car,29.0,22.0,Artificial Light,1.0
8,Stormy,Highway,Morning,1.0,60.0,2.0,0.0,High,Icy,Car,38.0,29.0,Artificial Light,0.0
9,Rainy,City Road,Afternoon,2.0,30.0,2.0,0.0,Low,Dry,Truck,50.0,48.0,Artificial Light,0.0


In [5]:
df.isnull().sum()

Weather                 42
Road_Type               42
Time_of_Day             42
Traffic_Density         42
Speed_Limit             42
Number_of_Vehicles      42
Driver_Alcohol          42
Accident_Severity       42
Road_Condition          42
Vehicle_Type            42
Driver_Age              42
Driver_Experience       42
Road_Light_Condition    42
Accident                42
dtype: int64

In [6]:

df_clean = df[df['Accident'].notnull()]


X=df_clean.drop('Accident',axis=1)
y=df_clean['Accident']
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,random_state=42,train_size=0.8)







In [7]:
cat_cols=X.select_dtypes(include='object').columns
num_cols=X.select_dtypes(exclude='object').columns


In [8]:
num_pipeline = Pipeline(
    steps=[
        ('num_impute',SimpleImputer(strategy="mean")),
        ('scalar', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('cat_impute',SimpleImputer(strategy='most_frequent')),
        ('encode',OneHotEncoder(sparse_output=False))
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ('num',num_pipeline,num_cols),
        ('cat',cat_pipeline,cat_cols)
    ]
)



pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', RandomForestClassifier(random_state=42,n_estimators=100))
    ]
)

pipe.fit(Xtrain, ytrain)




0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
pipe.score(Xtest,ytest)

0.73125

In [10]:
pipe.score(Xtrain,ytrain)

1.0

# GridsearchCV

In [None]:
params={
    'model__criterion':['entropy','gini'],
    'model__min_samples_split':[2,3,4,5,10],
    'model__min_samples_leaf':[2,3,5,7,10],
    'model__max_depth':[5,10,50,100,200]
}


In [16]:
gridSearch=GridSearchCV(pipe,params,cv=5,n_jobs=-1)

In [17]:
gridSearch.fit(Xtrain,ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__criterion': ['entropy', 'gini'], 'model__max_depth': [5, 10, ...], 'model__min_samples_leaf': [2, 3, ...], 'model__min_samples_split': [2, 3, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'entropy'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
gridSearch.score(Xtest,ytest)

0.70625

In [19]:
gridSearch.score(Xtrain,ytrain)

0.7429467084639498

In [20]:
gridSearch.best_params_

{'model__criterion': 'entropy',
 'model__max_depth': 10,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 10}