In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## US Accidents - Applied Machine Learning

### 1. Understand the Problem Statement & Import Packages and Datasets :

In [None]:
# Warning Libraries :
import warnings
warnings.filterwarnings("ignore")

# Scientific and Data Manipulation Libraries :
import pandas as pd
import numpy as np
import math
import gc
import os
import category_encoders as ce


# ML Libraries :
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
from sklearn.model_selection          import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model             import LogisticRegression
from sklearn                          import tree
from sklearn.ensemble                 import RandomForestClassifier
from sklearn.metrics                  import accuracy_score
from sklearn.metrics                  import f1_score,precision_score
#from sklearn.metrics                 import jaccard_similarity_score, jaccard_score  

# Boosting Algorithms :
from xgboost                          import XGBClassifier
from lightgbm                         import LGBMClassifier

# Data Visualization Libraries :
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

 ### 2. Import the dataset

We create a list to store the states of interest; here it contains only Florida (FL). Because it is a list, more states can be added to it.

Let's read the data from the data source and filter on the State attribute so that only the rows for Florida remain. The output also shows the column names.

In [None]:
# Import the data
# States to keep; add more two-letter codes to this list to widen the analysis.
state_lst=['FL']
df = pd.read_csv('/kaggle/input/us-accidents/US_Accidents_Dec20_Updated.csv')
# Keep only the selected states. The explicit copy makes `df` an independent frame:
# later cells add columns to it, and assigning into a filtered selection is what
# pandas flags with SettingWithCopyWarning.
df = df[df["State"].isin(state_lst)].copy()
df.info()


In [None]:
# Display descriptive statistics of the data (last expression, so it is the cell output):

df.describe()

In [None]:
# Display No of Unique Values and Actual Unique Values :

def display_unique(data):
    """Print the number of distinct values and the sorted distinct values of each column.

    For every column of ``data`` three lines are printed: the count of unique
    non-missing values, the unique values sorted ascending with missing values
    placed last, and an empty separator line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are summarised. It is not modified.

    Returns
    -------
    None
    """
    for column in data.columns:
        column_values = data[column]
        sorted_unique = column_values.sort_values(ascending=True, na_position='last').unique()

        # f-strings format any label type; "+" concatenation raised TypeError
        # for non-string column labels such as integer column names.
        print(f"No of Unique Values in {column} Column are : {column_values.nunique()}")
        print(f"Actual Unique Values in {column} Column are : {sorted_unique}")
        print("")
        
# Print the per-column unique-value summary for the accident data in `df`.
display_unique(df)

### 3.Feature addition

We decided to decompose the Start_Time feature into year, month, day, weekday, hour and minute, in order to feed them to the models.

In [None]:
# Parse Start_Time into pandas datetimes so that the .dt accessor can be used on it.
start_times = pd.to_datetime(df["Start_Time"])
df["Start_Time"] = start_times

# New column name -> datetime component to extract, listed in the order in which
# the columns are added to the frame.
datetime_parts = {
    "Year": "year",
    "Month": "month",
    "Weekday": "weekday",
    "Day": "day",
    "Hour": "hour",
    "Minute": "minute",
}
for new_column, part_name in datetime_parts.items():
    df[new_column] = getattr(df["Start_Time"].dt, part_name)

df.head()

In [None]:
# Columns drawn as parallel categorical axes; ribbons are coloured by Severity.
parallel_columns = [
    "Side", "City", "Weekday", "Day", "Hour", "Minute", "Civil_Twilight", "Severity",
]
fig = px.parallel_categories(
    df[parallel_columns],
    color="Severity",
    color_continuous_scale=px.colors.sequential.Aggrnyl,
)
fig.show()

In [None]:
# Correlation heatmap of the numeric and boolean columns.
fig, ax = plt.subplots(figsize=(15,10))
# Select numeric/boolean columns explicitly before correlating: the frame still holds
# string and datetime columns here, and DataFrame.corr() on pandas 2.x raises on
# non-numeric data instead of silently skipping it.
numeric_features = df.select_dtypes(include=["number", "bool"])
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(numeric_features.corr(), annot=True, ax=ax);
plt.show()

### Feature selection:
 
 Here is the process of feature selection, in order to select the best features from which  our models can learn.

From the observations made with the correlation matrix, we are going to drop some of the features.

In [None]:
# Columns removed from the frame before modelling.
features_to_drop = [
    "ID", "Start_Time", "End_Time", "End_Lat", "End_Lng",
    "Description", "Number", "Street", "County", "State", "Zipcode",
    "Country", "Timezone", "Airport_Code", "Weather_Timestamp",
    "Wind_Chill(F)", "Turning_Loop", "Sunrise_Sunset",
    "Nautical_Twilight", "Astronomical_Twilight",
    "City", "Civil_Twilight",
    "Bump", "Give_Way", "No_Exit", "Roundabout", "Traffic_Calming",
]
df = df.drop(columns=features_to_drop)
df.head()

### 4. Check for Duplicate Rows from Data if present :

In [None]:
# Python Method 4 : Removes Data Duplicates while Retaining the First one - Similar to SQL DISTINCT :

def remove_duplicate(data):
    """Drop duplicated rows from ``data`` in place, keeping the first occurrence.

    The row count is printed before and after the removal.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate; it is modified in place.

    Returns
    -------
    str
        The fixed status string "Checked Duplicates".
    """
    row_count_before = data.shape[0]
    print("BEFORE REMOVING DUPLICATES - No. of Rows = ", row_count_before)

    data.drop_duplicates(keep="first", inplace=True)

    row_count_after = data.shape[0]
    print("AFTER REMOVING DUPLICATES  - No. of Rows = ", row_count_after)

    return "Checked Duplicates"

# Remove duplicate rows from df (df is modified in place; the returned status
# string is the cell output):

remove_duplicate(df)


### 5. Feature Engineering

If we analyze the weather conditions, we can see that there are lots of them, so it's better to reduce the number of unique conditions.

In [None]:
# Distinct values of the Weather_Condition column
unique_weather = df["Weather_Condition"].unique()

# Number of distinct conditions, followed by the values themselves.
print(len(unique_weather))
print(unique_weather)



To do so, we are going to replace them with a more generic description:

In [None]:
# Ordered (regex pattern, replacement label) rules. Rules run top to bottom and each
# rule sees the labels written by the rules before it, so the order must be kept.
# The last rule turns "N/A Precipitation" entries into missing values.
weather_groups = [
    ("Thunder|T-Storm", "Thunderstorm"),
    ("Snow|Sleet|Wintry", "Snow"),
    ("Rain|Drizzle|Shower", "Rain"),
    ("Wind|Squalls", "Windy"),
    ("Hail|Pellets", "Hail"),
    ("Fair", "Clear"),
    ("Cloud|Overcast", "Cloudy"),
    ("Mist|Haze|Fog", "Fog"),
    ("Sand|Dust", "Sand"),
    ("Smoke|Volcanic Ash", "Smoke"),
    ("N/A Precipitation", np.nan),
]
for pattern, grouped_label in weather_groups:
    # na=False: rows with a missing condition never match a rule.
    matches_pattern = df["Weather_Condition"].str.contains(pattern, na=False)
    df.loc[matches_pattern, "Weather_Condition"] = grouped_label

print(df["Weather_Condition"].unique())

Let's check also the Wind_Direction field:

In [None]:
# Distinct values of the Wind_Direction column.
df["Wind_Direction"].unique()


As we can see, we can group the values like we did with Weather_Condition:

In [None]:
# Alternative spellings and the label each one is replaced with.
wind_direction_aliases = {
    "CALM": "Calm",
    "VAR": "Variable",
    "East": "E",
    "North": "N",
    "South": "S",
    "West": "W",
}
for alias, canonical in wind_direction_aliases.items():
    df.loc[df["Wind_Direction"] == alias, "Wind_Direction"] = canonical


def _drop_leading_letter(direction):
    """Return three-character labels without their first character; others unchanged."""
    return direction[1:] if len(direction) == 3 else direction


# na_action="ignore" skips missing values, so they stay missing.
df["Wind_Direction"] = df["Wind_Direction"].map(_drop_leading_letter, na_action="ignore")

df["Wind_Direction"].unique()

In [None]:
# Display the number of missing values in each column of the data :

print("Data : ")
display(df.isnull().sum())


### Split Train and Test Data

In [None]:
total_size=len(df)

# The first 66% of the rows (in their current order) are used for training.
train_size=math.floor(0.66*total_size)

#training dataset
train=df.iloc[:train_size]
#test dataset: the remaining rows. The earlier df.head(len(df) - train_size) took rows
#from the top of the frame, so every test row was also a training row.
test=df.iloc[train_size:]
display('Total Size:',total_size)
display('Train Size:',train_size)
display('Test Size:',len(test))

display('Train Head :',train.head())
display('Test Head :',test.head())

**Feature selection: split the train and test data into predictors (independent) and target (dependent):**

In [None]:
# Predictor columns used for both the train and the test split.
feature_columns = ['Side','Wind_Direction','Day','Month','Year','Hour']

X_train = train[feature_columns]
# Double brackets keep the target as a one-column DataFrame.
y_train = train[['Severity']]

X_test = test[feature_columns]
y_test = test[['Severity']]


### 6.Data Encoding : Label Encoding, OneHot Encoding

In [None]:
def data_encoding( encoding_strategy , encoding_data , encoding_columns ):
    """Encode categorical columns and return the result as a float64 DataFrame.

    Parameters
    ----------
    encoding_strategy : str
        "LabelEncoding" replaces every column listed in ``encoding_columns`` with the
        integer codes produced by ``LabelEncoder``. "OneHotEncoding" calls
        ``pd.get_dummies`` on the whole frame; ``encoding_columns`` is not used in
        that mode. Any other value leaves the columns as they are.
    encoding_data : pandas.DataFrame
        Input features. The frame passed in is not modified.
    encoding_columns : list
        Column names to label-encode (only read by the "LabelEncoding" strategy).

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``encoding_data`` with every column cast to float64.

    Raises
    ------
    Whatever ``DataFrame.astype`` raises when a remaining column cannot be converted
    to float64 (for example a string column that was not encoded).
    """
    # Work on a copy so the LabelEncoding branch no longer overwrites the caller's columns.
    encoded = encoding_data.copy()

    if encoding_strategy == "LabelEncoding":
        print("IF LabelEncoding")
        Encoder = LabelEncoder()
        for column in encoding_columns :
            print("column",column )
            # The tuple(...) conversion is kept from the original.
            # NOTE(review): going through a tuple presumably changes how mixed or
            # missing values are coerced before encoding - confirm before simplifying.
            encoded[ column ] = Encoder.fit_transform(tuple(encoded[ column ]))

    elif encoding_strategy == "OneHotEncoding":
        print("ELIF OneHotEncoding")
        encoded = pd.get_dummies(encoded)

    # The original evaluated astype(...) and threw the result away, so the cast
    # never reached the returned frame. Return the converted frame instead.
    return encoded.astype('float64')

In [None]:
# Quote :
# Applied One Hot Encoding - it will be applied to Object/Categorical Columns Only :
# It's most common to one-hot encode these "object" columns, since they can't be plugged directly into most models. 
# Pandas offers a convenient function called "get_dummies" to get one-hot encodings.
# Many machine learning algorithms cannot operate on label data directly. 
# They require all input variables and output variables to be numeric.
# This means that categorical data must be converted to a numerical form.
# a one-hot encoding can be applied to the integer representation. 
# This is where the integer encoded variable is removed and a new binary variable is added for each unique integer value.
# - Jason Brownlee 

# Toy example of one-hot encoding. It uses its own variable names so the accident
# DataFrame `df` built in the earlier cells is not overwritten by the demo.
demo_colors = ["Red","Blue","Green","Red","Blue","Blue"]

demo_df = pd.DataFrame(demo_colors, columns = ['Color'])

print("Before One Hot Encoding : ")
display(demo_df)
print("\nAfter One Hot Encoding : ")
display( pd.get_dummies(demo_df) )

In [None]:
# Columns to label-encode when the "LabelEncoding" strategy is selected.
# 'Week' was removed from this list: it is not one of the selected feature columns,
# so label-encoding it would fail with a KeyError.
encoding_columns  = [ 'Side','Day', 'Month', 'Hour' ]
encoding_strategy = [ "LabelEncoding", "OneHotEncoding"]

X_train_encode = data_encoding( encoding_strategy[1] , X_train , encoding_columns )
X_test_encode =  data_encoding( encoding_strategy[1] , X_test  , encoding_columns )

# Train and test are encoded independently, so a category that occurs in only one of
# the splits produces a dummy column in only one frame. Give the test frame exactly
# the training columns: dummies missing from test become 0, extras are dropped.
X_test_encode = X_test_encode.reindex(columns=X_train_encode.columns, fill_value=0)

# Display Encoded Train and Test Features :

display(X_train_encode.head())
display(X_test_encode.head())

**Data Scaling : RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler :**

In [None]:
def data_scaling( scaling_strategy , scaling_data , scaling_columns ):
    """Scale the given columns of ``scaling_data`` in place and return the frame.

    A new scaler is created and fitted on ``scaling_data[scaling_columns]`` at every
    call, and the scaled values are written back into those columns.

    Parameters
    ----------
    scaling_strategy : str
        One of "RobustScaler", "StandardScaler", "MinMaxScaler", "MaxAbsScaler".
        Any other value falls back to RobustScaler.
    scaling_data : pandas.DataFrame
        Frame holding the columns to scale; it is modified in place.
    scaling_columns : list-like
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``scaling_data`` object, with the selected columns scaled.
    """
    named_scalers = (
        ("RobustScaler", RobustScaler),
        ("StandardScaler", StandardScaler),
        ("MinMaxScaler", MinMaxScaler),
        ("MaxAbsScaler", MaxAbsScaler),
    )
    # First name equal to the requested strategy wins; an unrecognised strategy
    # (sent by mistake) still gets RobustScaler.
    scaler_class = next(
        (candidate for name, candidate in named_scalers if scaling_strategy == name),
        RobustScaler,
    )

    scaling_data[scaling_columns] = scaler_class().fit_transform(scaling_data[scaling_columns])
    return scaling_data

In [None]:
# RobustScaler is better in handling Outliers :

scaling_strategy = ["RobustScaler", "StandardScaler","MinMaxScaler","MaxAbsScaler"]
scaler_classes = {
    "RobustScaler": RobustScaler,
    "StandardScaler": StandardScaler,
    "MinMaxScaler": MinMaxScaler,
    "MaxAbsScaler": MaxAbsScaler,
}

# Fit the scaler on the training features only and reuse its statistics for the test
# features. Calling data_scaling() once per split fits a separate scaler on each frame,
# so train and test would be transformed with different centre/scale values.
feature_scaler = scaler_classes[scaling_strategy[0]]().fit(X_train_encode)

# The scaler expects the training column layout; dummy columns missing from the test
# frame are added as 0 and columns unknown to the training frame are dropped.
X_test_aligned = X_test_encode.reindex(columns=X_train_encode.columns, fill_value=0)

X_train = pd.DataFrame(
    feature_scaler.transform(X_train_encode),
    columns=X_train_encode.columns,
    index=X_train_encode.index,
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test_aligned),
    columns=X_train_encode.columns,
    index=X_test_aligned.index,
)

# Display Scaled Train and Test Features :

display(X_train.head())
display(X_test.head())

### 7. Create Baseline ML Model for Binary Classification Problem :

**1. Logistic regression**

In [None]:
# Logistic regression baseline: iteration cap raised to 10000 and a fixed random_state.

clf = LogisticRegression(max_iter=10000,random_state=42).fit(X_train, y_train)

# score() returns the mean accuracy of the fitted model on the given split.
accuracy_train, accuracy_test = clf.score(X_train, y_train), clf.score(X_test,y_test)

print("Train Accuracy: %.1f%%"% (accuracy_train*100))
print("Test Accuracy: %.1f%%"% (accuracy_test*100))

In [None]:
# Predict the target for the test features with the fitted logistic regression
lr_cal = clf.predict(X_test)

# Weighted-average F1 score of those predictions against the test target
f1_lr = f1_score(y_true=y_test, y_pred=lr_cal, average='weighted')
print("F1 Score: %3.4f" %(f1_lr))


**2. Decision Tree**

In [None]:
# Fit a decision tree on X_train / y_train (min_samples_split=5)
tree_clf = tree.DecisionTreeClassifier(min_samples_split = 5).fit(X_train,y_train)

# Mean accuracy on each split
tree_accuracy_train, tree_accuracy_test = tree_clf.score(X_train, y_train), tree_clf.score(X_test,y_test)

print("Train Accuracy: %.1f%%"% (tree_accuracy_train*100))
print("Test Accuracy: %.1f%%"% (tree_accuracy_test*100))

In [None]:
# Predict the target for the test features with the fitted decision tree
tree_cal = tree_clf.predict(X_test)

# Weighted-average F1 score of those predictions against the test target
f1_tree = f1_score(y_true=y_test, y_pred=tree_cal, average='weighted')
print("F1 Score: %3.4f" %(f1_tree))



**3. Random Forest**

In [None]:
%%time

# Random forest with 10 trees, fitted on the training split.
rf_clf = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)

# Predictions for both splits
train_pred = rf_clf.predict(X_train)
test_pred = rf_clf.predict(X_test)

# Accuracy of those predictions against the respective targets
rf_train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
rf_test_accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)

print("Train Accuracy: %.1f%%"% (rf_train_accuracy*100))
print("Test Accuracy: %.1f%%"% (rf_test_accuracy*100))

In [None]:
# Predict with the random forest. The original called tree_clf.predict here, so the
# "Random Forest" F1 score was really the decision tree's score.
rf_cal = rf_clf.predict(X_test)

# Calculate the weighted-average f1 score against the test target
f1_rf = f1_score(y_test, rf_cal, average='weighted') 
print("F1 Score: %3.4f" %(f1_rf))



**4. XGBoost**

In [None]:
# Gradient-boosted trees with 100 boosting rounds.
xgb_clf = XGBClassifier(n_estimators=100)

# NOTE(review): recent XGBoost releases expect class labels numbered 0..n_classes-1;
# if Severity does not start at 0 this fit may need an encoded target - confirm
# against the installed XGBoost version.
xgb_clf.fit(X_train,y_train)

# predict the target on the train & test  dataset
predict_train = xgb_clf.predict(X_train)
predict_test = xgb_clf.predict(X_test)

# Accuracy Score on train & test dataset

xgb_accuracy_train = accuracy_score(y_train,predict_train)
xgb_accuracy_test = accuracy_score(y_test,predict_test)

# Same "NN.N%" format as the other three model cells (the original lines lacked the
# percent sign, and the test line also lacked the space after the colon).
print('Train Accuracy: %.1f%%' %(xgb_accuracy_train*100) )
print('Test Accuracy: %.1f%%' %(xgb_accuracy_test*100))

In [None]:
# Predict the target for the test features with the fitted XGBoost model
xgb_cal = xgb_clf.predict(X_test)

# Weighted-average F1 score of those predictions against the test target
f1_xgb = f1_score(y_true=y_test, y_pred=xgb_cal, average='weighted')
print("F1 Score: %3.4f" %(f1_xgb))


## Report

The F1 score was calculated for each of the algorithms above. The final report table for the four algorithms is shown below.


In [None]:
# Report
# One entry per model, in the same order as the 'Algorithm' column below.
# Accuracies are expressed in percent; the F1 scores are left as returned by f1_score.
train_data=[(accuracy_train*100), (tree_accuracy_train*100), (rf_train_accuracy*100),(xgb_accuracy_train*100)]
test_data=[(accuracy_test*100), (tree_accuracy_test*100), (rf_test_accuracy*100),(xgb_accuracy_test*100)]


F1_score = [f1_lr,f1_tree,f1_rf,f1_xgb]


# A dedicated name is used for the table contents instead of rebinding `df`, which
# refers to a DataFrame in the earlier cells.
report_data = {
    'Algorithm': ['LogisticRegression','Decision Tree','Random Forest','XGBOOST'],
    'Train Data': train_data,
    'Test Data': test_data,
    'F1-score': F1_score,
}

Report = pd.DataFrame(data=report_data, columns=['Algorithm','Train Data', 'Test Data', 'F1-score'])
Report