In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Import numpy, pandas, matplotlib.pyplot, sklearn modules and seaborn
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm
import gc

%matplotlib inline
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')

# Import KNeighborsClassifier from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier

# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier

# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing
from catboost import CatBoostRegressor, Pool, cv, CatBoostClassifier
from xgboost import XGBClassifier

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB 

from xgboost import XGBClassifier
import lightgbm as lgbm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn import svm

In [None]:
def reduce_mem_usage(df, allow_float16=False):
    """Downcast the columns of a dataframe in place to reduce its memory usage.

    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range holds the column's min and max.
    * Float columns are cast to float32 when their range fits.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, category, nullable
      extension types, ...) is left untouched, so the function can safely be
      called again on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allow_float16 : bool, default False
        Also consider float16 for float columns. Off by default because
        float16 keeps only about three significant decimal digits, which
        silently truncates values such as coordinates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first, keyed by numpy dtype kind.
    integer_targets = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Already categorical: nothing to do (min/max would raise on it).
        if isinstance(col_type, pd.CategoricalDtype):
            continue

        # Pandas extension dtypes: only string-like ones are converted.
        if not isinstance(col_type, np.dtype):
            if pd.api.types.is_string_dtype(col_type):
                df[col] = df[col].astype('category')
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if col_type.kind in integer_targets:
            targets, limits_of = integer_targets[col_type.kind], np.iinfo
        elif col_type.kind == 'f':
            targets, limits_of = float_targets, np.finfo
        else:
            # bool, datetime64, timedelta64, complex, ...: keep as is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for target in targets:
            # Never widen a column that is already at least this narrow.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds; comparisons with NaN (empty / all-NaN column)
            # are False, which leaves such columns unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
def import_data(path="/kaggle/input/us-accidents/US_Accidents_Dec19.csv"):
    """Read the accidents CSV into a dataframe and optimize its memory usage.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the Kaggle dataset path used by
        this notebook, so existing ``import_data()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after being passed through ``reduce_mem_usage``.
    """
    df = pd.read_csv(path)
    return reduce_mem_usage(df)

In [None]:
# Import the data: read the accidents CSV and shrink its dtypes via import_data()
df = import_data()
# Print the column dtypes, non-null counts and memory footprint of the loaded frame
df.info()

In [None]:
# Parse both timestamp columns; values that cannot be parsed become NaT
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

# Calendar features derived from the accident start time
start_time = df['Start_Time']
df['Year'] = start_time.dt.year
df['Month'] = start_time.dt.strftime('%b')
df['Day'] = start_time.dt.day
df['Hour'] = start_time.dt.hour
df['Weekday'] = start_time.dt.strftime('%a')

# Accident duration in whole minutes (rounded to the nearest integer)
td = 'Time_Duration(min)'
elapsed = df['End_Time'] - df['Start_Time']
df[td] = (elapsed / np.timedelta64(1, 'm')).round()

In [None]:
# Keep only the accidents with a strictly positive duration.
#
# Rows are filtered with a boolean mask instead of overwriting whole rows with
# NaN and calling dropna afterwards: blanking every column of a row forces
# integer/boolean columns to be upcast so they can hold NaN, only for those
# rows to be discarded immediately.
# A NaN duration (start or end time that failed to parse) compares False here,
# so those rows are removed too, exactly as the dropna on `td` did.
df = df.loc[df[td] > 0]

# Double check to make sure no more negative td
print("Double check to make sure no more negative td: ", df[td][df[td]<=0])

In [None]:
# Report the longest clearance time (in minutes, hours and days) and the shortest one
longest_td = df[td].max()
shortest_td = df[td].min()
print('Max time to clear an accident: {} minutes or {} hours or {} days; Min to clear an accident td: {} minutes.'.format(
    longest_td,
    round(longest_td/60),
    round(longest_td/60/24),
    shortest_td,
))

In [None]:
# Set the list of features to include in Machine Learning.
# 'Severity' is split off later as the prediction target, and 'County' is only
# used to filter the rows before being dropped; the remaining columns are the
# model inputs. 'Hour', 'Weekday' and 'Time_Duration(min)' are the columns
# derived from Start_Time / End_Time earlier in the notebook.
feature_lst=['Source','TMC','Severity','Start_Lng','Start_Lat',
             'Distance(mi)','Side','City','County','State','Timezone',
             'Temperature(F)','Humidity(%)','Pressure(in)', 'Visibility(mi)', 
             'Wind_Direction','Weather_Condition','Amenity','Bump',
             'Crossing','Give_Way','Junction','No_Exit','Railway',
             'Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal',
             'Turning_Loop','Sunrise_Sunset','Hour','Weekday', 'Time_Duration(min)'
            ]

In [None]:
# Work on an independent copy restricted to the selected feature columns,
# then release the full frame.
df_sel = df.loc[:, feature_lst].copy()
del df

In [None]:
def missing_data_stats(df):
    """Summarise zero and missing values for every column that has missing data.

    Prints how many columns contain missing values, then returns a table
    indexed by column name (sorted by missing percentage, descending, rounded
    to one decimal) with the zero count, null count, their percentages and
    the column dtype. Returns None when no column has missing values.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    zero_counts = (df == 0.00).astype(int).sum(axis=0)

    mz_table = pd.DataFrame({
        'Zero Values': zero_counts,
        'Missing(Null) Values': null_counts,
        '% of Total Missing(Null) Values': 100 * null_counts / n_rows,
    })
    zero_or_null = mz_table['Zero Values'] + mz_table['Missing(Null) Values']
    mz_table['Total (Zero + Missing) Values'] = zero_or_null
    mz_table['% Total (Zero + Missing) Values'] = 100 * zero_or_null / n_rows
    mz_table['Data Type'] = df.dtypes

    # Keep only the columns that actually have nulls, worst offenders first.
    has_nulls = mz_table['Missing(Null) Values'] != 0
    mz_table = (
        mz_table[has_nulls]
        .sort_values('% of Total Missing(Null) Values', ascending=False)
        .round(1)
    )

    print ("There are " + str(mz_table.shape[0]) + " columns that have missing values out of ", len(df.columns), " columns.")
    return mz_table if int(mz_table.shape[0]) != 0 else None
    
# Build the zero/missing-value summary for the selected features; the result
# is None when no column has missing values. The bare name on the last line
# makes the notebook display the table.
faulty_data = missing_data_stats(df_sel)
faulty_data

In [None]:
def plot_missing_values(df):
    """Draw a bar chart of the percentage of missing values in each column.

    Columns are ordered from most to least missing. The chart is drawn on a
    new 15x6 inch figure; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected for null values.
    """
    null_mask = df.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    # Scale the missing fraction to 0-100 so the values match the
    # "Percent" wording used in the axis label and the title.
    percent = (100 * null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    f, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
    # Rotate the column names after the bars (and their tick labels) exist,
    # using a numeric angle: a string such as '90' is rejected by recent
    # matplotlib releases.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Features', fontsize=15)
    ax.set_ylabel('Percent of missing values', fontsize=15)
    ax.set_title('Percent missing data by feature', fontsize=15)

# Draw the missing-value bar chart for the selected feature columns
plot_missing_values(df_sel)

In [None]:
# Remove, in place, every row that has a missing value in any column
# (restricting the check to the columns that contain nulls selects exactly
# the same rows), then show the remaining shape.
df_sel.dropna(axis=0, how='any', inplace=True)
df_sel.shape

In [None]:
# County name to keep
county = 'Montgomery'

# Keep only the accidents whose County equals `county`, and drop the now
# constant County column. Note that the filter is on the county name alone:
# there is no State condition, so same-named counties in different states
# are all retained.
in_county = df_sel['County'] == county
df_county = df_sel.loc[in_county].drop(columns='County')

# The nationwide selection is no longer needed
del df_sel

In [None]:
# Size of the county subset (rows, columns)
print(df_county.shape)
# Number of accidents per Severity level, to show how balanced the target is
print(Counter(df_county.Severity))

In [None]:
# Generate dummies for categorical data

# Separate the label from the features
target = df_county["Severity"]
df_county_ = df_county.drop(["Severity"], axis=1)

# The text columns were converted to `category` on the full dataset, so after
# filtering to one county name they still list every category seen anywhere
# in the data (for example every city). get_dummies creates one column per
# category, observed or not, which would add thousands of all-zero feature
# columns. Drop the unobserved categories before encoding.
for cat_col in df_county_.select_dtypes(include='category').columns:
    df_county_[cat_col] = df_county_[cat_col].cat.remove_unused_categories()

# One-hot encode; drop_first removes one level per column to avoid redundancy
df_sel_dummies = pd.get_dummies(df_county_,drop_first=True)

# Map the severity labels to consecutive integers 0..n_classes-1
encoder = preprocessing.LabelEncoder()
target_ = encoder.fit_transform(target)

In [None]:
# Features are the one-hot encoded frame; the response is the integer-encoded severity
X, y = df_sel_dummies, target_

# Hold out 20% of the rows for testing, keeping the class proportions of y
# in both parts (stratified) and fixing the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# List of classification algorithms
# NOTE(review): the second label starts with a space, and only the logistic
# regression and random forest cells below append to accuracy_lst, so the two
# lists do not line up one-to-one. Neither list is read again in the visible
# part of the notebook -- confirm before relying on them for a summary.
algo_lst=['Logistic Regression',' K-Nearest Neighbors','Decision Trees','Random Forest']

# Initialize an empty list for the accuracy for each algorithm
accuracy_lst=[]

In [None]:
# Logistic regression: fit on the training split, predict the held-out split
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Record the test accuracy for this algorithm
acc = accuracy_score(y_test, y_pred)
accuracy_lst.append(acc)

# Overall accuracy followed by the per-class precision / recall / F1 table
print("[Logistic regression algorithm] accuracy_score: {:.3f}.".format(acc))
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest algorithm

# Create a random forest of 100 trees. The seed is fixed (same value as the
# train/test split and the logistic regression) so that re-running the
# notebook reproduces the same forest and the same scores.
clf=RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training set
clf.fit(X_train,y_train)

# Predict the held-out test set
y_pred=clf.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)

# Append to the accuracy list
accuracy_lst.append(acc)

# Model Accuracy, how often is the classifier correct?
print("[Randon forest algorithm] accuracy_score: {:.3f}.".format(acc))
print(classification_report(y_test, y_pred))

In [None]:
# Linear support vector classifier; the seed is fixed for reproducible results
svm_clf = svm.LinearSVC(random_state=42)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)

# Model Accuracy, how often is the classifier correct?
print("[SVM algorithm] accuracy_score: {:.3f}.".format(acc))
# classification_report takes (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# Gaussian Naive Bayes
NB_clf = GaussianNB()
NB_clf.fit(X_train, y_train)
# Predict on the same kind of object the model was fitted on (a DataFrame),
# rather than on a bare array that has lost the column names.
y_pred=NB_clf.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)

# Model Accuracy, how often is the classifier correct?
print("[Naive Bieas algorithm] accuracy_score: {:.3f}.".format(acc))
# classification_report takes (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# Linear classifier trained with stochastic gradient descent; the seed is
# fixed because the training procedure is randomised.
SGD_clf = SGDClassifier(random_state=42)
SGD_clf.fit(X_train, y_train)
# Predict on the same kind of object the model was fitted on (a DataFrame),
# rather than on a bare array that has lost the column names.
y_pred= SGD_clf.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)

# Model Accuracy, how often is the classifier correct?
print("[Shocastic Gradient Descent algorithm] accuracy_score: {:.3f}.".format(acc))
# classification_report takes (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# K-nearest neighbours with the default number of neighbours
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)

# Model Accuracy, how often is the classifier correct?
print("[KNN algorithm] accuracy_score: {:.3f}.".format(acc))
# classification_report takes (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# xgb_clf = XGBClassifier(
#   learning_rate =0.1,
#   n_estimators=1000,
#   max_depth=5,
#   min_child_weight=1,
#   gamma=0,
#   subsample=0.8,
#   colsample_bytree=0.8,
#   objective= 'multi:softmax',
#   num_class =4,
#   nthread=4,
#   scale_pos_weight=1,
#   seed=27)

# xgb_clf.fit(X_train, y_train)
# y_pred = xgb_clf.predict(X_test)

# # Get the accuracy score
# acc=accuracy_score(y_test, y_pred)

# # Model Accuracy, how often is the classifier correct?
# print("[XGBoost algorithm] accuracy_score: {:.3f}.".format(acc))
# print(classification_report(y_pred, y_test))

In [None]:
# def baseline_model():
#     model = Sequential()
#     model.add(Dense(100, input_dim=11911, activation='relu'))
#     model.add(Dense(50, activation='relu'))
#     model.add(Dense(4, activation='softmax'))
#     # compile the keras model
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

In [None]:
# estimator = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=28, verbose=1)
# kfold = KFold(n_splits=10, shuffle=True)
# results = cross_val_score(estimator, X, target_, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)

# # Creating a bar plot, displaying only the top k features
# k=10
# sns.barplot(x=feature_imp[:10], y=feature_imp.index[:k])
# # Add labels to your graph
# plt.xlabel('Feature Importance Score')
# plt.ylabel('Features')
# plt.title("Visualizing Important Features")
# plt.legend()
# plt.show()

In [None]:
# # List top k important features
# k=20
# feature_imp.sort_values(ascending=False)[:k]

In [None]:
# # Create a selector object that will use the random forest classifier to identify
# # features that have an importance of more than 0.03
# sfm = SelectFromModel(clf, threshold=0.03)

# # Train the selector
# sfm.fit(X_train, y_train)

# feat_labels=X.columns

# # Print the names of the most important features
# for feature_list_index in sfm.get_support(indices=True):
#     print(feat_labels[feature_list_index])

In [None]:
# # Transform the data to create a new dataset containing only the most important features
# # Note: We have to apply the transform to both the training X and test X data.
# X_important_train = sfm.transform(X_train)
# X_important_test = sfm.transform(X_test)

# # Create a new random forest classifier for the most important features
# clf_important = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# # Train the new classifier on the new dataset containing the most important features
# clf_important.fit(X_important_train, y_train)

In [None]:
# # Apply The Full Featured Classifier To The Test Data
# y_pred = clf.predict(X_test)

# # View The Accuracy Of Our Full Feature Model
# print('[Randon forest algorithm -- Full feature] accuracy_score: {:.3f}.'.format(accuracy_score(y_test, y_pred)))

# # Apply The Full Featured Classifier To The Test Data
# y_important_pred = clf_important.predict(X_important_test)

# # View The Accuracy Of Our Limited Feature Model
# print('[Randon forest algorithm -- Limited feature] accuracy_score: {:.3f}.'.format(accuracy_score(y_test, y_important_pred)))

In [None]:
# xgb_model = XGBClassifier()
# xgb_model.fit(X_train,y_train)

# # make predictions for test data

# y_pred = model.predict(X_test)
# # predictions = [round(value) for value in y_pred]

In [None]:
# type(y_pred)