In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Installing pyspark
!pip install pyspark
!pip install findspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start the Spark session used by every DataFrame below, or reuse one that
# is already running.
spark = (
    SparkSession.builder
    .appName('Spark Application')
    .getOrCreate()
)

In [None]:
# Read the US-Accidents CSV with a header row and inferred column types.
# The result is capped at 1,500,000 rows; drop the .limit(...) call to work
# on the whole file.
csv_path = "/kaggle/input/us-accidents/US_Accidents_Dec20_Updated.csv"
row_cap = 1500000
df = spark.read.csv(csv_path, header=True, inferSchema=True).limit(row_cap)

In [None]:
# Sanity-check the load: head() with no argument returns only the first Row.
df.head()

In [None]:
# Show the column names and the types that inferSchema assigned to them.
df.printSchema()

### Preprocessing and Data Exploration

In [None]:
from pyspark.sql.functions import *

In [None]:
# Names of the columns to discard from the loaded DataFrame.
# Weather_Condition is not in this list: it is to be processed again after
# cleaning.
cols_to_drop = [
    'ID',
    'Start_Time',
    'End_Time',
    'Description',
    'Street',
    'Weather_Timestamp',
    'Zipcode',
    'County',
    'City',
    'Airport_Code',
    'Precipitation(in)',
]

In [None]:
# Remove every column named in cols_to_drop (the list is unpacked into
# separate arguments for drop()).
df = df.drop(*cols_to_drop)

In [None]:
# List the columns that remain after the drop.
print(df.columns)

In [None]:
# Look at the first Row again, now without the dropped columns.
df.head()

In [None]:
# Dict_Null = {col:df.filter(isnan(df[col[0]])).count() for col in df.dtypes if col[1] != 'boolean'}
# Dict_Null

In [None]:
# Replace NaN with null in every non-boolean column; boolean columns pass
# through untouched.
# All columns are rewritten in a single select(): each withColumn() call adds
# another projection to the query plan, so calling it once per column in a
# loop makes the plan grow with the number of columns.
columns = df.dtypes
nan_to_null = [
    when(isnan(col(name)), None).otherwise(col(name)).alias(name)
    if dtype != 'boolean'
    else col(name)
    for name, dtype in columns
]
df = df.select(*nan_to_null)

In [None]:
# Dict_Null = {col:df.filter(isnan(df[col[0]])).count() for col in df.dtypes if col[1] != 'boolean'}
# Dict_Null

In [None]:
# Dict_Null2 = {col:df.filter(isnull(df[col[0]])).count() for col in df.dtypes }
# Dict_Null2

In [None]:
# Group the column names by their Spark dtype.  The target column is removed
# from the numeric group so it is never used as a feature.
label = 'Severity'
schema = df.dtypes  # list of (column name, dtype string) pairs

string_cols = [name for name, dtype in schema if dtype == "string"]
num_cols = [name for name, dtype in schema if dtype in ("int", "double")]
num_cols.remove(label)
bool_cols = [name for name, dtype in schema if dtype == "boolean"]

# string_cols,
# num_cols
# bool_cols

In [None]:
# Fill the remaining nulls: string columns get the placeholder "unknown",
# numeric feature columns get 0.  Boolean columns are not filled here.
df = (
    df.fillna("unknown", subset=string_cols)
      .fillna(0, subset=num_cols)
)

In [None]:
%%time 
Dict_Null3 = {col:df.filter(isnull(df[col[0]])).count() for col in df.dtypes }
Dict_Null3

In [None]:
# Cast the boolean columns to integers (true -> 1, false -> 0); every other
# column is passed through unchanged and the column order is preserved.
# One select() replaces the per-column withColumn() loop, which would add a
# projection to the query plan on every iteration.
df = df.select(
    *[
        col(name).cast("integer").alias(name) if name in bool_cols else col(name)
        for name in df.columns
    ]
)

In [None]:
# 70/30 train/test split.  The seed is fixed so the same rows land in each
# split on every run; without it the split (and every metric computed from
# it) changes from run to run.
(train, test) = df.randomSplit([0.7, 0.3], seed=42)
train.count(), test.count()
# df.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
sIndexer = [StringIndexer(inputCol=cols, outputCol=cols+"Index") for cols in  string_cols]

In [None]:
assembler = VectorAssembler(inputCols=[s.getOutputCol() for s in sIndexer]+bool_cols+num_cols, outputCol='features')

In [None]:
# reg = 0,
lr = LogisticRegression(featuresCol="features", labelCol=label)

In [None]:
pipeline = Pipeline(stages=sIndexer+[assembler, lr])
model = pipeline.fit(train)


In [None]:
preds = model.transform(train)
print("Prediction")
preds.select("Severity","prediction").show(20)

# evaluate the accuracy of the model using the test set
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol="Severity")
accuracy = evaluator.evaluate(preds)

print()
print('#####################################')
print(f"Accuracy is {accuracy}")
print('#####################################')
print()


In [None]:
!pip install onnx
!pip install onnxmltools
!pip install mlflow
!pip install onnxruntime

In [None]:
from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple
from onnxmltools.convert.common.data_types import StringTensorType, FloatTensorType
import mlflow.onnx

In [None]:
import os

# Create one output directory per export format.
# exist_ok=True makes the call a no-op when the directory is already there,
# which replaces the separate exists()-then-mkdir() check and keeps the cell
# safe to re-run.
for out_dir in ('model_onnx/', 'model_spark/', 'model_mlflow'):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
%%time
# Native ONNX framework
initial_types = buildInitialTypesSimple(df.drop('Severity'))
# initial_types
onnx_model = convert_sparkml(model, 'Pyspark model', initial_types, spark_session = spark)

In [None]:
%%time
with open(os.path.join("model_onnx/", "model.onnx"), "wb") as f:
    f.write(onnx_model.SerializeToString())

In [None]:
%%time 
# Pyspark native
model.save('model_spark/spark_model')

In [None]:
%%time

# ONNX on MLFlow
mlflow.onnx.save_model(onnx_model,"model_mlflow/mlflow_model")

In [None]:
# List the working directory (long format, human-readable sizes, oldest
# first) to inspect the exported model folders.
!ls -ltrh

In [None]:
!pip install py-cpuinfo

In [None]:
import platform
# Show the processor name as reported by the standard library.
platform.processor()
# import cpuinfo

# 'Intel64 Family 6 Model 23 Stepping 6, GenuineIntel'

In [None]:
import cpuinfo
# Query py-cpuinfo once and display the CPU's 'brand_raw' entry.
cpu_details = cpuinfo.get_cpu_info()
cpu_details['brand_raw']

In [None]:
# from pyspark.sql.functions import array_contains, col, explode
# def BinaryEncoder(df, column_name):
#     col_vals = [
#         x[0] for x in 
#         df.select(column_name).distinct().orderBy(column_name).collect()]
#     print(col_vals)
    
    
    
# %time BinaryEncoder(df_1, "Side")