In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import findspark
from collections import Counter
import pandas as pd
import numpy as np
import scipy as sp
import imblearn as im
import pyspark

In [None]:
from pyspark.sql import functions as F
from imblearn.under_sampling import RandomUnderSampler
from pyspark.sql.types import *

In [None]:
sc = pyspark.SparkContext(appName="US-Accidents")

In [None]:
sql = pyspark.SQLContext(sc)

In [None]:
df = sql.read.format("com.databricks.spark.csv").option("header", "true").load("../input/us-accidents")

In [None]:
# Keep only the id, time, position, road side, weather and daylight columns.
selected_df = df.select(
    "ID",
    "Start_Time",
    "Start_Lat",
    "Start_Lng",
    "Side",
    "Temperature(F)",
    "Humidity(%)",
    "Visibility(mi)",
    "Wind_Direction",
    "Wind_Speed(mph)",
    "Weather_Condition",
    "Sunrise_Sunset",
)

In [None]:
def to_null(c):
    """Return column ``c`` with missing-looking values replaced by NULL.

    A value is kept only when it is not NULL, not NaN and not an empty or
    whitespace-only string. For every other value the ``when`` has no matching
    branch, so the resulting expression evaluates to NULL.

    Args:
        c: Name of the column to clean.

    Returns:
        A Column expression (not yet aliased) holding the cleaned values.
    """
    column = F.col(c)
    looks_missing = column.isNull() | F.isnan(column) | (F.trim(column) == "")
    return F.when(~looks_missing, column)


selected_df = selected_df.select([to_null(c).alias(c) for c in selected_df.columns]).na.drop()

In [None]:
selected_df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in selected_df.columns]).show()

In [None]:
selected_df.select(['Temperature(F)', 'Visibility(mi)']).summary().show()

In [None]:
#selected_df.show()
# Convert the selected columns to doubles.
# NOTE(review): ID is cast like the other columns but is dropped again a few cells later.
selected_df = selected_df.withColumn("ID", selected_df["ID"].cast(DoubleType()))

# Start_Time: parse the text into a timestamp, then cast that timestamp to a double.
selected_df = selected_df.withColumn(
    "Start_Time", F.to_timestamp("Start_Time").cast(DoubleType())
)

for _coord in ("Start_Lat", "Start_Lng"):
    selected_df = selected_df.withColumn(_coord, selected_df[_coord].cast(DoubleType()))

# Side: "R" -> 0.0, "L" -> 1.0; any other value matches no branch and becomes NULL.
selected_df = selected_df.withColumn(
    "Side",
    F.when(F.col("Side") == "R", "0").when(F.col("Side") == "L", "1").cast(DoubleType()),
)

for _measure in ("Temperature(F)", "Humidity(%)", "Visibility(mi)", "Wind_Speed(mph)"):
    selected_df = selected_df.withColumn(_measure, selected_df[_measure].cast(DoubleType()))

# Wind_Direction and Weather_Condition are left as text here (their casts were disabled).

# Sunrise_Sunset: "Night" -> 1.0, "Day" -> 0.0; any other value becomes NULL.
selected_df = selected_df.withColumn(
    "Sunrise_Sunset",
    F.when(F.col("Sunrise_Sunset") == "Night", "1")
    .when(F.col("Sunrise_Sunset") == "Day", "0")
    .cast(DoubleType()),
)

In [None]:
temp_wind_direction = selected_df.groupBy("Wind_Direction").count()

In [None]:
temp_wind_direction = temp_wind_direction.select("Wind_Direction")

In [None]:
from pyspark.sql.window import Window
w = Window.orderBy("Wind_Direction")
wind_direction_table = temp_wind_direction.select("Wind_Direction").withColumn("id_wind_direction", F.row_number().over(w))

In [None]:
selected_df = selected_df.join(wind_direction_table, on = "Wind_Direction", how = "inner")

In [None]:
temp = selected_df.groupBy("Weather_Condition").count()

In [None]:
w = Window.orderBy("Weather_Condition") 
weather_con_table = temp.select("Weather_Condition").withColumn("id_weather_condition", F.row_number().over(w))

In [None]:
selected_df = selected_df.join(weather_con_table, on = "Weather_Condition", how = "inner")

In [None]:
selected_df = selected_df.withColumn("id_weather_condition", selected_df.id_weather_condition.cast(DoubleType()))

In [None]:
selected_df = selected_df.select("ID","Start_Time","Start_Lat","Start_Lng", "Side", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition", "Sunrise_Sunset")

In [None]:
selected_df.printSchema()

In [None]:
selected_df = selected_df.drop("ID")

In [None]:
selected_df = selected_df.select("*").where(selected_df['Side'] == 0.0).where(selected_df['Sunrise_Sunset'] == 0.0)

In [None]:
selected_df = selected_df.drop("Sunrise_Sunset", "Side")

In [None]:
selected_df1 = selected_df.select("*").filter("id_weather_condition < 23  OR id_weather_condition > 69 AND id_weather_condition < 70")

In [None]:
selected_df1 = selected_df1.select("*").filter((selected_df1["Humidity(%)"] > 28.5)&(selected_df1["Humidity(%)"] < 73) )

In [None]:
# Turn the numeric Start_Time back into a timestamp for the date filter below.
# The column is resolved on selected_df1 itself instead of being borrowed from
# the separate DataFrame `selected_df`.
selected_df1 = selected_df1.withColumn("Start_Time", selected_df1["Start_Time"].cast(TimestampType()))

In [None]:
selected_df1.count()

In [None]:
selected_df1 = selected_df1.select("*").filter(selected_df1['Temperature(F)'] < 100 )

In [None]:
selected_2016 = selected_df1.select('*').filter(selected_df1['Start_Time'] <= F.lit('2016-06-01 00:00:00'))

In [None]:
selected_2016 = selected_2016.withColumn("Start_Time", selected_2016.Start_Time.cast(DoubleType()))

In [None]:
selected_2016.count()

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

In [None]:
selected_2016.printSchema()

In [None]:
FEATURES_COL = ["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"]

In [None]:
FEATURES_COL_1 = ["Temperature(F)", "Humidity(%)", "Visibility(mi)"]

In [None]:
selected_2016.printSchema()

In [None]:
# unlist = udf(lambda x: round(float(list(x)[0]),12), DoubleType())

In [None]:
pd_df = selected_2016.toPandas()

In [None]:
from sklearn.preprocessing import StandardScaler 
features = pd_df[FEATURES_COL_1]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)

In [None]:
pd_df[FEATURES_COL_1] = features

In [None]:
#convert from pandas df into pyspark df
from pyspark.sql.types import *

# Auxiliar functions
def equivalent_type(f):
    """Map a pandas/numpy dtype (or its name) to the matching Spark SQL type.

    Args:
        f: A dtype object or dtype name; it is compared with ``==`` against
            names such as ``'float64'``.

    Returns:
        A Spark data type instance. ``float64`` maps to ``DoubleType`` (64-bit)
        so values keep their full precision; ``float32`` maps to the 32-bit
        ``FloatType``. Anything unrecognised falls back to ``StringType``.
    """
    if f == 'datetime64[ns]':
        return TimestampType()
    elif f == 'int64':
        return LongType()
    elif f == 'int32':
        return IntegerType()
    elif f == 'float64':
        # 64-bit floats need DoubleType; FloatType would truncate them to 32 bits.
        return DoubleType()
    elif f == 'float32':
        return FloatType()
    else:
        return StringType()

def define_structure(string, format_type):
    """Build the schema field for one column.

    Args:
        string: Column name.
        format_type: The column's pandas dtype, passed to ``equivalent_type``.

    Returns:
        A ``StructField`` named ``string``. If the dtype lookup raises, the
        field falls back to ``StringType``.
    """
    try:
        typo = equivalent_type(format_type)
    except Exception:
        # Best-effort fallback for dtypes that cannot be compared/mapped.
        # Only ordinary errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        typo = StringType()
    return StructField(string, typo)

def pandas_to_spark(pandas_df):
    """Convert a pandas DataFrame into a Spark DataFrame.

    One schema field is built per column from the column name and its pandas
    dtype (via ``define_structure``); the frame is then handed to the
    notebook-level ``sql`` context together with that explicit schema.

    Args:
        pandas_df: The pandas DataFrame to convert.

    Returns:
        The result of ``sql.createDataFrame`` for ``pandas_df`` and the schema.
    """
    fields = [
        define_structure(name, dtype)
        for name, dtype in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    return sql.createDataFrame(pandas_df, StructType(fields))

spark_df = pandas_to_spark(pd_df)

In [None]:
spark_df1 = spark_df.select("Temperature(F)", "Humidity(%)", "Visibility(mi)")

In [None]:
vecAssembler = VectorAssembler(inputCols=FEATURES_COL_1, outputCol="features")
scaled_final1 = vecAssembler.transform(spark_df1)

In [None]:
scaled_2016.printSchema()

In [None]:
final_scaled = scaled_final.select("selected_features")

In [None]:
scaled_data = scaled_2016.select("selected_features")

In [None]:
MinMaxScalerizer=MinMaxScaler().setMin(0).setMax(1).setInputCol("selected_features").setOutputCol("features")
scaler = MinMaxScalerizer.fit(final_scaled)
features = scaler.transform(final_scaled)

In [None]:
features.printSchema()

In [None]:
features_1 = features.select("features")

In [None]:
final_data = features.select("features")

In [None]:
scaled_nyo = scaled_final1.select("features")

In [None]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans

# Trains a bisecting k-means model.
# bkm = BisectingKMeans().setK(3).setSeed(1)
# model = bkm.fit(df_final)

# kmeans = KMeans().setK(2).setSeed(1)
# model = kmeans.fit(df_final)

# Two-cluster k-means with a fixed seed, fitted on the "features" column of scaled_nyo.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(scaled_nyo)
# centers = model.clusterCenters()


In [None]:
# `nyo` is never assigned anywhere in this notebook, so an unconditional
# `del nyo` raises NameError on a fresh kernel. Delete it only if it exists.
if "nyo" in globals():
    del nyo

In [None]:
# temp_wind_direction.show()

In [None]:
# from pyspark.sql.window import Window
# w = Window.orderBy("Wind_Direction")
# wind_direction_table = temp_wind_direction.select("Wind_Direction").withColumn("id_wind_direction", row_number().over(w))

In [None]:
# wind_direction_table.show(50)

In [None]:
# selected_df = selected_df.join(wind_direction_table, on = "Wind_Direction", how = "inner")

In [None]:
# selected_df.printSchema()

In [None]:
# from pyspark.sql.window import Window 
# w = Window.orderBy("Weather_Condition") 
# weather_con_table = temp.select("Weather_Condition").withColumn("id_weather_condition", row_number().over(w))

In [None]:
# weather_con_table.show(150, False)

In [None]:
# selected_df = selected_df.join(weather_con_table, on = "Weather_Condition", how = "inner")

In [None]:
# selected_df.select("Weather_Condition","id_weather_condition").show()

In [None]:
#from pyspark.sql.functions import isnan, when, count, col
#real_df.select([count(when(isnan(c), c)).alias(c) for c in real_df.columns]).show()
#nama_colom = ("ID", "Severity","Start_Time","Start_Lat","Start_Lng", "Side", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "Wind_Direction", "Wind_Speed(mph)", "Weather_Condition", "Sunrise_Sunset")
#for i in range(len(nama_colom)):
#  selected_df = selected_df.where(col(nama_colom[i]).isNotNull())

In [None]:
# selected_df = selected_df.withColumn("id_weather_condition", selected_df.id_weather_condition.cast(DoubleType()))

In [None]:
#selected_df = selected_df.select("ID","Start_Time","Start_Lat","Start_Lng", "Side", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition", "Sunrise_Sunset")

In [None]:
#selected_df.printSchema()

In [None]:
# selected_df.select("Sunrise_Sunset").groupby(["Sunrise_Sunset"]).count().show()

In [None]:
#selected_df.printSchema()

In [None]:
# selected_df.select("*").groupby(["Side"]).count().show()

In [None]:
#selected_df.dropna()

In [None]:
#selected_df.show(5)

In [None]:
#selected_df = selected_df.drop("ID")

In [None]:
#selected_df.show(5)

In [None]:
selected_df = selected_df.select("*").where(selected_df['Side'] == 0.0).where(selected_df['Sunrise_Sunset'] == 0.0)

In [None]:
# selected_df.summary().show()

In [None]:
selected_df.drop("Sunrise_Sunset")

In [None]:
selected_df.drop("Side")

In [None]:
#up_avg = selected_df.select('*').filter(selected_df["Wind_Speed(mph)"] > 8)
#down_avg = selected_df.select("*").filter(selected_df["Wind_Speed(mph)"] < 8)

In [None]:
#up_avg.select("*").groupby("Side").count().show()

In [None]:
#down_avg.select("*").groupby("side").count().show()

In [None]:

# weather_gram = selected_df.select("id_weather_condition").rdd.flatMap(lambda x: x).histogram(11)

In [None]:
# import pandas as pd
# pd.DataFrame(
#     list(zip(*weather_gram)), 
#     columns=['bin', 'frequency']
# ).set_index(
#     'bin'
# ).plot(kind='bar');

In [None]:
# humidity_gram = selected_df.select("Humidity(%)").filter("id_weather_condition < 23  OR id_weather_condition > 67.5 AND id_weather_condition < 69").rdd.flatMap(lambda x: x).histogram(11)

In [None]:
# pd.DataFrame(
#     list(zip(*humidity_gram)), 
#     columns=['bin', 'frequency']
# ).set_index(
#     'bin'
# ).plot(kind='bar');

In [None]:
selected_df1 = selected_df.select("*").filter("id_weather_condition < 23  OR id_weather_condition > 69 AND id_weather_condition < 70")

In [None]:
selected_df1 = selected_df1.select("*").filter((selected_df1["Humidity(%)"] > 28.5)&(selected_df1["Humidity(%)"] < 73) )

In [None]:
# selected_df1.summary().show()

In [None]:
# Turn the numeric Start_Time into a timestamp. The column is resolved on
# selected_df1 itself instead of being borrowed from the separate DataFrame
# `selected_df`.
selected_df1 = selected_df1.withColumn("Start_Time", selected_df1["Start_Time"].cast(TimestampType()))

In [None]:
# selected_df1.printSchema()

In [None]:
# selected_df1.show()

In [None]:
selected_df1 = selected_df1.select("*").filter(selected_df1['Temperature(F)'] < 100 )

In [None]:
# selected_df1.summary().show()

In [None]:
selected_df1 = selected_df1.drop("Side")

In [None]:
selected_df1= selected_df1.drop("Sunrise_Sunset")

In [None]:
# selected_df1.show()

In [None]:
# selected_2020 = selected_df1.select('*').filter("Start_Time > '2020'")
# selected_2019 = selected_df1.select('*').filter("Start_Time > '2019' AND Start_Time < '2020'")
# selected_2018 = selected_df1.select("*").filter("Start_Time > '2018' AND Start_Time < '2019'")
# selected_2017 = selected_df1.select('*').filter("Start_Time > '2017' AND Start_Time < '2018'")
# selected_2016 = selected_df1.select('*').filter("Start_Time < '2016-02-01'")

In [None]:
# import pyspark.sql.functions as F
# Rows whose Start_Time is on or before 2016-06-01 00:00:00.
selected_2016 = selected_df1.filter(
    F.col("Start_Time") <= F.lit('2016-06-01 00:00:00')
)
# selected_20161 = selected_df1.select('*').filter(selected_df1['Start_Time'] < F.lit('2017-01-01 00:00:00'))

In [None]:
# selected_2016.show()

In [None]:
# # selected_2020.summary().show()
# # selected_2019.summary().show()
# # selected_2018.summary().show()
# # selected_2017.summary().show()
# selected_2016.summary().show()

In [None]:
# # selected_2020 = selected_2020.withColumn("Start_Time", selected_2020.Start_Time.cast(DoubleType()))
# # selected_2019 = selected_2019.withColumn("Start_Time", selected_2019.Start_Time.cast(DoubleType()))
# # selected_2018 = selected_2018.withColumn("Start_Time", selected_2018.Start_Time.cast(DoubleType()))
# # selected_2017 = selected_2017.withColumn("Start_Time", selected_2017.Start_Time.cast(DoubleType()))
# Cast Start_Time to a double so it can be used as a numeric feature.
selected_2016 = selected_2016.withColumn(
    "Start_Time", F.col("Start_Time").cast(DoubleType())
)
# selected_20161 = selected_20161.withColumn("Start_Time", selected_2016.Start_Time.cast(DoubleType()))

In [None]:
# Note:
# Choosing a high number of standard deviations makes it possible to analyse the gaps between data points,
# so that the most suitable data with a distinct character can be selected.
# Once the gaps that have a distinct character are found, the values at which accidents occur most often are selected.
# After making that selection ... (sentence left unfinished)

In [None]:
# selected_df1.printSchema()

In [None]:
# Separator: the part below uses pandas

In [None]:
# # pandasDF1 = selected_2020.toPandas()
# # pandasDF2 = selected_2019.toPandas()
# # pandasDF3 = selected_2018.toPandas()
# #pandasDF4 = selected_2017.toPandas()
# pandasDF5 = selected_2016.toPandas()
# pandasDF6 = selected_20161.toPandas()

In [None]:
# import sklearn
# from sklearn.preprocessing import MinMaxScaler

In [None]:
# print(pandasDF5)

In [None]:
# features = ["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"]

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
# df1_scaled20 = scaler.fit_transform(pandasDF1)

In [None]:
# # df1_scaled19 = scaler.fit_transform(pandasDF2)
# # df1_scaled18 = scaler.fit_transform(pandasDF3)
# # df1_scaled17 = scaler.fit_transform(pandasDF4)
# df1_scaled16 = scaler.fit_transform(pandasDF5)

In [None]:
# print(df1_scaled16)

In [None]:
# # print(df1_scaled16)
# print(type(df1_scaled16))

In [None]:
# df1_scaled16 = pd.DataFrame(df1_scaled16, columns =["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"])

In [None]:
# df1_scaled20 = pd.DataFrame(df1_scaled20, columns =["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"])
# df1_scaled19 = pd.DataFrame(df1_scaled19, columns =["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"])
# df1_scaled18 = pd.DataFrame(df1_scaled18, columns =["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"])
# df1_scaled17 = pd.DataFrame(df1_scaled17, columns =["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"])

In [None]:
# pandasDF5.info(verbose=True)

In [None]:
# import numpy as np 
# import pandas as pd 
# from mlxtend.frequent_patterns import apriori, association_rules
# from datetime import datetime
# from pandas import DataFrame
# import missingno as msno
# import scipy.cluster.hierarchy as sch
# import plotly.figure_factory as ff
# from sklearn.cluster import AgglomerativeClustering 
# import matplotlib.pyplot as plt
# from sklearn.metrics import silhouette_score 


In [None]:
# print(df1_scaled16)

In [None]:
# After this point the run hits an out-of-memory error

In [None]:
#dendrogram = sch.dendrogram(sch.linkage(df1_scaled16, method='ward'))

In [None]:
# X = df1_scaled16.iloc[:,[0,3]].values
# Y = pandasDF6.iloc[:,[0,3]].values

###### 

In [None]:
# print(X)
# print(X.shape)
# print(pandasDF6.shape)

In [None]:
# dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))

In [None]:
# ac2 = AgglomerativeClustering(n_clusters = 2, affinity = 'euclidean', linkage ='ward')
# plt.scatter(X[:,0], X[:,1],  
#            c = ac2.fit_predict(X), cmap ='rainbow') 
# plt.show() 

In [None]:
# ac3 = AgglomerativeClustering(n_clusters = 3, affinity = 'euclidean', linkage ='ward')
# plt.scatter(X[:,0], X[:,1],  
#            c = ac3.fit_predict(X), cmap ='rainbow') 
# plt.show() 


In [None]:

# ac4 = AgglomerativeClustering(n_clusters = 4, affinity = 'euclidean', linkage ='ward')
# plt.scatter(X[:,0], X[:,1],  
#            c = ac4.fit_predict(X), cmap ='rainbow') 
# plt.show() 



In [None]:
# Separator: the part below uses PySpark

In [None]:
# import itertools
# from itertools import combinations
# features = ["Start_Time","Start_Lat","Start_Lng", "Side", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition", "Sunrise_Sunset"]
# print(list(combinations(features, 2)))

In [None]:
# selected_df.printSchema()

In [None]:
# corr_list = []
# for item in list(combinations(features, 2)):
#    corr_list.append({
#        'feature1': item[0],
#        'feature2': item[1],
#        'corr': selected_2016.stat.corr(item[0], item[1])
#    })
#    print(item[0] + " & " + item[1] + " ---> " + str(selected_2016.stat.corr(item[0], item[1])))

Pisah Real One

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import LongType
import copy

In [None]:
# Copy the schema and extend it with a field for the row number that
# zipWithIndex() appends to every row in the cells below. Without the extra
# field each row carries one more value than the schema describes.
# NOTE(review): the field name "index" is a choice made here -- rename if needed.
_schema = copy.deepcopy(selected_2016.schema)
_schema.add("index", LongType(), False)

In [None]:
_X = selected_2016.rdd.zipWithIndex().map(lambda l: list(l[0]) + [l[1]]).toDF(_schema)

In [None]:

_selected_2016 = selected_2016.rdd.zipWithIndex().map(lambda l: list(l[0]) + [l[1]]).toDF(_schema)

In [None]:
print('Schema of X: ' + str(selected_2016.schema))
print('Schema of _X: ' + str(_X.schema))
_X.printSchema()

In [None]:
#  selected_df.show(5)

In [None]:
_selected_2016 = selected_2016
_selected_2016.show(5)

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

In [None]:
# UDF that takes the first element of its (list-convertible) input and returns
# it as a double rounded to 12 decimal places.
unlist = udf(lambda vec: round(float(list(vec)[0]), 12), DoubleType())

In [None]:
# ## Coba Scaling menjadi 1 feature yang dinama e features
vecAssembler = VectorAssembler(inputCols=["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"], outputCol="features",
   handleInvalid="keep")
scaled_2016 = vecAssembler.transform(selected_2016)

In [None]:
# scaled_2016.select("features").show()

In [None]:
from pyspark.ml.feature import MinMaxScaler
# Let us create an object of MinMaxScaler class
MinMaxScalerizer=MinMaxScaler().setMin(0).setMax(1).setInputCol("features").setOutputCol("Scaled_features")
scaled_2016=MinMaxScalerizer.fit(scaled_2016).transform(scaled_2016)

In [None]:
scaled_2016.printSchema()

In [None]:
scaled_2016.select("Scaled_features").show(10, False)

In [None]:
# coba = scaled_2016.select("features")

In [None]:
# coba1 = scaled_2016.select("features")

In [None]:
# coba.show()

In [None]:
# c = Window.orderBy("features")
# coba = coba.withColumn("id_coba", row_number().over(c))

In [None]:
# coba.show(5)

In [None]:
# from pyspark.mllib.clustering import KMeans
# clusters = KMeans.train(coba1, 2, maxIterations=10, initializationMode='random')

In [None]:
# from pyspark.ml.clustering import KMeans
# from pyspark.ml.evaluation import ClusteringEvaluator

# kmeans = KMeans().setK(6).setSeed(1)
# model = kmeans.fit(coba.select('features'))
# predictions = model.transform(coba)
# silhouette = evaluator.evaluate(predictions)
# print("Silhouette with squared euclidean distance = " + str(silhouette))

In [None]:
coba = scaled_2016.select("Scaled_features")
# coba.show(5,False)

In [None]:
coba1 = coba.selectExpr("Scaled_features as features")
coba1.printSchema()
coba1.show(10,False)

In [None]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Train a bisecting k-means model with three clusters and a fixed seed.
# Note that this rebinds `model`.
bkm = BisectingKMeans(k=3, seed=1)
model = bkm.fit(coba1)

# Make predictions
# predictions = model.transform(scaled_2016["Scaled_features"])

# # Evaluate clustering by computing Silhouette score
# evaluator = ClusteringEvaluator()

# silhouette = evaluator.evaluate(predictions)
# print("Silhouette with squared euclidean distance = " + str(silhouette))

# # Shows the result.
# print("Cluster Centers: ")
# centers = model.clusterCenters()
# for center in centers:
#     print(center)

In [None]:
# features = ["Start_Time","Start_Lat","Start_Lng", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition"]
# for i in features:
#     #VectorAssembler Transformation - Converting column to vector type
#     assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")

#     #MinMaxScaler Transformation
#     scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")

#     #Pipeline of VectorAssembler and MinMaxScaler
#     pipeline = Pipeline(stages=[assembler, scaler])

#     #Fitting pipeline on dataframe
#     selected_2016 = pipeline.fit(selected_2016).transform(selected_2016).withColumn(i+"_Scaled", unlist(i+"_Scaled")).drop(i+"_Vect")



In [None]:
# _selected_2016.printSchema()

In [None]:
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import MinMaxScaler


In [None]:
# columns_to_scale = features
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(selected_2016)
# scaledData = scalerModel.transform(selected_2016)

In [None]:
# selected_df1.summary().show()
    

In [None]:
# scaledData.show(5)

In [None]:
# Start_Lat,Start_Lng,Temperature(F),Wind_speed(mph)

In [None]:
# _normalize_selected_2016 = _selected_2016.select("Start_Lat", "Start_Lng", "Temperature(F)", "Wind_speed(mph)")
# _normalize_selected_2016 = _selected_2016

In [None]:
# _normalize_selected_2016.printSchema()

In [None]:
# col_name = ["Start_Time","Start_Lat","Start_Lng"]


In [None]:
#"Start_Time","Start_Lat","Start_Lng", "Side", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "id_wind_direction", "Wind_Speed(mph)", "id_weather_condition", "Sunrise_Sunset"

In [None]:
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in features]
# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled").setMax(1).setMin(0) for col in features]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(selected_2016)

# scaledData = scalerModel.transform(selected_2016)

In [None]:
# col_name2 = ["Temperature(F)", "Humidity(%)", "Visibility(mi)"]

In [None]:
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in col_name2]
# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled").setMax(1).setMin(0) for col in col_name2]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(selected_df1)

# scaledData2 = scalerModel.transform(selected_df1)

In [None]:
# col_name3 = ["id_wind_direction","id_weather_condition"]

In [None]:
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in col_name3]
# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled").setMax(1).setMin(0) for col in col_name3]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(selected_df1)

# scaledData3 = scalerModel.transform(selected_df1)

In [None]:
# scaledData.printSchema()

In [None]:
# scaledData2.printSchema()

In [None]:
# scaledData3.printSchema()

In [None]:
#scaledData.show(5)

In [None]:
#scaledData.printSchema()

In [None]:
# from pyspark.ml.linalg import Vectors
# from pyspark.sql.functions import udf
# from pyspark.sql.types import DoubleType

In [None]:
# col_name_scaledData = ["Start_Time_scaled","Start_Lat_scaled","Start_Lng_scaled", "Temperature(F)_scaled", "Humidity(%)_scaled", "Visibility(mi)_scaled", "id_wind_direction_scaled", "Wind_Speed(mph)_scaled", "id_weather_condition_scaled"]

In [None]:
# for col in range(9):
#     assemblers = VectorAssembler(inputCols=[col_name_scaledData[col]], outputCol=col_name_scaledData[col] + "_final")
#     scaledData1 = assemblers.transform(scaledData)
#     unlist = udf(lambda x: float(list(x)[0]), DoubleType())
#     scaledData1 = scaledData1.withColumn(col_name_scaledData[col], unlist(col_name_scaledData[col]+"_final"))

In [None]:
# scaledData1 is only assigned inside the commented-out loop in the previous
# cell, so calling this raises NameError on a fresh kernel. Kept disabled like
# the other scaledData*.printSchema() cells.
# scaledData1.printSchema()

In [None]:
# col_name_scaledData2 = ["Temperature(F)_scaled", "Humidity(%)_scaled", "Visibility(mi)_scaled"]

In [None]:
# for col in range(3):
#     assemblers = VectorAssembler(inputCols=[col_name_scaledData2[col]], outputCol=col_name_scaledData2[col] + "_final")
#     scaledData2 = assemblers.transform(scaledData2)
#     unlist = udf(lambda x: float(list(x)[0]), DoubleType())
#     scaledData2 = scaledData2.withColumn(col_name_scaledData2[col], unlist(col_name_scaledData2[col]+"_final"))

In [None]:
# scaledData2.printSchema()

In [None]:
# col_name_scaledData3 = ["id_wind_direction_scaled","id_weather_condition_scaled"]

In [None]:
# for col in range(2):
#     assemblers = VectorAssembler(inputCols=[col_name_scaledData3[col]], outputCol=col_name_scaledData3[col] + "_final")
#     scaledData3 = assemblers.transform(scaledData3)
#     unlist = udf(lambda x: float(list(x)[0]), DoubleType())
#     scaledData3 = scaledData3.withColumn(col_name_scaledData3[col], unlist(col_name_scaledData3[col]+"_final"))

In [None]:
# scaledData3.printSchema()

In [None]:
#scaledData3.show(5)

In [None]:
# TakeData1 = scaledData1.select("Start_Time_scaled","Start_Lat_scaled","Start_Lng_scaled")

In [None]:
# TakeData1.show(5)

In [None]:
# TakeData2 = scaledData2.select("Temperature(F)_scaled", "Humidity(%)_scaled", "Visibility(mi)_scaled")

In [None]:
# TakeData2.printSchema()

In [None]:
# TakeData3 = scaledData3.select("id_wind_direction_scaled", "id_weather_condition_scaled")

In [None]:
# TakeData3.printSchema()

In [None]:
# TakeData4 = selected_df1.select("Side")

In [None]:
# TakeData4.printSchema()

In [None]:
# DataFinal = scaledData1.select("Start_Time_scaled","Start_Lat_scaled","Start_Lng_scaled", "Temperature(F)_scaled", "Humidity(%)_scaled", "Visibility(mi)_scaled", "id_wind_direction_scaled", "Wind_Speed(mph)_scaled", "id_weather_condition_scaled")

In [None]:
# DataFinal = DataFinal.join(TakeData4)

In [None]:
# DataFinal.printSchema()

In [None]:
# DataFinal = DataFinal.join(TakeData2)

In [None]:
# DataFinal = DataFinal.join(TakeData3)

In [None]:
# DataFinal.printSchema()

In [None]:
# DataVector = scaledData1.select("Start_Time_scaled_final","Start_Lat_scaled_final","Start_Lng_scaled_final", "Temperature(F)_scaled_final", "Humidity(%)_scaled_final", "Visibility(mi)_scaled_final", "id_wind_direction_scaled_final", "Wind_Speed(mph)_scaled_final", "id_weather_condition_scaled_final")

In [None]:
# DataVector.printSchema()

In [None]:
# from pyspark.ml.clustering import BisectingKMeans
# from pyspark.ml.evaluation import ClusteringEvaluator
# from pyspark.ml.evaluation import *


In [None]:
# vecAssembler = VectorAssembler(inputCols=DataFinal.columns, outputCol="features")
# vector_df = vecAssembler.transform(DataFinal)

In [None]:
# bkm = BisectingKMeans().setK(2).setSeed(1)

In [None]:
# model = bkm.fit(DataFinal)

In [None]:
#DataFinal.printSchema()

In [None]:
# assembler = VectorAssembler(
#    inputCols=["Start_Time_scaled","Start_Lat_scaled","Start_Lng_scaled","Temperature(F)_scaled", "Humidity(%)_scaled", "Visibility(mi)_scaled","id_wind_direction_scaled", "id_weather_condition_scaled"],
#    outputCol="features")


In [None]:

# output = assembler.transform(DataFinal)


In [None]:
# output.printSchema()


In [None]:
# coba1 = output.select("features")

In [None]:
# from pyspark.sql.functions import monotonically_increasing_id 

# coba1 = coba1.select("*").withColumn("id", monotonically_increasing_id())

In [None]:
# coba1.printSchema()

In [None]:
# bkm = BisectingKMeans().setK(3).setSeed(1)

In [None]:
# model = bkm.fit(coba1)

In [None]:
# DataFinal.printSchema()

In [None]:
# DataFinal.show(5)

In [None]:
#sc.stop()