In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, hour, minute, lpad, rpad, substring, year, month, dayofmonth, lit, to_timestamp, expr,split,explode,split
from pyspark.sql.functions import isnan, when, count, col,isnull
from pyspark.mllib import *

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import numpy as np
#spark = SparkSession.builder.getOrCreate()

sqlContext = SQLContext(sc)

In [0]:
df = spark.read.option("header", "true").parquet(f"dbfs:/mnt/mids-w261/team20SSDK/final_datasets/data_range/")



### The Dataset is hugely imbalanced, We'll balance it with class weighings

In [0]:
dataset_size = df.count()
print(f'dataset_size = {dataset_size}')

In [0]:
num_delayed = df.filter(df['DEP_DEL15'] == 1).count()
print(f'num_delayed = {num_delayed}')


num_not_delayed = df.filter(df['DEP_DEL15'] == 0).count()
print(f'num_not_delayed = {num_not_delayed}')

In [0]:

BalancingRatio= num_not_delayed /dataset_size
print('BalancingRatio = {}'.format(BalancingRatio))

In [0]:
#Adding the column in the data frame
df =df.withColumn("classWeights", when(df.DEP_DEL15 == 1,BalancingRatio).otherwise(1-BalancingRatio))
df.select("classWeights").show(5)

In [0]:
#Scaling the variables because WND and Temp measurements are scaled by 10.
df = df.withColumn('AVG_WND_SPEED_ORIGIN', df.AVG_WND_SPEED_ORIGIN/10)
df = df.withColumn('AVG_TMP_DEG_ORIGIN', df.AVG_TMP_DEG_ORIGIN/10)
df = df.withColumn('AVG_DEW_DEG_ORIGIN', df.AVG_DEW_DEG_ORIGIN/10)
df = df.withColumn('AVG_WND_SPEED_DEST', df.AVG_WND_SPEED_DEST/10)



In [0]:
#Restricing the dataset to 2015 Q1 and to Origin to ORD and ATL
df_Q1_15 = df.filter(((df['ORIGIN'] =='ORD') | (df['ORIGIN'] =='ATL')) & (df['QUARTER'] == 1) & (df['YEAR'] == 2015))

In [0]:
DROPPED = ['ORIGIN_AIRPORT_SEQ_ID','ORIGIN_CITY_MARKET_ID','ORIGIN_STATE_ABR', 'ORIGIN_STATE_FIPS', 'ORIGIN_STATE_NM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
 'DEST_CITY_MARKET_ID','ORIGIN_CITY_NAME','DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_WAC','TAXI_IN',  'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME','origin_max_date', 'dest_max_date','OP_CARRIER','ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_ID','CRS_DEP_TIME','WHEELS_ON',  'FLIGHTS','DIV_AIRPORT_LANDINGS','ORIGIN_TZ','DEST_TZ','DEST_STATION', 'DEST_STATION_NAME', 'ORIGIN_UTC_ADJ','TAIL_NUM','ORIGIN_TS', 'DEST_TS', 'DEST_UTC', 'ORIGIN_STATION', 'ORIGIN_STATION_NAME','OP_CARRIER_FL_NUM','OP_UNIQUE_CARRIER','FL_DATE']

cat_cols = ['DAY_OF_MONTH', 'DAY_OF_WEEK','OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST', 'DEP_TIME_BLK',  'DISTANCE_GROUP', 'MONTH']

num_cols = [ 'TAXI_OUT','AVG_WND_SPEED_ORIGIN', 'MIN_CIG_HEIGHT_ORIGIN', 'MIN_VIS_DIS_ORIGIN', 'AVG_TMP_DEG_ORIGIN', 'AVG_DEW_DEG_ORIGIN', 'AVG_SLP_ORIGIN', 'AVG_WND_SPEED_DEST', 'MIN_CIG_HEIGHT_DEST','DISTANCE','MIN_VIS_DIS_DEST','AVG_TMP_DEG_DEST', 'AVG_DEW_DEG_DEST', 'AVG_SLP_DEST' ,'WHEELS_OFF','PAGERANK','ORIGIN_FLIGHT_COUNT', 'DEST_FLIGHT_COUNT','DEP_MIN', 'DEP_HOUR', 'ARR_MIN', 'ARR_HOUR']

weights = 'classWeights'

label = 'DEP_DEL15','DEP_DELAY_GROUP'

### One hot encoder

In [0]:
#creating a truncated dataframe 
trunc_df = df_Q1_15[['DAY_OF_MONTH', 'DAY_OF_WEEK','OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST', 'DEP_TIME_BLK',  'DISTANCE_GROUP', 'MONTH','TAXI_OUT','AVG_WND_SPEED_ORIGIN', 'MIN_CIG_HEIGHT_ORIGIN', 'MIN_VIS_DIS_ORIGIN', 'AVG_TMP_DEG_ORIGIN', 'AVG_DEW_DEG_ORIGIN', 'AVG_SLP_ORIGIN', 'AVG_WND_SPEED_DEST', 'MIN_CIG_HEIGHT_DEST','DISTANCE','MIN_VIS_DIS_DEST','AVG_TMP_DEG_DEST', 'AVG_DEW_DEG_DEST', 'AVG_SLP_DEST' ,'WHEELS_OFF',
 'DEP_DELAY_GROUP','PAGERANK','ORIGIN_FLIGHT_COUNT', 'DEST_FLIGHT_COUNT','DEP_MIN', 'DEP_HOUR', 'ARR_MIN', 'ARR_HOUR','classWeights','DEP_DEL15','ORIGIN_UTC']]




In [0]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder
cat_cols_indexed = [x+"_string_indexer" for x in cat_cols]
#print(cat_cols_indexed)
for i in range(0,len(cat_cols)):
  cat_cols_indexed[i] = StringIndexer(inputCol = cat_cols[i] , outputCol= cat_cols[i] +"_StringIndexer",handleInvalid='skip')
  

In [0]:
for i in range(0,len(cat_cols_indexed)):
  trunc_df = cat_cols_indexed[i].fit(trunc_df).transform(trunc_df)

In [0]:
#sending the output of String Indexer into OHE.

from pyspark.ml.feature import OneHotEncoder
cat_cols_one_hot = ['DAY_OF_MONTH_StringIndexer', 'DAY_OF_WEEK_StringIndexer', 'OP_CARRIER_AIRLINE_ID_StringIndexer', 'ORIGIN_StringIndexer', 'DEST_StringIndexer', 'DEP_TIME_BLK_StringIndexer', 'DISTANCE_GROUP_StringIndexer',  'MONTH_StringIndexer']
#print(cat_cols_indexed)
for i in range(0,len(cat_cols_one_hot)):
  cat_cols_one_hot[i] = OneHotEncoder(inputCol = cat_cols_one_hot[i] , outputCol= cat_cols_one_hot[i] +"_ohe")
  


In [0]:
#adding those OHE columnt to the truncated dataframe(with dropped columns)
for i in range(0,len(cat_cols_one_hot)):
  trunc_df = cat_cols_one_hot[i].fit(trunc_df).transform(trunc_df)

In [0]:
#preparing colums for input to vector assembler
encoded_cols = ['DAY_OF_MONTH_StringIndexer_ohe',
 'DAY_OF_WEEK_StringIndexer_ohe',
 'OP_CARRIER_AIRLINE_ID_StringIndexer_ohe',
 'ORIGIN_StringIndexer_ohe',
 'DEST_StringIndexer_ohe',
 'DEP_TIME_BLK_StringIndexer_ohe',
 'DISTANCE_GROUP_StringIndexer_ohe',
 'MONTH_StringIndexer_ohe']

assembler_cols = [num_cols.append(col) for col in encoded_cols]
assembler_cols = num_cols


In [0]:
print(assembler_cols)

In [0]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=assembler_cols,outputCol="features",handleInvalid='skip')

In [0]:
one_hot_encoded_trunc_df =assembler.transform(trunc_df)
one_hot_encoded_trunc_df.select("features").display(truncate=False)

features
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 28, 52, 58, 70, 171, 260, 269, 277), values -> List(16.0, 3.7227272727272727, 2286.0, 16093.0, -2.713636363636364, -13.513636363636362, 10293.454545454546, 2.7666666666666666, 2134.0, 240.0, 16093.0, 11.333333333333334, -88.0, 10295.42857142857, 2200.0, 0.05425050648176137, 1676.0, 13.0, 40.0, 21.0, 48.0, 21.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 39, 52, 59, 70, 183, 252, 267, 277), values -> List(13.0, 2.473684210526316, 91.0, 2012.0, 8.226315789473684, 5.1421052631578945, 10257.526315789473, 3.46, 1372.0, 646.0, 16093.0, 255.1, 205.5, 10212.8, 2020.0, 0.05425050648176137, 1914.0, 24.0, 10.0, 19.0, 21.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 36, 53, 58, 70, 183, 251, 267, 277), values -> List(12.0, 2.2199999999999998, 122.0, 4828.0, 1.3900000000000001, -1.23, 10221.7, 3.6, 91.0, 646.0, 4828.0, 154.72727272727272, 140.36363636363637, 10196.363636363636, 949.0, 0.05425050648176137, 2023.0, 16.0, 40.0, 9.0, 33.0, 11.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 36, 53, 58, 70, 183, 257, 267, 277), values -> List(15.0, 2.8133333333333335, 122.0, 805.0, 1.3, -1.6600000000000001, 10238.2, 3.190909090909091, 61.0, 646.0, 1609.0, 164.0909090909091, 141.36363636363637, 10206.90909090909, 1252.0, 0.05425050648176137, 2023.0, 16.0, 40.0, 12.0, 34.0, 14.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 33, 57, 58, 70, 183, 251, 267, 277), values -> List(13.0, 3.4944444444444445, 274.0, 8047.0, 4.411111111111111, 2.2444444444444445, 10151.333333333334, 2.2333333333333334, 2134.0, 646.0, 12875.0, 154.22222222222223, 142.77777777777777, 10197.333333333334, 948.0, 0.05425050648176137, 1510.0, 16.0, 40.0, 9.0, 33.0, 11.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 25, 52, 58, 70, 183, 257, 267), values -> List(20.0, 2.8, 2134.0, 11265.0, -8.947368421052632, -18.347368421052632, 10317.842105263158, 5.5, 22000.0, 646.0, 16093.0, 82.36363636363636, -54.45454545454545, 10294.90909090909, 1308.0, 0.05425050648176137, 2105.0, 16.0, 48.0, 12.0, 40.0, 14.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 50, 52, 58, 70, 83, 249, 267, 277), values -> List(16.0, 6.184615384615385, 823.0, 14484.0, 4.342307692307692, -4.819230769230769, 10288.807692307691, 4.257142857142857, 22000.0, 594.0, 16093.0, 208.42857142857142, 107.35714285714286, 10233.214285714286, 1352.0, 0.05425050648176137, 2057.0, 428.0, 40.0, 13.0, 36.0, 15.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 50, 52, 62, 83, 257, 272, 277), values -> List(12.0, 4.285185185185186, 762.0, 16093.0, -0.2814814814814815, -8.61111111111111, 10336.333333333334, 4.257142857142857, 22000.0, 1197.0, 16093.0, 208.42857142857142, 107.35714285714286, 10233.214285714286, 1213.0, 0.04521768069470927, 1609.0, 428.0, 5.0, 12.0, 13.0, 16.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 34, 56, 62, 83, 261, 272), values -> List(11.0, 7.058333333333333, 244.0, 2012.0, 1.6375, -4.1875, 10169.916666666666, 5.023076923076923, 975.0, 1197.0, 16093.0, 199.53846153846155, 98.3076923076923, 10154.384615384615, 1552.0, 0.04521768069470927, 1551.0, 407.0, 50.0, 14.0, 57.0, 18.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 278, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 34, 56, 58, 70, 83, 253, 267), values -> List(21.0, 2.670588235294118, 366.0, 11265.0, 5.08235294117647, -0.8411764705882353, 10191.470588235294, 5.023076923076923, 975.0, 594.0, 16093.0, 199.53846153846155, 98.3076923076923, 10154.384615384615, 1525.0, 0.05425050648176137, 1960.0, 407.0, 15.0, 55.0, 16.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


In [0]:
one_hot_encoded_trunc_df.count()

#### Original dataset size was 24603731 and now we are down to 24603083 . We've lost 700 records as a result of 'skip' operation in vector assembler

### Separting 2019 data as test data
We only want 2015-2018 data for training and val.
In the cells below we split the dataset.

In [0]:
df_temp = one_hot_encoded_trunc_df.filter(df['YEAR']<2019)

In [0]:
df_test = one_hot_encoded_trunc_df.filter(df['YEAR']== 2019)

In [0]:
#Validating the total 
print(df_temp.count()+df_test.count())

### Let's do a train Val split- based on timeseries split.

In [0]:
#Renaiming the dependent variable to label
df_temp = df_temp.withColumnRenamed('DEP_DEL15', 'label')


In [0]:
from pyspark.sql import Window
df_temp_order = df_temp.withColumn("time_rank", f.percent_rank().over(Window.partitionBy().orderBy("ORIGIN_UTC")))
#display(df_temp_order)

In [0]:
train_df = df_temp_order.where("time_rank <= .8").drop("time_rank")
val_df = df_temp_order.where("time_rank > .8").drop("time_rank")

print("Train size: ", train_df.count())
print("Test size: ", val_df.count())


### Modeling

### Let's try to map features and coefficients
(reference)[https://stackoverflow.com/questions/42935914/how-to-map-features-from-the-output-of-a-vectorassembler-back-to-the-column-name]

### Let's now try to include the Interaction terms for weather

In [0]:
#New Data frame with interaction terms for ORIGIN weather
train_df_interaction =  train_df.withColumn('combined_weather_ORIGIN',train_df.AVG_WND_SPEED_ORIGIN*train_df.MIN_CIG_HEIGHT_ORIGIN*train_df.MIN_VIS_DIS_ORIGIN*train_df.AVG_TMP_DEG_ORIGIN*train_df.AVG_DEW_DEG_ORIGIN*train_df.AVG_SLP_ORIGIN)



In [0]:
#New Data frame with interaction terms for DEST weather
train_df_interaction =  train_df_interaction.withColumn('combined_weather_DEST',train_df.AVG_WND_SPEED_DEST*train_df.MIN_CIG_HEIGHT_DEST*train_df.MIN_VIS_DIS_DEST*train_df.AVG_TMP_DEG_DEST*train_df.AVG_DEW_DEG_DEST*train_df.AVG_SLP_DEST)




In [0]:
#New Data frame with interaction terms for ORIGIN weather

val_df_interaction =  val_df.withColumn('combined_weather_ORIGIN',train_df.AVG_WND_SPEED_ORIGIN*train_df.MIN_CIG_HEIGHT_ORIGIN*train_df.MIN_VIS_DIS_ORIGIN*train_df.AVG_TMP_DEG_ORIGIN*train_df.AVG_DEW_DEG_ORIGIN*train_df.AVG_SLP_ORIGIN)



In [0]:
#New Data frame with interaction terms for DEST weather

val_df_interaction =  val_df_interaction.withColumn('combined_weather_DEST',train_df.AVG_WND_SPEED_DEST*train_df.MIN_CIG_HEIGHT_DEST*train_df.MIN_VIS_DIS_DEST*train_df.AVG_TMP_DEG_DEST*train_df.AVG_DEW_DEG_DEST*train_df.AVG_SLP_DEST)



In [0]:
#New Data frame with interaction terms for ORIGIN weather


df_test_interaction =  df_test.withColumn('combined_weather_ORIGIN',train_df.AVG_WND_SPEED_ORIGIN*train_df.MIN_CIG_HEIGHT_ORIGIN*train_df.MIN_VIS_DIS_ORIGIN*train_df.AVG_TMP_DEG_ORIGIN*train_df.AVG_DEW_DEG_ORIGIN*train_df.AVG_SLP_ORIGIN)




In [0]:
#New Data frame with interaction terms for DEST weather

df_test_interaction =  df_test_interaction.withColumn('combined_weather_DEST',train_df.AVG_WND_SPEED_DEST*train_df.MIN_CIG_HEIGHT_DEST*train_df.MIN_VIS_DIS_DEST*train_df.AVG_TMP_DEG_DEST*train_df.AVG_DEW_DEG_DEST*train_df.AVG_SLP_DEST)

In [0]:
#Addiing the new cols to assembler
#preparing colums for input to vector assembler
encoded_cols = ['DAY_OF_MONTH_StringIndexer_ohe',
 'DAY_OF_WEEK_StringIndexer_ohe',
 'OP_CARRIER_AIRLINE_ID_StringIndexer_ohe',
 'ORIGIN_StringIndexer_ohe',
 'DEST_StringIndexer_ohe',
 'DEP_TIME_BLK_StringIndexer_ohe',
 'DISTANCE_GROUP_StringIndexer_ohe',
 
 'MONTH_StringIndexer_ohe']

num_cols.append('combined_weather_DEST')
num_cols.append('combined_weather_ORIGIN')

assembler_cols = [num_cols.append(col) for col in encoded_cols]
assembler_cols = num_cols



In [0]:
#Dropping the features column from train:
#df = df.drop("address", "phoneNumber")
train_df_interaction = train_df_interaction.drop('features')


In [0]:
# creating a new feature column with the new interaction term for training
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=assembler_cols,outputCol="features",handleInvalid='skip')

In [0]:
# creating a new feature column with the new interaction term for training
train_df_interaction =assembler.transform(train_df_interaction)
train_df_interaction.select("features").display(truncate=False)

features
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 59, 70, 110, 266, 267, 277, 278, 279, 307, 311, 317, 328, 368, 524, 525, 535, 536, 537, 565, 569, 575, 586, 626, 782, 783, 793), values -> List(9.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 2.909090909090909, 22000.0, 534.0, 16093.0, -27.0, -119.81818181818181, 10240.454545454546, 605.0, 0.05425050648176137, 1086.0, 195.0, 40.0, 5.0, 25.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.412107526464E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.412107526464E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 67, 70, 240, 265, 267, 277, 278, 279, 307, 311, 325, 328, 498, 523, 525, 535, 536, 537, 565, 569, 583, 586, 756, 781, 783, 793), values -> List(11.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 3.56875, 22000.0, 701.0, 16093.0, -46.125, -116.0625, 10214.125, 630.0, 0.05425050648176137, 1086.0, 13.0, 6.0, 50.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.908842406682604E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.908842406682604E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 65, 86, 266, 274, 277, 278, 279, 307, 311, 323, 344, 524, 532, 535, 536, 537, 565, 569, 581, 602, 782, 790, 793), values -> List(12.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 2.46, 610.0, 1440.0, 1609.0, -18.0, -46.4, 10144.8, 511.0, 0.04521768069470927, 936.0, 506.0, 5.0, 48.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.045761312630118E13, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.045761312630118E13, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 90, 266, 270, 277, 278, 279, 307, 311, 319, 348, 524, 528, 535, 536, 537, 565, 569, 577, 606, 782, 786, 793), values -> List(23.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.2846153846153845, 1280.0, 925.0, 12875.0, 28.615384615384617, -23.076923076923077, 10272.076923076924, 537.0, 0.04521768069470927, 936.0, 585.0, 10.0, 5.0, 5.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 66, 77, 266, 272, 277, 278, 279, 307, 311, 324, 335, 524, 530, 535, 536, 537, 565, 569, 582, 593, 782, 788, 793), values -> List(12.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 2.4833333333333334, 427.0, 1182.0, 805.0, 210.25, 200.08333333333334, 10210.166666666666, 538.0, 0.04521768069470927, 936.0, 333.0, 30.0, 5.0, 38.0, 9.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.666388848362216E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.666388848362216E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 79, 266, 270, 277, 278, 279, 307, 311, 319, 337, 524, 528, 535, 536, 537, 565, 569, 577, 595, 782, 786, 793), values -> List(10.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.325, 22000.0, 888.0, 16093.0, -144.625, -203.5, 10234.875, 550.0, 0.04521768069470927, 936.0, 706.0, 33.0, 5.0, 20.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.5460239963548358E17, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.5460239963548358E17, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 49, 53, 59, 81, 266, 269, 277, 278, 279, 307, 311, 317, 339, 524, 527, 535, 536, 537, 565, 569, 575, 597, 782, 785, 793), values -> List(18.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 6.270588235294118, 914.0, 235.0, 16093.0, -71.82352941176471, -139.41176470588235, 10199.35294117647, 608.0, 0.04521768069470927, 936.0, 340.0, 35.0, 5.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 9.419552651970484E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 9.419552651970484E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 69, 84, 266, 270, 277, 278, 279, 307, 311, 327, 342, 524, 528, 535, 536, 537, 565, 569, 585, 600, 782, 786, 793), values -> List(16.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.9733333333333336, 1372.0, 867.0, 16093.0, -59.4, -140.46666666666667, 10170.866666666667, 545.0, 0.04521768069470927, 936.0, 345.0, 40.0, 5.0, 56.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 7.444984637038171E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 7.444984637038171E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 58, 70, 71, 265, 270, 277, 278, 279, 307, 311, 316, 328, 329, 523, 528, 535, 536, 537, 565, 569, 574, 586, 587, 781, 786, 793), values -> List(31.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 4.911111111111111, 22000.0, 762.0, 16093.0, -43.22222222222222, -140.0, 10194.777777777777, 715.0, 0.05425050648176137, 1086.0, 357.0, 45.0, 6.0, 53.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.07263593691322656E17, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.07263593691322656E17, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 64, 70, 90, 265, 267, 277, 278, 279, 307, 311, 322, 328, 348, 523, 525, 535, 536, 537, 565, 569, 580, 586, 606, 781, 783, 793), values -> List(16.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 3.2846153846153845, 1280.0, 689.0, 12875.0, 28.615384615384617, -23.076923076923077, 10272.076923076924, 658.0, 0.05425050648176137, 1086.0, 585.0, 46.0, 6.0, 2.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


## Standardization 0mean and 1 std. Deviation

In [0]:
from pyspark.ml.feature import StandardScaler
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
std_train_df_interaction=standardscaler.fit(train_df_interaction).transform(train_df_interaction)
std_train_df_interaction.select("features","Scaled_features").display(truncate=False)

features,Scaled_features
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 59, 70, 110, 266, 267, 277, 278, 279, 307, 311, 317, 328, 368, 524, 525, 535, 536, 537, 565, 569, 575, 586, 626, 782, 783, 793), values -> List(9.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 2.909090909090909, 22000.0, 534.0, 16093.0, -27.0, -119.81818181818181, 10240.454545454546, 605.0, 0.05425050648176137, 1086.0, 195.0, 40.0, 5.0, 25.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.412107526464E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.412107526464E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 59, 70, 110, 266, 267, 277, 278, 279, 307, 311, 317, 328, 368, 524, 525, 535, 536, 537, 565, 569, 575, 586, 626, 782, 783, 793), values -> List(0.9539607807419603, 2.259075462492894, 0.3128257985367199, 0.06927045044271123, -0.22736663478943794, -0.8258656487846749, 152.82210927162782, 1.8932184911846837, 2.864564299737446, 1.1309174297011437, 2.602939022366929, -0.27850024197014905, -1.208223643361644, 144.06343583665114, 1.3032316529815937, 12.1202686786039, 3.8235856912207433, 0.49141171951423257, 2.148369790254341, 1.1108788022032428, 1.40397113906661, 1.5159905573843333, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 2.0180507531833034, 10.915680378973676, 13.421329154552046, 2.137130912691617, 2.0305098409062485, 0.41610645692273324, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 2.0180507531833034, 10.915680378973676, 13.421329154552046, 2.137130912691617, 2.0305098409062485, 0.41610645692273324, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 2.0180507531833034, 10.915680378973676, 13.421329154552046, 2.137130912691617, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 65, 86, 266, 274, 277, 278, 279, 307, 311, 323, 344, 524, 532, 535, 536, 537, 565, 569, 581, 602, 782, 790, 793), values -> List(12.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 2.46, 610.0, 1440.0, 1609.0, -18.0, -46.4, 10144.8, 511.0, 0.04521768069470927, 936.0, 506.0, 5.0, 48.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.045761312630118E13, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.045761312630118E13, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 65, 86, 266, 274, 277, 278, 279, 307, 311, 323, 344, 524, 532, 535, 536, 537, 565, 569, 581, 602, 782, 790, 793), values -> List(1.2719477076559471, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 1.600952886608048, 0.07942655558362918, 3.0496649789693766, 0.2602453791703466, -0.18566682798009937, -0.46788873108633017, 142.71776095372402, 1.1007460738406518, 10.102217925420614, 3.2954661206101434, 1.2751504106369316, 1.1108788022032428, 2.6956245870078916, 1.5159905573843333, 5.70293617365383, 2.749137850565331, 6.903269987599399, 8.235983385733114, 13.421329154552046, 7.650333490832459, 2.0305098409062485, 2.494805585421517E-4, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 6.903269987599399, 8.235983385733114, 13.421329154552046, 7.650333490832459, 2.0305098409062485, 2.494805585421517E-4, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 6.903269987599399, 8.235983385733114, 13.421329154552046, 7.650333490832459, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 67, 70, 240, 265, 267, 277, 278, 279, 307, 311, 325, 328, 498, 523, 525, 535, 536, 537, 565, 569, 583, 586, 756, 781, 783, 793), values -> List(11.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 3.56875, 22000.0, 701.0, 16093.0, -46.125, -116.0625, 10214.125, 630.0, 0.05425050648176137, 1086.0, 13.0, 6.0, 50.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.908842406682604E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 6.908842406682604E16, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 49, 53, 67, 70, 240, 265, 267, 277, 278, 279, 307, 311, 325, 328, 498, 523, 525, 535, 536, 537, 565, 569, 583, 586, 756, 781, 783, 793), values -> List(1.1659520653512847, 2.259075462492894, 0.3128257985367199, 0.06927045044271123, -0.22736663478943794, -0.8258656487846749, 152.82210927162782, 2.322520574830273, 2.864564299737446, 1.484593854345509, 2.602939022366929, -0.47577124669900467, -1.1703520657695516, 143.69302993666278, 1.357084200625461, 12.1202686786039, 3.8235856912207433, 0.03276078130094884, 1.3330545626438912, 2.80794227813322, 1.5159905573843333, 5.70293617365383, 2.749137850565331, 10.412592732968454, 2.0180507531833034, 43.982145021518846, 7.068138215659998, 2.137130912691617, 2.0305098409062485, 0.8425332182486714, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 10.412592732968454, 2.0180507531833034, 43.982145021518846, 7.068138215659998, 2.137130912691617, 2.0305098409062485, 0.8425332182486714, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 10.412592732968454, 2.0180507531833034, 43.982145021518846, 7.068138215659998, 2.137130912691617, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 90, 266, 270, 277, 278, 279, 307, 311, 319, 348, 524, 528, 535, 536, 537, 565, 569, 577, 606, 782, 786, 793), values -> List(23.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.2846153846153845, 1280.0, 925.0, 12875.0, 28.615384615384617, -23.076923076923077, 10272.076923076924, 537.0, 0.04521768069470927, 936.0, 585.0, 10.0, 5.0, 5.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 90, 266, 270, 277, 278, 279, 307, 311, 319, 348, 524, 528, 535, 536, 537, 565, 569, 577, 606, 782, 786, 793), values -> List(2.437899773007232, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 2.137607512763091, 0.1666655592574514, 1.9589861844074121, 2.0824482640262354, 0.29516264960938876, -0.23270328137582733, 144.5083016723797, 1.1567527233902741, 10.102217925420614, 3.2954661206101434, 1.4742351585426976, 0.5370924475635852, 1.1108788022032428, 0.28079422781332203, 1.7325606370106668, 5.70293617365383, 2.749137850565331, 3.464784322156284, 8.788476274787289, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, -0.004477743677270369, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 3.464784322156284, 8.788476274787289, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, -0.004477743677270369, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 3.464784322156284, 8.788476274787289, 13.421329154552046, 3.1090430601482804, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 66, 77, 266, 272, 277, 278, 279, 307, 311, 324, 335, 524, 530, 535, 536, 537, 565, 569, 582, 593, 782, 788, 793), values -> List(12.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 2.4833333333333334, 427.0, 1182.0, 805.0, 210.25, 200.08333333333334, 10210.166666666666, 538.0, 0.04521768069470927, 936.0, 333.0, 30.0, 5.0, 38.0, 9.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.666388848362216E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.666388848362216E14, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 66, 77, 266, 272, 277, 278, 279, 307, 311, 324, 335, 524, 530, 535, 536, 537, 565, 569, 582, 593, 782, 788, 793), values -> List(1.2719477076559471, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 1.616138076589425, 0.05559858890854043, 2.5032666702373634, 0.13020356136241706, 2.1686916990453273, 2.0176020893288054, 143.63734382452236, 1.1589068252960288, 10.102217925420614, 3.2954661206101434, 0.8391800133243048, 1.6112773426907556, 1.1108788022032428, 2.1340361313812473, 1.9491307166370002, 5.70293617365383, 2.749137850565331, 7.244300183924154, 7.389669583904344, 13.421329154552046, 4.729305065494258, 2.0305098409062485, 0.004471160599601691, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 7.244300183924154, 7.389669583904344, 13.421329154552046, 4.729305065494258, 2.0305098409062485, 0.004471160599601691, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 7.244300183924154, 7.389669583904344, 13.421329154552046, 4.729305065494258, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 79, 266, 270, 277, 278, 279, 307, 311, 319, 337, 524, 528, 535, 536, 537, 565, 569, 577, 595, 782, 786, 793), values -> List(10.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.325, 22000.0, 888.0, 16093.0, -144.625, -203.5, 10234.875, 550.0, 0.04521768069470927, 936.0, 706.0, 33.0, 5.0, 20.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.5460239963548358E17, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.5460239963548358E17, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 61, 79, 266, 270, 277, 278, 279, 307, 311, 319, 337, 524, 528, 535, 536, 537, 565, 569, 577, 595, 782, 786, 793), values -> List(1.0599564230466225, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 2.163889572346244, 2.864564299737446, 1.8806267370311156, 2.602939022366929, -1.491781388701215, -2.052055102932504, 143.9849423981987, 1.1847560481650852, 10.102217925420614, 3.2954661206101434, 1.779162430651529, 1.7724050769598312, 1.1108788022032428, 1.1231769112532881, 1.5159905573843333, 5.70293617365383, 2.749137850565331, 3.464784322156284, 7.678881452887389, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, 4.324375682308292, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 3.464784322156284, 7.678881452887389, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, 4.324375682308292, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 3.464784322156284, 7.678881452887389, 13.421329154552046, 3.1090430601482804, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 49, 53, 59, 81, 266, 269, 277, 278, 279, 307, 311, 317, 339, 524, 527, 535, 536, 537, 565, 569, 575, 597, 782, 785, 793), values -> List(18.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 6.270588235294118, 914.0, 235.0, 16093.0, -71.82352941176471, -139.41176470588235, 10199.35294117647, 608.0, 0.04521768069470927, 936.0, 340.0, 35.0, 5.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 9.419552651970484E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 9.419552651970484E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 49, 53, 59, 81, 266, 269, 277, 278, 279, 307, 311, 317, 339, 524, 527, 535, 536, 537, 565, 569, 575, 597, 782, 785, 793), values -> List(1.9079215614839207, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 4.080860299196985, 0.11900962590727389, 0.49768838198458576, 2.602939022366929, -0.7408470489009849, -1.4058015880763215, 143.4852155726565, 1.3096939586988579, 10.102217925420614, 3.2954661206101434, 0.8568204340248157, 1.8798235664725482, 1.1108788022032428, 1.7325606370106668, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 7.762816758908242, 13.421329154552046, 2.762256517667515, 2.0305098409062485, 0.11487142915072279, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 7.762816758908242, 13.421329154552046, 2.762256517667515, 2.0305098409062485, 0.11487142915072279, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 2.7044593874718936, 7.762816758908242, 13.421329154552046, 2.762256517667515, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 69, 84, 266, 270, 277, 278, 279, 307, 311, 327, 342, 524, 528, 535, 536, 537, 565, 569, 585, 600, 782, 786, 793), values -> List(16.0, 4.92962962962963, 152.0, 3219.0, -6.2592592592592595, -10.977777777777778, 10208.777777777777, 3.9733333333333336, 1372.0, 867.0, 16093.0, -59.4, -140.46666666666667, 10170.866666666667, 545.0, 0.04521768069470927, 936.0, 345.0, 40.0, 5.0, 56.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 7.444984637038171E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 7.444984637038171E15, 1.6919595725316575E12, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 69, 84, 266, 270, 277, 278, 279, 307, 311, 327, 342, 524, 528, 535, 536, 537, 565, 569, 585, 600, 782, 786, 793), values -> List(1.695930276874596, 3.651280437860403, 0.03626965780135882, 0.5546805472017101, -0.8496398292850196, -1.392118167625095, 151.8445620794512, 2.5858209225430806, 0.17864464632908073, 1.8361524560878122, 2.602939022366929, -0.6127005323343279, -1.416439017814508, 143.08446865640855, 1.1739855386363116, 10.102217925420614, 3.2954661206101434, 0.8694207345251806, 2.148369790254341, 1.1108788022032428, 3.1448953515092066, 1.7325606370106668, 5.70293617365383, 2.749137850565331, 20.04195911942077, 7.987663876263702, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, 0.09079157544523589, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 20.04195911942077, 7.987663876263702, 13.421329154552046, 3.1090430601482804, 2.0305098409062485, 0.09079157544523589, 0.006348331360758064, 5.70293617365383, 2.749137850565331, 20.04195911942077, 7.987663876263702, 13.421329154552046, 3.1090430601482804, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 58, 70, 71, 265, 270, 277, 278, 279, 307, 311, 316, 328, 329, 523, 528, 535, 536, 537, 565, 569, 574, 586, 587, 781, 786, 793), values -> List(31.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 4.911111111111111, 22000.0, 762.0, 16093.0, -43.22222222222222, -140.0, 10194.777777777777, 715.0, 0.05425050648176137, 1086.0, 357.0, 45.0, 6.0, 53.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.07263593691322656E17, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.07263593691322656E17, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 58, 70, 71, 265, 270, 277, 278, 279, 307, 311, 316, 328, 329, 523, 528, 535, 536, 537, 565, 569, 574, 586, 587, 781, 786, 793), values -> List(3.2858649114445297, 2.259075462492894, 0.3128257985367199, 0.06927045044271123, -0.22736663478943794, -0.8258656487846749, 152.82210927162782, 3.1961209389374763, 2.864564299737446, 1.6137810513712951, 2.602939022366929, -0.44582960545838674, -1.4117332403466858, 143.42085185170993, 1.5401828626146108, 12.1202686786039, 3.8235856912207433, 0.8996614557260565, 2.4169160140361337, 1.3330545626438912, 2.9764188148212134, 1.7325606370106668, 5.70293617365383, 2.749137850565331, 2.0646963630890025, 2.0180507531833034, 6.025328158839066, 7.068138215659998, 3.1090430601482804, 2.0305098409062485, 1.3080793492445884, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 2.0646963630890025, 2.0180507531833034, 6.025328158839066, 7.068138215659998, 3.1090430601482804, 2.0305098409062485, 1.3080793492445884, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 2.0646963630890025, 2.0180507531833034, 6.025328158839066, 7.068138215659998, 3.1090430601482804, 2.0305098409062485))"
"Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 64, 70, 90, 265, 267, 277, 278, 279, 307, 311, 322, 328, 348, 523, 525, 535, 536, 537, 565, 569, 580, 586, 606, 781, 783, 793), values -> List(16.0, 3.05, 1311.0, 402.0, -1.675, -6.5125, 10274.5, 3.2846153846153845, 1280.0, 689.0, 12875.0, 28.615384615384617, -23.076923076923077, 10272.076923076924, 658.0, 0.05425050648176137, 1086.0, 585.0, 46.0, 6.0, 2.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -3.671787026757887E14, 1.801572849805664E11, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 794, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 49, 53, 64, 70, 90, 265, 267, 277, 278, 279, 307, 311, 322, 328, 348, 523, 525, 535, 536, 537, 565, 569, 580, 586, 606, 781, 783, 793), values -> List(1.695930276874596, 2.259075462492894, 0.3128257985367199, 0.06927045044271123, -0.22736663478943794, -0.8258656487846749, 152.82210927162782, 2.137607512763091, 0.1666655592574514, 1.4591799795207643, 2.0824482640262354, 0.29516264960938876, -0.23270328137582733, 144.5083016723797, 1.4173990539865928, 12.1202686786039, 3.8235856912207433, 1.4742351585426976, 2.470625258792492, 1.3330545626438912, 0.11231769112532881, 1.7325606370106668, 5.70293617365383, 2.749137850565331, 4.070124964504702, 2.0180507531833034, 8.788476274787289, 7.068138215659998, 2.137130912691617, 2.0305098409062485, -0.004477743677270369, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 4.070124964504702, 2.0180507531833034, 8.788476274787289, 7.068138215659998, 2.137130912691617, 2.0305098409062485, -0.004477743677270369, 6.759606793676851E-4, 5.70293617365383, 2.749137850565331, 4.070124964504702, 2.0180507531833034, 8.788476274787289, 7.068138215659998, 2.137130912691617, 2.0305098409062485))"


In [0]:
#Dropping the 'features' column
val_df_interaction = val_df_interaction.drop('features')


In [0]:
# creating a new feature column with the new interaction term for VAl
val_df_interaction =assembler.transform(val_df_interaction)

In [0]:
#Standardizing the validation set
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
std_val_df_interaction=standardscaler.fit(val_df_interaction).transform(val_df_interaction)
#std_val_df_interaction.select("features","Scaled_features").display(truncate=False)

### Modeling

In [0]:
from pyspark.ml.classification import LogisticRegression
# lr = LogisticRegression().setWeightCol("classWeights").setLabelCol("Outcome").setFeaturesCol("Aspect")
#lr = LogisticRegression(labelCol="DEP_DEL15", featuresCol="features",weightCol="classWeights",maxIter=10)
lr = LogisticRegression(labelCol="label", featuresCol="Scaled_features",weightCol="classWeights",maxIter=10)
model=lr.fit(std_train_df_interaction)
predict_train=model.transform(std_train_df_interaction)

predict_test=model.transform(std_val_df_interaction)
predict_test.select("label","prediction").show(10)
from sklearn.metrics import confusion_matrix


trainScoreAndLabels = predict_train.select(['probability','label', f.col("prediction").alias("raw")])
valScoreAndLabels = predict_test.select(['probability','label', f.col("prediction").alias("raw")])


# To print confusion metrics
trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
valScoreAndLabels_pd = valScoreAndLabels.toPandas()
    
y_train_true = trainScoreAndLabels_pd["label"]
y_train_pred = trainScoreAndLabels_pd["raw"]
conf_mat_train = confusion_matrix(y_train_true, y_train_pred)
    
    
    
y_val_true = valScoreAndLabels_pd["label"]
y_val_pred = valScoreAndLabels_pd["raw"]
conf_mat_val = confusion_matrix(y_val_true, y_val_pred)
    
print("Accuracy Score: ", accuracy_score(y_val_true, y_val_pred))
print("F1 Score: ", f1_score(y_val_true, y_val_pred))
print(classification_report(y_val_true, y_val_pred))
