# Paper Helicopter Data

Original data can be found here: <https://www.kaggle.com/yonggijj/paper-helicopter-dataset>

A description of the experiment can be found here: <https://www.paperhelicopterexperiment.com/>

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

## Input / Select the helicopter data file 

In [None]:
# Relative path of the paper-helicopter CSV; translate_to_file_string converts it
# into the string that is later handed to the Spark CSV reader.
csv_relative_path = "../../data/paper_helicopter_data.csv"
input_file = translate_to_file_string(csv_relative_path)

## Create the Spark Session and DataFrame

In [None]:
# Create (or reuse) the SparkSession for this notebook
spark = (SparkSession
       .builder
       .appName("Helicopter Data")
       .getOrCreate())
# Create a DataFrame from the comma-separated file, taking column names from the
# header row and letting Spark infer the column types
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .csv(input_file))
# Remove the "_c10" column (presumably an unnamed artifact column of the CSV --
# TODO confirm against the data file), then derive the two label columns:
#   FlightTimeSum = "Trial 1" + "Trial 2" + "Trial 3"
#   Success       = (FlightTimeSum >= 6.0) cast to int
df = (df.drop("_c10")
      .withColumn("FlightTimeSum", col("Trial 1") + col("Trial 2") + col("Trial 3"))
      .withColumn("Success", (col("FlightTimeSum") >= 6.0).cast("int")))
# Print the schema. printSchema() writes to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit an extra "None" line.
df.printSchema()

In [None]:
# Summary statistics of the data; called without arguments, summary() reports
# count, mean, stddev, min, the 25%/50%/75% percentiles and max per column
summary_df = df.summary()
print_df(summary_df)

 ### Feature vector

 labels: 

 - Success = Categorical
 - FlightTimeSum = continuous value

In [None]:
# Columns that are combined into the single "features" vector column
feature_cols = [
    "Wing Length",
    "Body Length",
    "Body Width",
    "Paper Clips",
    "Folded Wing",
    "Taped Wing",
]
# The assembler receives its own copy of the column list
assembler = VectorAssembler(inputCols=list(feature_cols), outputCol="features")

In [None]:
# Run the assembler over the DataFrame; the result carries the additional
# "features" vector column configured on the assembler
labeled_point_df = assembler.transform(df)
# Display the result (the second argument is presumably the number of rows
# to show -- verify against helpers.data_prep_and_print.print_df)
preview_rows = 10
print_df(labeled_point_df, preview_rows)

In [None]:

# Randomly split the rows into roughly 90% training and 10% test data;
# the fixed seed keeps the split reproducible between runs
split_weights = [0.9, 0.1]
split_seed = 12345
splits = labeled_point_df.randomSplit(split_weights, split_seed)
training, test = splits[0], splits[1]

In [None]:
# Stop the SparkSession now that the notebook is finished with it
spark.stop()