In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("log_reg_codealong")
    .getOrCreate()
)

In [0]:
# Load the Titanic CSV from DBFS. `header=True` takes column names from the
# first row; `inferSchema=True` asks Spark to infer the column types.
df = (
    spark.read
    .format("csv")
    .load(
        "dbfs:/FileStore/shared_uploads/necmettinceylan@hotmail.com/titanic.csv",
        header=True,
        inferSchema=True,
    )
)

# Print a preview of the loaded rows.
df.show()

In [0]:
# Print the DataFrame's schema to check the column names and types.
df.printSchema()

In [0]:
# List the available column names (shown as the cell's output).
df.columns

In [0]:
 # Narrow the DataFrame to the eight columns used from here on.
 my_cols = df.select(
     'Survived',
     'Pclass',
     'Sex',
     'Age',
     'SibSp',
     'Parch',
     'Fare',
     'Embarked',
 )

In [0]:
# Discard every row that has a missing value in any of the selected columns.
my_final_data = my_cols.dropna()

In [0]:
from pyspark.ml.feature import( VectorAssembler,VectorIndexer,
                              OneHotEncoder,StringIndexer)

In [0]:
# Sex: string column -> numeric index -> encoded vector.
gender_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndex",
)
gender_encoder = OneHotEncoder(
    inputCol="SexIndex",
    outputCol="SexVec",
)

# Embarked: same two-step treatment as Sex.
embark_indexer = StringIndexer(
    inputCol="Embarked",
    outputCol="EmbarkIndex",
)
embark_encoder = OneHotEncoder(
    inputCol="EmbarkIndex",
    outputCol="EmbarkVec",
)

In [0]:
# Combine the numeric columns and the two encoded vectors into a single
# "features" column; the order of inputCols is kept as listed.
assembler = VectorAssembler(
    inputCols=[
        "Pclass",
        "SexVec",
        "EmbarkVec",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
    ],
    outputCol="features",
)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
# Logistic regression that reads the "features" column and uses "Survived"
# as the label.
log_reg_titanic = LogisticRegression(
    featuresCol="features",
    labelCol="Survived",
)

# Stages run in list order: index the string columns, encode the indices,
# assemble the feature vector, then apply the classifier.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [0]:
# Split the cleaned rows roughly 70/30 into training and held-out sets.
# The explicit seed pins the random split so that a Restart & Run All
# reproduces the same partitions (and therefore the same fitted model and
# evaluation score) instead of drawing a fresh split on every run.
train_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
  # Fit the pipeline on the training split only.
  fit_model = pipeline.fit(train_data)

In [0]:
# Apply the fitted pipeline to the held-out split and keep the transformed DataFrame.
results = fit_model.transform(test_data)


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Score with the classifier's continuous "rawPrediction" output rather than
# the thresholded "prediction" labels. Feeding hard 0/1 labels to the
# evaluator collapses the ROC curve to a single operating point, so the
# reported AUC would not reflect how well the model ranks passengers.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Survived",
    metricName="areaUnderROC",  # the evaluator's default metric, made explicit
)

# Area under the ROC curve on the held-out predictions (shown as the cell's output).
my_eval.evaluate(results)