In [1]:
import findspark
findspark.init()

In [2]:
# SparkSession 객체 생성
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [6]:
titanic = spark.read\
           .option("inferSchema", "true")\
           .option("header", "true")\
           .csv("c:/SparkDG/data/titanic.csv")

In [7]:
titanic.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [8]:
titanic.toPandas().head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [9]:
titanic.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [11]:
titanic.createOrReplaceTempView("tView")

In [12]:
titanicSQL = spark.sql("""
select Survived, Pclass, Sex, Age, Fare, Embarked
from tView
""")

In [13]:
# categorical column들을 수치로 변경
from pyspark.ml.feature import StringIndexer
titanicSQL = StringIndexer(
    inputCol='Sex', 
    outputCol='Gender', 
    handleInvalid='keep').fit(titanicSQL).transform(titanicSQL)

titanicSQL = StringIndexer(
    inputCol='Embarked', 
    outputCol='Boarded', 
    handleInvalid='keep').fit(titanicSQL).transform(titanicSQL)
titanicSQL.show()

+--------+------+------+----+-------+--------+------+-------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|Gender|Boarded|
+--------+------+------+----+-------+--------+------+-------+
|       0|     3|  male|22.0|   7.25|       S|   0.0|    0.0|
|       1|     1|female|38.0|71.2833|       C|   1.0|    1.0|
|       1|     3|female|26.0|  7.925|       S|   1.0|    0.0|
|       1|     1|female|35.0|   53.1|       S|   1.0|    0.0|
|       0|     3|  male|35.0|   8.05|       S|   0.0|    0.0|
|       0|     3|  male|null| 8.4583|       Q|   0.0|    2.0|
|       0|     1|  male|54.0|51.8625|       S|   0.0|    0.0|
|       0|     3|  male| 2.0| 21.075|       S|   0.0|    0.0|
|       1|     3|female|27.0|11.1333|       S|   1.0|    0.0|
|       1|     2|female|14.0|30.0708|       C|   1.0|    1.0|
|       1|     3|female| 4.0|   16.7|       S|   1.0|    0.0|
|       1|     1|female|58.0|  26.55|       S|   1.0|    0.0|
|       0|     3|  male|20.0|   8.05|       S|   0.0|    0.0|
|       

In [14]:
# 불필요한 column 제거
titanic = titanicSQL.drop('Sex').drop('Embarked')

titanic.show()

+--------+------+----+-------+------+-------+
|Survived|Pclass| Age|   Fare|Gender|Boarded|
+--------+------+----+-------+------+-------+
|       0|     3|22.0|   7.25|   0.0|    0.0|
|       1|     1|38.0|71.2833|   1.0|    1.0|
|       1|     3|26.0|  7.925|   1.0|    0.0|
|       1|     1|35.0|   53.1|   1.0|    0.0|
|       0|     3|35.0|   8.05|   0.0|    0.0|
|       0|     3|null| 8.4583|   0.0|    2.0|
|       0|     1|54.0|51.8625|   0.0|    0.0|
|       0|     3| 2.0| 21.075|   0.0|    0.0|
|       1|     3|27.0|11.1333|   1.0|    0.0|
|       1|     2|14.0|30.0708|   1.0|    1.0|
|       1|     3| 4.0|   16.7|   1.0|    0.0|
|       1|     1|58.0|  26.55|   1.0|    0.0|
|       0|     3|20.0|   8.05|   0.0|    0.0|
|       0|     3|39.0| 31.275|   0.0|    0.0|
|       0|     3|14.0| 7.8542|   1.0|    0.0|
|       1|     2|55.0|   16.0|   1.0|    0.0|
|       0|     3| 2.0| 29.125|   0.0|    2.0|
|       1|     2|null|   13.0|   0.0|    0.0|
|       0|     3|31.0|   18.0|   1

In [None]:
# Assemble all the features with VectorAssembler

required_features = ['Pclass',
                    'Age',
                    'Fare',
                    'Gender',
                    'Boarded'
                   ]

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=required_features, outputCol='features')

transformed_data = assembler.transform(dataset)


transformed_data.show()

In [None]:
# Split the data
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [None]:
# Define the model
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Survived', 
                            featuresCol='features',
                            maxDepth=5)

In [None]:
# Fit the model
model = rf.fit(training_data)

In [None]:
# Predict with the test dataset
predictions = model.transform(test_data)

In [None]:

# Evaluate our model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived', 
    predictionCol='prediction', 
    metricName='accuracy')

In [None]:
# Accuracy
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)