In [6]:
import findspark
findspark.init()

In [7]:
import pyspark
from pyspark.sql import SparkSession

In [8]:
sp=SparkSession.builder.appName('toddlerapp').getOrCreate()

In [9]:
dtitanic=sp.read.csv('d:\\titanic.csv',header=True,inferSchema=True)
dtitanic.toPandas()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings_Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [10]:
dtitanic.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Siblings_Spouses Aboard: integer (nullable = true)
 |-- Parents/Children Aboard: integer (nullable = true)
 |-- Fare: double (nullable = true)



In [11]:
dtitanic.groupBy("Survived").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  545|
+--------+-----+



In [12]:
input_columns=dtitanic.columns
input_columns=input_columns[1:4:2]
dependent_var='Survived'
print(input_columns)
print(dependent_var)

['Pclass', 'Sex']
Survived


In [13]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
renamed=dtitanic.withColumn("label_str",dtitanic[dependent_var].cast('string'))
indexer=StringIndexer(inputCol='label_str',outputCol="label")
indexed=indexer.fit(renamed).transform(renamed)

In [14]:
numeric_inputs=[]
string_inputs=[]
for column in input_columns:
    if str(indexed.schema[column].dataType)=='StringType':
        indexer=StringIndexer(inputCol=column,outputCol=column+"_num")
        indexed=indexer.fit(indexed).transform(indexed)
        new_col_name=column+"_num"
        string_inputs.append(new_col_name)
    else:
        numeric_inputs.append(column)
print('numeric input',numeric_inputs)
print('String_inputs',string_inputs)

numeric input ['Pclass']
String_inputs ['Sex_num']


In [15]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.classification import NaiveBayes

In [16]:
minimums = dtitanic.select([min(c).alias(c) for c in dtitanic.columns if c in numeric_inputs])  
min_array= minimums.select(array(numeric_inputs).alias("mins")) 

dtitanic_minimum = min_array.select (array_min(min_array.mins)).collect() 
dtitanic_minimum= dtitanic_minimum[0][0]
if dtitanic_minimum < 0:
        print("WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contale")
else:
        print('no negative')

no negative


In [17]:
d={}
for col in numeric_inputs:
    d[col]=indexed.approxQuantile(col,[0.01,0.99],0.25)
for col in numeric_inputs:
    skew=indexed.agg(skewness(indexed[col])).collect()
    skew=skew[0][0]
    if skew > 1: # If right skew, floor, cap and log(x+1)
        indexed = indexed.withColumn(col, \
        log(when(dtitanic[col] < d[col][0],d[col][0])\
        .when(indexed[col] > d[col][1], d[col][1])\
        .otherwise(indexed[col] ) +1).alias(col))
        print(col+" has been treated for positive (right) skewness. (skew =)",skew,")")
    elif skew < -1: # If left skew floor, cap and exp(x)
        indexed = indexed.withColumn(col, \
        exp(when(dtitanic[col] < d[col][0],d[col][0])\
        .when(indexed[col] > d[col][1], d[col][1])\
        .otherwise(indexed[col] )).alias(col))
        print(col+" has been treated for negative (left) skewness. (skew =",skew,")")        
print(skew)  

-0.6223541098616062


In [18]:
features_list = numeric_inputs + string_inputs
assembler = VectorAssembler(inputCols=features_list,outputCol='features')
output = assembler.transform(indexed).select('features','label')
output.show(5,False)

+---------+-----+
|features |label|
+---------+-----+
|[3.0,0.0]|0.0  |
|[1.0,1.0]|1.0  |
|[3.0,1.0]|1.0  |
|[1.0,1.0]|1.0  |
|[3.0,0.0]|0.0  |
+---------+-----+
only showing top 5 rows



In [19]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures",min=0,max=1000)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scalerModel = scaler.fit(output)
scaled_data = scalerModel.transform(output)
final_data = scaled_data.select('label','scaledFeatures')
final_data = final_data.withColumnRenamed("scaledFeatures","features")
final_data.show()

Features scaled to range: [0.000000, 1000.000000]
+-----+---------------+
|label|       features|
+-----+---------------+
|  0.0|   [1000.0,0.0]|
|  1.0|   [0.0,1000.0]|
|  1.0|[1000.0,1000.0]|
|  1.0|   [0.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|      (2,[],[])|
|  0.0|   [1000.0,0.0]|
|  1.0|[1000.0,1000.0]|
|  1.0| [500.0,1000.0]|
|  1.0|[1000.0,1000.0]|
|  1.0|   [0.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|[1000.0,1000.0]|
|  1.0| [500.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  1.0|    [500.0,0.0]|
|  0.0|[1000.0,1000.0]|
|  1.0|[1000.0,1000.0]|
+-----+---------------+
only showing top 20 rows



In [20]:
train,test = final_data.randomSplit([0.70,0.30])

In [21]:
nbclassifier=NaiveBayes()

In [22]:
nbcModel = nbclassifier.fit(train)

In [23]:
predictions = nbcModel.transform(test)

In [24]:
predictions.printSchema()
predictions.select('label','rawPrediction','probability','prediction').show()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|       0.0|
|  0.0|[-0.4836890823309...|[0.61650485436893...|    

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator();
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Model :" , accuracy)
print("Test Error of Model :" , 1-accuracy)

Accuracy of Model : 0.48693539165237276
Test Error of Model : 0.5130646083476272
