In [4]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#from user_definition import *

In [5]:

# Reuse the already-running SparkSession if there is one, otherwise create it.
ss = SparkSession.builder.getOrCreate()
# SparkContext handle; the CSV files are loaded through it as RDDs further down.
sc = ss.sparkContext

# Alternative: create the session with an explicit application name
# (app_name would have to be defined first, e.g. via the commented-out user_definition import).
# ss = SparkSession.builder.appName(app_name).getOrCreate()
# sc = ss.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 12:40:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
22/03/08 12:40:35 WARN Utils: Service 'SparkUI' could

In [6]:
def FloatSafe(value):
    """Convert ``value`` to ``float``, returning ``None`` when it cannot be converted.

    Used to parse raw CSV fields, where a cell may be empty or non-numeric.

    Args:
        value: Anything accepted by ``float()`` (typically a string field).

    Returns:
        The converted float, or ``None`` if conversion fails.
    """
    try:
        return float(value)
    # Only conversion failures are turned into None; a bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit and hide real problems.
    except (TypeError, ValueError, OverflowError):
        return None
    
def IntSafe(value):
    """Convert ``value`` to ``int``, returning ``None`` when it cannot be converted.

    Used to parse raw CSV fields, where a cell may be empty or non-integer
    (note that a string such as ``"3.0"`` is not accepted by ``int()``).

    Args:
        value: Anything accepted by ``int()`` (typically a string field).

    Returns:
        The converted int, or ``None`` if conversion fails.
    """
    try:
        return int(value)
    # Only conversion failures are turned into None; a bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit and hide real problems.
    except (TypeError, ValueError, OverflowError):
        return None

In [8]:
def _parse_row(line):
    """Split one CSV line and convert its first four fields to (float, float, float, int)."""
    fields = line.split(",")
    return (FloatSafe(fields[0]), FloatSafe(fields[1]), FloatSafe(fields[2]), IntSafe(fields[3]))


def _load_rows(path):
    """Read ``path`` as text in 4 partitions and parse every line with ``_parse_row``."""
    return sc.textFile(path, 4).map(_parse_row)


# Train and test files share the same layout, so both go through the same loader.
train_rdd = _load_rows('train.csv')

test_rdd = _load_rows('test.csv')

In [11]:
# Three nullable double feature columns followed by a nullable integer label column.
feature_fields = [
    StructField(field_name, DoubleType(), True)
    for field_name in ("perimeter", "length", "width")
]
schema = StructType(feature_fields + [StructField("label", IntegerType(), True)])

In [12]:
# Apply the same explicit schema to both parsed RDDs.
df_train, df_test = (
    ss.createDataFrame(rows, schema) for rows in (train_rdd, test_rdd)
)

In [13]:
# Sanity check: confirm the column names and types match the schema defined above.
df_train.printSchema()

root
 |-- perimeter: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- label: integer (nullable = true)



# Vector Assembler

In [15]:
# Every column except the last one (the label) is a feature.
feature_cols = df_train.columns[:-1]
va = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Assemble the feature vector for both splits and keep only what the models need.
df_train_va, df_test_va = (
    va.transform(frame).select("features", "label")
    for frame in (df_train, df_test)
)

# Standard Scaler

In [16]:
# Fit the scaler on the training split only, then reuse it for both splits.
scaler = StandardScaler(inputCol="features", outputCol="features_out",
                        withMean=True, withStd=True)
std = scaler.fit(df_train_va)


def _standardize(frame):
    """Scale ``frame`` with the fitted model and expose the result as ``features``."""
    scaled = std.transform(frame).select('features_out', 'label')
    return scaled.withColumnRenamed('features_out', 'features')


df_test_std = _standardize(df_test_va)
df_train_std = _standardize(df_train_va)

                                                                                

# String Indexer

In [None]:
# 
# Index the label column into a numeric column, fitting on the training split only.
# StringIndexer is imported here because the notebook's top import cell does not
# provide it (its only other import appears in a much later cell).
from pyspark.ml.feature import StringIndexer

c = "label"
si = StringIndexer(inputCol=c, outputCol=c+"-num")
# Fit on the assembled training frame (the original referenced an undefined `df_va`).
sm = si.fit(df_train_va)

# Replace the original label column with its indexed version, keeping the name.
newdf = sm.transform(df_train_va).drop(c)
newdf = newdf.withColumnRenamed(c+"-num", c)

# Apply the same fitted indexer to the assembled test frame
# (the original referenced an undefined `df_va_test`).
newdf_test = sm.transform(df_test_va).drop(c)
newdf_test = newdf_test.withColumnRenamed(c+"-num", c)


# Building Random Forest Model

In [19]:
# Random forest with default hyperparameters; it is the estimator handed to the
# CrossValidator in the cross-validation cell, where maxDepth is tuned.
# NOTE(review): no seed is set here, so results may differ between runs — confirm if that is intended.
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier()

# Pipeline Building with Decision Tree

In [None]:
# Build a two-stage pipeline (vector assembler -> decision tree).
# Nothing is fitted in this cell; call pipeline.fit(...) on a training frame to train it.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()

from pyspark.ml import Pipeline
# `va` is the VectorAssembler defined in the Vector Assembler cell.
pipeline = Pipeline(stages=[va,dt])

# Cross Validation

In [20]:
# Evaluator is left at its default metric.
evaluator = MulticlassClassificationEvaluator()

# Candidate tree depths to try for the random forest.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 3, 4, 5])
    .build()
)

# 2-fold cross validation over the depth grid.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
)

cvmodel = cv.fit(df_train_std)

                                                                                

# Prediction using Best Model 

In [22]:
# Score the held-out test split with the best model found by cross validation.
dtpredicts = cvmodel.bestModel.transform(df_test_std)

# The metric name is not set, so the evaluator's default metric is reported.
metrics = MulticlassClassificationEvaluator(labelCol="label",
                                            predictionCol="prediction")

metrics.evaluate(dtpredicts)

0.8508974358974359

# Getting the Best Parameter

In [24]:
# maxDepth of the model the cross validation selected from the [2, 3, 4, 5] grid.
cvmodel.bestModel.getMaxDepth()

2

# Plain RandomForest Without Cross Validation

In [26]:
# Deliberately tiny forest (depth 1, 2 trees, 2 bins) with a fixed seed.
rf_params = dict(maxDepth=1, numTrees=2, maxBins=2, seed=6)
rf = RandomForestClassifier(**rf_params)
rfmodel = rf.fit(df_train_std)

In [29]:
metric_name = "f1"

# Predictions of the plain (non cross-validated) forest on the test split.
rfpredicts = rfmodel.transform(df_test_std)

metrics = MulticlassClassificationEvaluator(labelCol="label",
                                            predictionCol="prediction",
                                            metricName=metric_name)

metrics.evaluate(rfpredicts)

0.6825332562174667

# Split the data

In [30]:
# 80/20 split of the standardized training frame, seeded with 1; both parts are cached.
pendtsets = df_train_std.randomSplit([0.8, 0.2], 1)
df_std_train, df_std_valid = (part.cache() for part in pendtsets)

In [34]:
# Peek at the first 10 prediction rows produced by the plain random forest.
rfpredicts.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[-1.1636812938097...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-1.1019210378260...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-1.1482412298138...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-0.9089202378771...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-1.0478808138403...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-0.8008397899057...|    1|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-0.8162798539016...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-1.0092806538505...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-0.7004793739322...|    3|[0.0,0.5965346534...|[0.0,0.2982673267...|       3.0|
|[-0.90120020587

# MulticlassMetrics

In [44]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Build an RDD of (prediction, label) pairs, with the label converted to float.
prediction_label = (
    rfpredicts.select("prediction", "label")
    .rdd
    .map(lambda row: (row[0], float(row[1])))
)

metrics = MulticlassMetrics(prediction_label)

# Per-class statistics, all computed for class 3.0.
precision, recall, f1Score = (
    metrics.precision(3.0),
    metrics.recall(3.0),
    metrics.fMeasure(3.0),
)
confusionMetrics = metrics.confusionMatrix()

print("Summary Stats")
for template, stat in (
    ("Precision = %s", precision),
    ("Recall = %s", recall),
    ("F1 Score = %s", f1Score),
    ("Confusion Metrics = \n%s", confusionMetrics),
):
    print(template % stat)

Summary Stats
Precision = 0.625
Recall = 1.0
F1 Score = 0.7692307692307693
Confusion Metrics = 
DenseMatrix([[ 4.,  4.,  6.],
             [ 1., 15.,  0.],
             [ 0.,  0., 10.]])


# Building Kmeans Pipeline

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Cluster the standardized training features into 10 groups.
kmeans = KMeans(k = 10, maxIter= 200, tol=0.1)
model = kmeans.fit(df_train_std)

# Print each learned cluster centre.
center = model.clusterCenters()
for c in center:
    print(c)

# Assign clusters on the same frame the model was fitted on; the original
# referenced `penlpoints`, which is not defined anywhere in this notebook.
prediction = model.transform(df_train_std)

# Score the clustering with ClusteringEvaluator's default metric.
evaluator = ClusteringEvaluator()
sihoutte = evaluator.evaluate(prediction)
sihoutte

# String Indexer

In [None]:
# converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column in ``cols`` with its StringIndexer-encoded version.

    For every column a StringIndexer is fitted on the current frame, its output
    is written to a temporary ``<col>-num`` column, the original column is
    dropped, and the temporary column is renamed back to the original name.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to index.

    Returns:
        A new DataFrame in which the listed columns hold the indexed values.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # Fit on the frame as transformed so far, then swap the column in place.
        indexer_model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        indexed = (
            indexer_model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

# Index every string-valued column, including the "income" target.
# NOTE(review): `dfrawnona` is not defined anywhere in the visible notebook — it must be
# loaded before this cell can run.
dfnumeric = indexStringColumns(dfrawnona, ["workclass", "education",
                                           "marital_status", "occupation",
                                           "relationship", "race", "sex", 
                                           "native_country", "income"])

# One Hot Encoder

In [None]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    """Replace each column in ``cols`` with its one-hot encoded version.

    For every column a OneHotEncoder (``dropLast=False``) is fitted on the
    current frame, its output is written to a temporary ``<col>-onehot``
    column, the original column is dropped, and the temporary column is renamed
    back to the original name.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to encode.

    Returns:
        A new DataFrame in which the listed columns hold the encoded vectors.
    """
    encoded = df
    for name in cols:
        tmp_name = name + "-onehot"
        encoder_model = OneHotEncoder(inputCol=name, outputCol=tmp_name, dropLast=False).fit(encoded)
        encoded = (
            encoder_model.transform(encoded)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return encoded

# One-hot encode the indexed categorical columns; "sex" and "income" are indexed
# earlier but are not in this list, so they stay as plain numeric indices.
dfhot = oneHotEncodeColumns(dfnumeric, ["workclass", "education", 
                                        "marital_status", "occupation", 
                                        "relationship", "race", "native_country"])        

# Vector Assembler

In [None]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
# Columns merged into the feature vector, in this order.
input_cols = [
    "age", "capital_gain", "capital_loss", "fnlwgt", "hours_per_week",
    "sex", "workclass", "education", "marital_status", "occupation",
    "relationship", "native_country", "race",
]

# VectorAssembler combines the values of inputCols into a single vector column (outputCol).
va = VectorAssembler(inputCols=input_cols, outputCol="features")

# lpoints: labeled data -- the feature vector plus "income" exposed as "label".
lpoints = (
    va.transform(dfhot)
    .select("features", "income")
    .withColumnRenamed("income", "label")
)

# Logistic Regression

In [None]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
# NOTE(review): `adulttrain` is not defined in the visible notebook -- presumably a
# training split of the labeled adult data; confirm before running.
lr = LogisticRegression(
    regParam=0.01,
    maxIter=1000,
    fitIntercept=True,
)
lrmodel = lr.fit(adulttrain)
# Equivalent form using setParams:
#lr = LogisticRegression()
#lrmodel = lr.setParams(regParam=0.01, maxIter=1000, fitIntercept=True).fit(adulttrain)

# LR Parameters

In [None]:
# Inspect the fitted model: coefficients and the intercept term.
print(lrmodel.coefficients)
print(lrmodel.intercept)

# Score the validation frame and preview the resulting prediction columns.
# NOTE(review): `adultvalid` is not defined in the visible notebook -- confirm where it comes from.
validpredicts = lrmodel.transform(adultvalid)
validpredicts.show()


In [None]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluator left at its default metric; print "<metric name>:<value>".
bceval = BinaryClassificationEvaluator()
print("%s:%s" % (bceval.getMetricName(), bceval.evaluate(validpredicts)))

In [None]:
#Evaluate the model. metric : Area Under PR
# Switch the shared evaluator to area under the precision-recall curve
# (this setting persists on `bceval` for any later use).
bceval.setMetricName("areaUnderPR")
print("%s:%s" % (bceval.getMetricName(), bceval.evaluate(validpredicts)))

# Cross Validation on LogisticRegression

In [None]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
# ParamGridBuilder: every combination of the listed parameter values
# (one maxIter value x seven regParam values).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [1000])
    .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5])
    .build()
)

# 5-fold cross validation; `bceval` scores with whatever metric was last set on it.
cv = CrossValidator(
    estimator=lr,
    evaluator=bceval,
    numFolds=5,
    estimatorParamMaps=paramGrid,
)
cvmodel = cv.fit(adulttrain)

In [None]:
# Summarize the model chosen by cross validation: weights, intercept, tuned parameters.
best_lr = cvmodel.bestModel
print(best_lr.coefficients)
print(best_lr.intercept)
print(best_lr.getMaxIter())
print(best_lr.getRegParam())

In [None]:
# Score the best cross-validated model on the validation frame with a fresh evaluator (default metric).
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))

In [None]:
# Same evaluation, reporting area under the precision-recall curve.
BinaryClassificationEvaluator().setMetricName("areaUnderPR").evaluate(cvmodel.bestModel.transform(adultvalid))

In [None]:
# Same evaluation with the ROC metric named explicitly (an earlier cell's comment states this is the default metric).
BinaryClassificationEvaluator().setMetricName("areaUnderROC").evaluate(cvmodel.bestModel.transform(adultvalid))