In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import Row
from pyspark.sql.functions import col,pandas_udf, PandasUDFType,count
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
import nltk

# Load the tagged dataset from a Databricks table (`some_file` holds the table
# name and must be defined before this cell runs), then mark it for caching
# because later cells derive further DataFrames from it.
df = spark.table(some_file)
df.cache()

In [2]:
# Discard every row that contains a null in any column, then show how many
# examples are left for each label value.
cleardf = df.dropna()
label_counts = cleardf.groupBy('label').count()
display(label_counts)

label,count
1,6367
0,29281


In [3]:
from pyspark.sql.types import IntegerType

# Replace the label column with an integer-typed copy of itself; every other
# column of cleardf is carried over unchanged.
label_as_int = cleardf["label"].cast(IntegerType())
alldf = cleardf.withColumn("label", label_as_int)

In [4]:
# 65% / 35% random partition of the cleaned data; the seed is fixed so the
# same split is requested on every run.
splits = alldf.randomSplit([0.65, 0.35], seed=2018)
train_df = splits[0]
test_df = splits[1]

# Report the size of each partition (each count() triggers a Spark job).
train_count = train_df.count()
print("Training Dataset Count: " + str(train_count))
test_count = test_df.count()
print("Test Dataset Count: " + str(test_count))

In [5]:
# Mark both splits for caching: the training split is read by the
# cross-validated fit and the test split by the evaluation cell.
train_df.cache()
test_df.cache()

In [6]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer,StopWordsRemover,VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from mleap.sklearn.preprocessing.data import FeatureExtractor, LabelEncoder, ReshapeArrayToN1
from pyspark.ml.evaluation import RegressionEvaluator,MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import *

In [7]:
# Turkish stop-word list shipped with Spark. This is a plain Python list of
# strings; it is handed to StopWordsRemover below and is NOT a pipeline stage.
stopWordstr = StopWordsRemover.loadDefaultStopWords("turkish")

# Tokenise the "Text" column into "words", lower-casing the input and using the
# pattern as the delimiter between tokens (RegexTokenizer's default mode).
# NOTE(review): the alternatives `\|\"` and `\|\#` match the literal
# two-character sequences |" and |# -- confirm whether separate alternatives
# (e.g. a backslash, a quote, a hash) were intended. Pattern left unchanged.
regexTokenizer = RegexTokenizer(inputCol="Text", outputCol="words", pattern=' |,|;|-|_|\*|\t|\!|\.|\*|\:|\(|\|\"|\&|\$|\|\#|\}|\]|\[|\)|\{|\/|\'|<|>',toLowercase=True)

# Drop the Turkish stop words from the token list.
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopWordstr)

# Hash the remaining tokens into a term-frequency vector.
hashtf = HashingTF(inputCol="filtered", outputCol='tf')

# Re-weight the term frequencies by inverse document frequency.
idf = IDF(inputCol='tf', outputCol="tffeatures")

# Concatenate the raw TF vector and the TF-IDF vector into the single
# "features" column read by the classifier.
va = VectorAssembler(inputCols=["tf", "tffeatures"], outputCol="features")

# Classifier left on its defaults; hyper-parameters are supplied through the
# parameter grid when the pipeline is cross-validated.
lr = LogisticRegression()

# Only Estimator/Transformer objects may be pipeline stages. The stop-word
# list used to be the first entry here, which makes Pipeline.fit raise
# TypeError; it already reaches the pipeline through `remover`.
pipelinelr = Pipeline(stages=[regexTokenizer, remover, hashtf, idf, va, lr])



In [8]:
# Search space for cross-validation. Every parameter is pinned to a single
# candidate value, so the grid expands to exactly one configuration.
# NOTE(review): lr.maxIter is 1 -- confirm a single optimiser iteration is
# intentional.
grid_spec = [
    (lr.regParam, [0.006]),
    (idf.minDocFreq, [2]),
    (hashtf.numFeatures, [2**18]),
    (hashtf.binary, [True]),
    (lr.fitIntercept, [True]),
    (lr.standardization, [True]),
    (lr.elasticNetParam, [0.1]),
    (lr.aggregationDepth, [2]),
    (lr.maxIter, [1]),
    (lr.tol, [1e-06]),
]

grid_builder = ParamGridBuilder()
for param, candidates in grid_spec:
    grid_builder.addGrid(param, candidates)
paramGrid = grid_builder.build()

# Cross-validate the whole pipeline; the evaluator is used with its default
# settings to score each fold.
cv_evaluator = MulticlassClassificationEvaluator()
cvlr = CrossValidator(estimator=pipelinelr, estimatorParamMaps=paramGrid, evaluator=cv_evaluator)

# Fit on the training split and keep the best fitted pipeline for scoring.
cvModel = cvlr.fit(train_df)
modellr = cvModel.bestModel

In [9]:
# Score the held-out split with the best pipeline found by cross-validation.
predictions = modellr.transform(test_df)

# Keep only the two columns the evaluator reads (label cast to float). The
# result is cached because it is evaluated once per metric below; without the
# cache every evaluate() call would re-run the whole pipeline on test_df.
predictions = predictions.select(col("label").cast("Float"), col("prediction")).cache()

# One evaluator is reused for every metric: the metric name is overridden per
# call through the params argument of evaluate(). Accuracy used to be computed
# twice with two identical evaluators; it is now computed once and printed in
# both forms.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % accuracy)

f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
print("f1 = %g" % f1)

wp = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
print("weightedPrecision = %g" % wp)

wr = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
print("weightedRecall = %g" % wr)

In [10]:
%sh 
# Start from an empty export directory: delete anything left by a previous
# run, then recreate the directory the model bundle is written into.
rm -rf /tmp/model_export
mkdir /tmp/model_export

In [11]:
import sys
# Make the MLeap Python bindings at this path importable before the imports
# below run.
sys.path.append('/opt/libs/mleap/python')

# NOTE(review): these imports appear to be needed for their side effect of
# adding serializeToBundle to Spark models -- confirm against the MLeap docs.
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

# serializeToBundle is given a DataFrame produced by the fitted pipeline.
# `predictionslr` was never defined anywhere in the notebook (NameError), and
# `predictions` from the evaluation cell has been reduced to two columns, so
# the transformed test split is computed here explicitly.
predictionslr = modellr.transform(test_df)

# Write the fitted pipeline as an MLeap bundle into the export directory.
modellr.serializeToBundle("jar:file:/tmp/model_export/LR_model-json.zip", predictionslr)