## 1. Preparation

#### 1.1 Spark preparation

In [0]:
# import necessary libaries
from pyspark.sql.functions import *

In [0]:
# Creating Spark Session
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("ML Model")
         .getOrCreate())

sc = spark.sparkContext

#### 1.2 Prepare a UDF (User Defined Function)

We need to create UDF to embed the ML model we trained in the previous workshop. This model will be used for Posts data sentiment analysis.

In [0]:
# User defined function
def predictions_udf(df, ml_model, stringindexer):
    from pyspark.sql.functions import col, regexp_replace, lower, trim
    from pyspark.ml import PipelineModel

    # Filter out empty body text
    df = df.filter("Body is not null")
    # Making sure the naming of the columns are consistent with the model
    df = df.select(col("Body").alias("text"), col("Tags"))
    # Preprocessing of the feature column
    cleaned = df.withColumn('text', regexp_replace('text', r"http\S+", "")) \
                    .withColumn('text', regexp_replace('text', r"[^a-zA-z]", " ")) \
                    .withColumn('text', regexp_replace('text', r"\s+", " ")) \
                    .withColumn('text', lower('text')) \
                    .withColumn('text', trim('text')) 

    # Load in the saved pipeline model
    model = PipelineModel.load(ml_model)

    # Making the prediction
    prediction = model.transform(df)

    predicted = prediction.select(col('text'), col('Tags'), col('prediction'))

    # Decoding the indexer
    from pyspark.ml.feature import StringIndexerModel, IndexToString

    # Load in the StringIndexer that was saved
    indexer = StringIndexerModel.load(stringindexer)

    # Initialize the IndexToString converter
    i2s = IndexToString(inputCol = 'prediction', outputCol = 'decoded', labels = indexer.labels)
    converted = i2s.transform(predicted)

    # Display the important columns
    return converted

#### 1.3 Load Posts files and ML model

If you can remember our last workshop for machine learning model training, our trained model was saved to `/mnt/deBDProject/model`. Yours name might be different.

In [0]:
display(dbutils.fs.ls("/mnt/deBDProject/model"))

path,name,size,modificationTime
dbfs:/mnt/deBDProject/model/metadata/,metadata/,0,1715542985000
dbfs:/mnt/deBDProject/model/stages/,stages/,0,1715542985000


Let's load the Posts files and the ml model

In [0]:
posts = spark.read.parquet("/mnt/deBDProject/Landing/Posts/*")
ml_model = "/mnt/deBDProject/model"
stringindexer = "/mnt/deBDProject/stringindexer"

#### 1.4 Run model to do `Sentiment Analysis`

In [0]:
# # Producing the sentiment analysis
result = predictions_udf(posts,ml_model, stringindexer)
display(result)

text,Tags,prediction,decoded
"""I've taken a look at your stringToHex method and it seems to be incorrect. Try this one instead: StringBuilder rep = new StringBuilder(); for (byte b : base.getBytes) { rep.append(Integer.toString((b & 0xff) + 0x100, 16).substring(1)); } System.out.println(rep); Also I found this TripleDes with Padding example; you could try with the algorithm and transformation the example uses. """,,0.0,c#
"I try to do a migration to Windows Server 2008 and i am new with script concept. I have a .sh file to launch which contains : cd RADIOROOT/PLAYERS killall player_1 su - robotstream --command='/RADIOROOT/PLAYERS/player_1 -t /RADIOROOT/PLAYERS/player_1.conf' & > dev/null First, how can i launch this .sh file on my IIS ? Do i have to change it in a powershell script ? I tried to change it to a .ps1 file with some changes but not working. Anyone know how to do it ? Then, do you have any website about changing this sort of script in order to make it work ? Thanks for your help.",,13.0,html
"final OnClickLisener listener = new OnClickListener() { public void onClick(View v){ switch(v.getId()){ case R.id.zero: break; case R.id.one: break; case R.id.two: break; } } } final int[] btnIds = new int[]{R.id.one, R.id.two, R.id.zero}; for(int i = 0; i < btnIds.length; i++) { final Button btn = (Button)findViewById(btnIds[i]); btn.setOnClickListener(listener); }",,1.0,java
"My question is so simple. Imagine we have a Foo class. and has a T property. public class Foo<T> { public T Property { get; set; } } I want to implement in it IXmlSerializable and my problem is I have no idea to imitate the standard XmlSerialization. I really have no idea about how to write this property like the standart XmlSerlalization. UPDATE: I did this and it's not working public class Foo<T> : IFoo where T : IXmlSerializable { public T Value { get; set; } } public class FooA : Foo<string> { } public class FooB : Foo<int> { } public class FooC : Foo<List<Double>> { } When I serialized this classes (A, B and C). I wanna have something like this: Using first class: <FooA> <Value>StringDemo</Value> </FooA> Using second class: <FooB> <Value>2</Value> </FooB> Using third class: <FooC> <Value> <ArrayOfDouble xsi:..> <Double>3.1416</Double> <Double>4.2</Double> </ArrayOfDouble> </Value> </FooC> Something like this is what I wanted, I don't want to implement in all of this a custom XmlSerializer, I mean the default which the standard serialization use.",,1.0,java
"I tried to place const double NA = 1e-300; in the header file for Cocoa-Touch but Xcode doesn't like it. However when I did this in Cocoa(64 bit MAC), it works just fine. ???????? Errors: Duplicate symbol _NA in /Users/jdl/Library/Developer/Xcode/DerivedData/iExperiment-chcmotcyeolxeacnidtlgofkcihz/Build/Intermediates/iExperiment.build/Debug-iphonesimulator/iExperiment.build/Objects-normal/i386/ViewController.o and /Users/jdl/Library/Developer/Xcode/DerivedData/iExperiment-chcmotcyeolxeacnidtlgofkcihz/Build/Intermediates/iExperiment.build/Debug-iphonesimulator/iExperiment.build/Objects-normal/i386/AppDelegate.o for architecture i386 Command /Developer/Platforms/iPhoneSimulator.platform/Developer/usr/bin/clang failed with exit code 1 #import <UIKit/UIKit.h> const double NA = 1e-300; // <------------- Error in Cocoa-Touch but not Cocoa ?? // So where do I put this so it doesn't error out in Cocoa-Touch?? @interface ViewController : UIViewController { UITextView *tvText; } @end",,6.0,iphone
"""I have an application which receives data from a news website (through rss) and puts articles in a tableview. User can select an article from the list to view the detail on a webview. On the detail view user can click on a button to save data in the database. I'm able to save articles and show them in """"favourites"""" table view but the problem is that for now i'm saving and using the link to the article and loading it when user selects a saved article. While i want to save article and allow user to read it in offline mode. I'm using core data. Is there a way to save the whole webview? """,,177.0,autocomplete
"""I have a micropost feature and was testing the way it formats text that has been posted when displaying back to the user. I pasted the following text like this: and this was displayed back to me: I'm using """"simple_format h(content)"""". When I remove the helper the text is displayed with out a new line from the word """"In"""". It displays as one big paragraph so I assume the helper is working but for some reason my double new lines are being ignored. Any idea what is going on? Am I missing something? """,,920.0,soap
"Your TBase class currently does not implement IDerived, it only implements IBase. Your TDerived class implements both IBase (by inheriting from TBase) and IDerived (explicitly). You should either Change your TBase declaration to TBase =class(TInterfacedObject,IBase, IDerived) and implement the IDerived methods in TBase. or change b:=TBase.Create; to b:=TDerived.Create;",,1.0,java
"I have a function that basically strips numerics out of a selection. This function works great on English based character sets. I have a new set of data that is comprised of non-English characters (Chinese, Japanese, Indian etc.), and I am not getting the desired effect. I am using the NVarchar set to store any records that could be non-English. Below is the function that I am currently using. Any advice would be really helpful. GO /****** Object: UserDefinedFunction [dbo].[StripVenName] Script Date: 05/03/2012 16:24:46 ******/ SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER OFF GO CREATE FUNCTION [dbo].[StripVenName] (@InString as VarChar(8000)) RETURNS NVarChar(100) AS BEGIN Declare @ReturnVal as NVarchar(4000) Declare @OutString as NVarchar(4000) Declare @Pos as smallint Declare @CurChar as NVarChar(1) IF LEN(@InString) = 0 SET @ReturnVal = '' ELSE BEGIN Set @Pos = 1 SET @OutString = ' ' WHILE (@Pos <= Len(@InString)) BEGIN Set @CurChar = SUBSTRING(@InString, @Pos, 1) if ASCII(@CurChar) between 65 and 90 or ASCII(@CurChar) between 97 and 122 SET @OutString = @OutString + @CurChar Set @Pos = @Pos + 1 END if len(@OutString) = 0 SET @OutString = @InString SET @ReturnVal = LTRIM(RTRIM(@OutString)) END RETURN @ReturnVal END",,14.0,sql
"I got some typical jasmine code such as: expect(actual).someMatcher(expected); Is there a way to determine whether this matching succeed or failed (programmatically)? Depending on the result, I might want to do a certain operation afterwards. For example, if it fails, write the actual to disk. Using the matcher's return value doesn't seem to work. It always returns undefined.",,20.0,c


#### 1.5 Summarize which topics are the most popular

In [0]:
# change the column name 
topics = result.withColumnRenamed('decoded', 'topic').select('topic')

# Aggregate the topics and calculate the total qty of each topic
topic_qty = topics.groupBy(col("topic")).agg(count('topic').alias('qty')).orderBy(desc('qty'))
topic_qty.show()

+-------------+---+
|        topic|qty|
+-------------+---+
|           c#|315|
|         java|220|
|    hibernate|180|
|   javascript|178|
|          php|106|
|      android|100|
|       jquery| 83|
|          c++| 82|
|  objective-c| 54|
|       python| 54|
|       iphone| 50|
|        mysql| 43|
|         .net| 38|
|ruby-on-rails| 35|
|   sql-server| 33|
|      asp.net| 33|
|          ios| 32|
|          css| 26|
|         html| 22|
|          sql| 22|
+-------------+---+
only showing top 20 rows



#### 1.6 Save the result file to the `BI` folder


In [0]:
# define this function

def crt_sgl_file(result_path):
        # write the result to a folder container several files
        path = "/mnt/deBDProject/BI/ml_result"
        topic_qty.write.option("delimiter", ",").option("header", "true").mode("overwrite").csv(path)

        # list the folder, find the csv file 
        filenames = dbutils.fs.ls(path)
        name = ''
        for filename in filenames:
            if filename.name.endswith('csv'):
                org_name = filename.name

        # copy the csv file to the path you want to save, in this example, we use  "/mnt/deBDProject/BI/ml_result.csv"
        dbutils.fs.cp(path + '/'+ org_name, result_path)

        # delete the folder
        dbutils.fs.rm(path, True)

        print('single file created')

In [0]:
# run the function
result_path = "/mnt/deBDProject/BI/ml_result.csv"

crt_sgl_file(result_path)

single file created


In [0]:
# the end