# Salary prediction by vacancy description

## Reading dataset

Init pyspark session

In [1]:
from __future__ import division, print_function, unicode_literals # For the compatibility with Python 2

In [2]:
from pyspark.sql import SparkSession
spark_session = SparkSession.builder\
                            .enableHiveSupport()\
                            .appName("spark sql")\
                            .master("local[4]")\
                            .getOrCreate()

Load train dataset placed at `/data/vacancie` with at least 10 partitions (use function `repartition` for this case)

In [39]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([
        StructField("Elevation", IntegerType(), False),
        StructField("Aspect", IntegerType(), False),
        StructField("Slope", IntegerType(), False),
       StructField("Horizontal_Distance_To_Hydrology", IntegerType(), False),
       StructField("Vertical_Distance_To_Hydrology", IntegerType(), False),
       StructField("Horizontal_Distance_To_Roadways", IntegerType(), False),
       StructField("Hillshade_9am", IntegerType(), False),
       StructField("Hillshade_Noon", IntegerType(), False),
       StructField("Hillshade_3pm", IntegerType(), False),
       StructField("Horizontal_Distance_To_Fire_Points", IntegerType(), False),
       StructField("Wild_Type", StringType(), False),
       StructField("Soil_Type", StringType(), False),
       StructField("Target", IntegerType(), False)
    ])

In [40]:
trees = spark_session.read.csv("/data/covertype2", header="true",schema=schema)

In [42]:
trees.show(2,truncate = False)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+---------------------------------------------------------------------------+------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wild_Type|Soil_Type                                                                  |Target|
+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+---------------------------------------------------------------------------+------+
|3122     |266   |10   |433                             |75                            |3069                           |195 

## Transforming dataset

In [43]:
from pyspark.ml.feature import StringIndexer

stringIndexer = StringIndexer(inputCol = "Wild_Type",outputCol="Wild_Type_si")
trees_2 = stringIndexer.fit(trees).transform(trees)

In [44]:
stringIndexer2 = StringIndexer(inputCol = "Soil_Type",outputCol="Soil_Type_si")
trees_3 = stringIndexer2.fit(trees_2).transform(trees_2)

In [45]:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCol="Wild_Type_si", outputCol="Wild_Type_oh")
tree_oh = encoder.transform(trees_3)

encoder = OneHotEncoder(inputCol="Soil_Type_si", outputCol="Soil_Type_oh")
tree_oh_2 = encoder.transform(tree_oh)

In [60]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
assembler = VectorAssembler(
    inputCols=["Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm","Horizontal_Distance_To_Fire_Points","Wild_Type_oh","Soil_Type_oh"], outputCol="features")

data = assembler.transform(tree_oh_2)

In [61]:
data_final = data.select("target","features")

# Fitting model

Split the dataset to train and validation part (it is better to use 90% for the train part and 10% for the validation part)

In [62]:
data_final.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------+
|target|features                                                                                                 |
+------+---------------------------------------------------------------------------------------------------------+
|1     |(52,[0,1,2,3,4,5,6,7,8,9,11,15],[3122.0,266.0,10.0,433.0,75.0,3069.0,195.0,245.0,188.0,451.0,1.0,1.0])   |
|1     |(52,[0,1,2,3,4,5,6,7,8,9,10,13],[3018.0,308.0,15.0,60.0,14.0,5359.0,177.0,229.0,192.0,4546.0,1.0,1.0])   |
|2     |(52,[0,1,2,3,4,5,6,7,8,9,10,13],[3146.0,151.0,12.0,541.0,-2.0,5887.0,236.0,240.0,132.0,1371.0,1.0,1.0])  |
|2     |(52,[0,1,2,3,4,5,6,7,8,9,10,13],[2980.0,163.0,6.0,553.0,21.0,3538.0,226.0,242.0,149.0,1087.0,1.0,1.0])   |
|2     |(52,[0,1,2,3,4,5,6,7,8,9,10,19],[2972.0,187.0,16.0,255.0,109.0,6390.0,220.0,250.0,158.0,4119.0,1.0,1.0]) |
|2     |(52,[0,1,2,3,4,5,6,7,8,9,11,22],[2768.0,17.0,13.0,30.0,4.0,3140.0,209.0,

In [63]:
trainData,testData = data_final.randomSplit([0.7,0.3],seed = 23)

Fit the Logistic Regression to the model on the splitted train part. Use about 15 iterations for the training process.

<b>Hint.</b> Use regularization parameter in order to prevent overfitting.

In [53]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

In [67]:
rf = RandomForestClassifier(labelCol='target',featuresCol= "features",numTrees=100)

In [68]:
rfModel = rf.fit(trainData)

In [69]:
predictions = rfModel.transform(testData)

In [70]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = "target", predictionCol="prediction")
evaluator.evaluate(predictions)

0.6469443319580774

# Performing test submission

In [None]:
# Load dataset

In [None]:
# Transform dataset and calculate auc-roc

In [None]:
# Output for the AUC-ROC