In [1]:
import pandas as pd
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession


In [None]:
## Vectors.sparse practice (Vectors.sparseの練習)

In [36]:
# Expand two 4-element sparse vectors, each built from (index, value) pairs,
# into dense arrays so the positions that were not listed show up as zeros.
for index_value_pairs in ([(0, 1.0), (3, -2.0)], [(0, 99), (3, -100)]):
    print(Vectors.sparse(4, index_value_pairs).toArray())

[ 1.  0.  0. -2.]
[  99.    0.    0. -100.]


In [37]:
# `spark` is used below but never created in this notebook. Obtain (or reuse) a session
# here so the cells also run on a fresh kernel that does not pre-define it.
spark = SparkSession.builder.getOrCreate()

# Four sample rows of 4-dimensional feature vectors, mixing sparse and dense encodings.
# Each row is a 1-tuple so that it maps onto the single "features" column.
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
df = spark.createDataFrame(data, ["features"])


In [40]:
# Print the rows as a text table (defaults spelled out: up to 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



In [42]:
# Correlation.corr hands its result back wrapped in a DataFrame; check the type.
corr_df = Correlation.corr(df, "features")
type(corr_df)

pyspark.sql.dataframe.DataFrame

In [43]:
# head() fetches the first row of the correlation result DataFrame.
tmp = Correlation.corr(df, "features").head()
# End the cell with an expression: a bare assignment displays nothing, so the recorded
# `pyspark.sql.types.Row` output could not have come from the assignment alone.
type(tmp)


pyspark.sql.types.Row

In [41]:
# Column index 2 of `df` is 0.0 in every row (zero variance), which shows up as `nan`
# in the off-diagonal entries of row/column 2 of both matrices.
pearson_corr = Correlation.corr(df, "features")
r1 = pearson_corr.head()
print("Pearson correlation matrix:\n" + str(r1[0]))

spearman_corr = Correlation.corr(df, "features", "spearman")
r2 = spearman_corr.head()
print("Spearman correlation matrix:\n" + str(r2[0]))



Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])
Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])


In [1]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

# Prepare training data from a list of (label, features) tuples.
columns = ["label", "features"]
training = spark.createDataFrame(
    [
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    columns,  # reuse the names declared above instead of repeating the literal
)

In [6]:
# Three views of the same training rows. A bare expression is rendered only when it is the
# last statement of a cell, so the pandas view has to go through display() to be shown;
# previously its result was collected and silently discarded.
display(training.toPandas())
training.show(n=20, truncate=True, vertical=False)  # tabular layout
training.show(n=20, truncate=True, vertical=True)   # one "-RECORD n-" section per row

-RECORD 0------------------
 label    | 1.0            
 features | [0.0,1.1,0.1]  
-RECORD 1------------------
 label    | 0.0            
 features | [2.0,1.0,-1.0] 
-RECORD 2------------------
 label    | 0.0            
 features | [2.0,1.3,1.0]  
-RECORD 3------------------
 label    | 1.0            
 features | [0.0,1.2,-0.5] 



In [9]:
# Build the LogisticRegression Estimator with an iteration cap of 10 and a
# regularization parameter of 0.01.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.01,
)
# To list every parameter with its documentation and default value, run:
#   print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

In [10]:
# Fit on the training DataFrame using the parameters currently stored on `lr`.
model1 = lr.fit(dataset=training)

In [11]:
# model1 is a Model (a transformer produced by an Estimator), so the parameters used
# during fit() can be read back from it. The map holds (Param: value) pairs; each Param
# names the unique ID of this LogisticRegression instance as its parent.
model1_params = model1.extractParamMap()
print("Model 1 was fit using parameters: ")
print(model1_params)

Model 1 was fit using parameters: 
{Param(parent='LogisticRegression_c59d6c3cdea0', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_c59d6c3cdea0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_c59d6c3cdea0', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto', Param(parent='LogisticRegression_c59d6c3cdea0', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_c59d6c3cdea0', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_c59d6c3cdea0', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_c59d6c3cdea0', name='maxBlockSizeInMB', do

In [21]:
# A paramMap is a plain Python dictionary keyed by the estimator's Param objects.
paramMap = {lr.maxIter: 20}
# Writing the same Param again replaces the earlier value (20 -> 30).
paramMap.update({lr.maxIter: 30})
# Further Params are added one key at a time.
paramMap[lr.regParam] = 0.1  # type: ignore
paramMap[lr.threshold] = 0.55  # type: ignore


In [35]:
# paramMaps are ordinary dictionaries, so they can be merged.
# This second map renames the probability output column.
paramMap2 = {lr.probabilityCol: "myProbability"}  # type: ignore
# Merge into a new dict and leave paramMap untouched; on a key clash paramMap2 would win.
paramMapCombined = {**paramMap, **paramMap2}  # type: ignore

In [52]:
from pprint import pprint
# Pretty-print the merged param map with a 4-space indent.
pprint(paramMapCombined, indent=4)

{   Param(parent='LogisticRegression_c59d6c3cdea0', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'myProbability',
    Param(parent='LogisticRegression_c59d6c3cdea0', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
    Param(parent='LogisticRegression_c59d6c3cdea0', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.55,
    Param(parent='LogisticRegression_c59d6c3cdea0', name='maxIter', doc='max number of iterations (>= 0).'): 30}


In [53]:
# Fit a second model, this time handing paramMapCombined to fit().
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, params=paramMapCombined)
model2_params = model2.extractParamMap()
print("Model 2 was fit using parameters: ")
print(model2_params)

Model 2 was fit using parameters: 
{Param(parent='LogisticRegression_c59d6c3cdea0', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_c59d6c3cdea0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_c59d6c3cdea0', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto', Param(parent='LogisticRegression_c59d6c3cdea0', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_c59d6c3cdea0', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_c59d6c3cdea0', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_c59d6c3cdea0', name='maxBlockSizeInMB', do

In [54]:
# Test rows in the same (label, features) layout as the training data.
test_rows = [
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
]
test = spark.createDataFrame(test_rows, ["label", "features"])

In [57]:
# This cell sits before the cell that assigns `prediction`, so on a fresh top-to-bottom
# run the name would not exist yet. Derive it here from model2 and test, both defined above.
prediction = model2.transform(test)
prediction.show(2)

+-----+--------------+--------------------+--------------------+----------+
|label|      features|       rawPrediction|       myProbability|prediction|
+-----+--------------+--------------------+--------------------+----------+
|  1.0|[-1.0,1.5,1.3]|[-2.8046567890310...|[0.05707304993572...|       1.0|
|  0.0|[3.0,2.0,-0.1]|[2.49587585164645...|[0.92385219564432...|       0.0|
+-----+--------------+--------------------+--------------------+----------+
only showing top 2 rows



In [55]:
# Score the test rows with Transformer.transform(); LogisticRegression.transform only
# uses the 'features' column. Because lr.probabilityCol was renamed earlier, model2
# writes a "myProbability" column instead of the usual 'probability' column.
prediction = model2.transform(test)
selected = prediction.select("features", "label", "myProbability", "prediction")
result = selected.collect()

row_template = "features=%s, label=%s -> prob=%s, prediction=%s"
for row in result:
    fields = (row.features, row.label, row.myProbability, row.prediction)
    print(row_template % fields)

features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.0570730499357254,0.9429269500642746], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9238521956443227,0.07614780435567725], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.10972780286187778,0.8902721971381222], prediction=1.0
