# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from Parquet files. It must contain a feature vector column named "features" and a classification column.

## Imports

In [28]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassifier

## Configure Spark Session

In [29]:
# Build (or reuse) a Spark session for this notebook, running on the
# local master with the "local[*]" setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [30]:
# Location of the Parquet dataset, relative to the notebook's working directory.
parquetFile = './features/'

# Cache the DataFrame: it is counted and previewed right below.
data = spark.read.parquet(parquetFile).cache()

print(f"Total number of data: {data.count()}")

# Preview five rows only. limit() is applied in Spark *before* the conversion,
# so the whole dataset is not collected to the driver just to display its head.
data.limit(5).toPandas()

Total number of data: 10660


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RXQ.D,0.662722,0.065089,0.272189,alpha+beta,"[-0.49331762155486364, 0.2271454499266242, 0.0..."
1,1RYL.B,0.550336,0.154362,0.295302,alpha+beta,"[-0.3640512899013169, 0.18834858465692905, -0...."
2,1RYO.A,0.367284,0.212963,0.419753,alpha+beta,"[-0.3282523089562731, -0.01581098698508109, -0..."
3,1RYP.E,0.35124,0.260331,0.38843,alpha+beta,"[-0.48476110234918435, 0.09152798528330887, 0...."
4,1RYP.N,0.321888,0.313305,0.364807,alpha+beta,"[-0.21472580679531755, 0.2948544846309734, 0.0..."


## Select only alpha and beta foldType

In [31]:
# Keep only the rows whose foldType is 'alpha' or 'beta'.
data = data.where(data.foldType.isin('alpha', 'beta'))

print(f"Total number of data: {data.count()}")

# Preview five rows only. limit() avoids collecting the full dataset to the
# driver just to display its head.
data.limit(5).toPandas()

Total number of data: 2584


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RZ4.A,0.676056,0.028169,0.295775,alpha,"[-0.46955591383079687, 0.2041911859748264, 0.0..."
1,1RZH.L,0.619217,0.035587,0.345196,alpha,"[-0.4564491557389764, 0.1894280703239409, -0.0..."
2,1RZH.M,0.634551,0.039867,0.325581,alpha,"[-0.5367502944172646, 0.17530410278290268, -0...."
3,1S0P.B,0.755682,0.011364,0.232955,alpha,"[-0.417293982516442, 0.29370996700067603, 0.15..."
4,1S2X.A,0.772222,0.0,0.227778,alpha,"[-0.21703443489968777, 0.2690418922251499, 0.1..."


## Basic dataset information and settings

In [72]:
# Settings shared by the classifier cells.
label = 'foldType'    # column that holds the class to predict
testFraction = 0.1    # handed to SparkMultiClassClassifier together with `seed`
seed = 123

# The length of the first row's feature vector gives the feature count.
vector = data.first()["features"]
featureCount = len(vector)
print(f"Feature count    : {featureCount}")

# Number of distinct values found in the label column.
classCount = data.select(label).distinct().count()
print(f"Class count    : {classCount}")

# Size and per-class counts before balancing.
print(f"Dataset size (unbalanced)    : {data.count()}")
data.groupBy(label).count().show(classCount)

# NOTE(review): this rebinds `data`, so re-running the cell feeds the
# already-downsampled set into downsample() again. Re-run the cells above
# first to start from the unbalanced data.
# The third argument is presumably a random seed -- TODO confirm.
data = datasetBalancer.downsample(data, label, 1)

# Size and per-class counts after balancing.
print(f"Dataset size (balanced)  : {data.count()}")
data.groupBy(label).count().show(classCount)

Feature count    : 50
Class count    : 2
Dataset size (unbalanced)    : 1323
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  660|
|   alpha|  663|
+--------+-----+

Dataset size (balanced)  : 1319
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  660|
|   alpha|  659|
+--------+-----+



## Decision Tree Classifier

In [73]:
# Decision tree created with its default constructor arguments.
dtc = DecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)

# fit() returns a mapping; print each entry as a tab-separated "name<TAB>value" line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	598	62
alpha	589	70

Sample predictions: DecisionTreeClassifier
+----------------+-----------+-----------+----------+--------+--------------------+------------+-------------+--------------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+-------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064| 0.50537634| 0.4623656|    beta|[-0.2263692109704...|         0.0| [284.0,13.0]|[0.95622895622895...|       0.0|          beta|
|          5LS7.A|        0.0|       0.64|      0.36|    beta|[-0.6489516260102...|         0.0|   [17.0,8.0]|         [0.68,0.32]|       0.0|          beta|
|          3SGR.C|        0.0|  0.8333333|0.16666667|    beta|[-0.3402115559826...|         0.0|    [3.0,1.0]|         [0.

## Random Forest Classifier

In [74]:
# Random forest created with its default constructor arguments.
rfc = RandomForestClassifier()
mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)

# fit() returns a mapping; print each entry as a tab-separated "name<TAB>value" line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	598	62
alpha	589	70

Sample predictions: RandomForestClassifier
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064| 0.50537634| 0.4623656|    beta|[-0.2263692109704...|         0.0|[17.3599575895132...|[0.86799787947566...|       0.0|          beta|
|          5LS7.A|        0.0|       0.64|      0.36|    beta|[-0.6489516260102...|         0.0|[12.6165559555537...|[0.63082779777768...|       0.0|          beta|
|          3SGR.C|        0.0|  0.8333333|0.16666667|    beta|[-0.3402115559826...|    

## Logistic Regression Classifier

In [76]:
# Logistic regression created with its default constructor arguments.
lr = LogisticRegression()
mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)

# fit() returns a mapping; print each entry as a tab-separated "name<TAB>value" line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	598	62
alpha	589	70

Sample predictions: LogisticRegression
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064| 0.50537634| 0.4623656|    beta|[-0.2263692109704...|         0.0|[1.67549553269351...|[0.84230714368222...|       0.0|          beta|
|          5LS7.A|        0.0|       0.64|      0.36|    beta|[-0.6489516260102...|         0.0|[3.64497496478225...|[0.9745429274563,...|       0.0|          beta|
|          3SGR.C|        0.0|  0.8333333|0.16666667|    beta|[-0.3402115559826...|        

## Simple Multilayer Perceptron Classifier

In [None]:
# Layer sizes: featureCount inputs, two 32-unit layers, classCount outputs.
layers = [featureCount, 32, 32, classCount]

# Configure the perceptron through its setter chain.
mpc = (
    MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234)
    .setMaxIter(100)
)
mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)

# fit() returns a mapping; print each entry as a tab-separated "name<TAB>value" line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	598	62
alpha	589	70


## Terminate Spark

In [None]:
# Shut down the Spark session that was created in the "Configure Spark Session" cell.
spark.stop()