# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from a Parquet file. It must contain a feature vector column named "features" and a classification column.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassifier

## Configure Spark Session

In [2]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [3]:
parquetFile = './features/'
# Cache the DataFrame: it is counted and previewed here and re-used by later cells.
data = spark.read.parquet(parquetFile).cache()

print(f"Total number of data: {data.count()}")
# Preview five rows only. Applying limit() before toPandas() avoids collecting
# the whole dataset to the driver just to display its head.
data.limit(5).toPandas()

Total number of data: 2174


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,3CE2.A,0.657143,0.026891,0.315966,alpha,"[-0.3523045679806419, -0.008636907063050247, 0..."
1,3CEU.B,0.370558,0.177665,0.451777,alpha+beta,"[-0.38101523799786086, -0.008424269855378917, ..."
2,3CH0.A,0.338235,0.194853,0.466912,alpha+beta,"[-0.29024044258478765, 0.02571477163308111, 0...."
3,3CHH.A,0.68,0.006667,0.313333,alpha,"[-0.45816402270583745, -0.15243948006641064, 0..."
4,3CI0.I,0.313253,0.409639,0.277108,alpha+beta,"[-0.33645407597280363, -0.23910576706244188, 0..."


## Select only alpha and beta foldType

In [4]:
# Keep only the rows whose fold type is 'alpha' or 'beta'.
data = data.where(data.foldType.isin('alpha', 'beta'))

print(f"Total number of data: {data.count()}")
# Preview five rows only. Applying limit() before toPandas() avoids collecting
# the whole filtered dataset to the driver just to display its head.
data.limit(5).toPandas()

Total number of data: 498


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,3CE2.A,0.657143,0.026891,0.315966,alpha,"[-0.3523045679806419, -0.008636907063050247, 0..."
1,3CHH.A,0.68,0.006667,0.313333,alpha,"[-0.45816402270583745, -0.15243948006641064, 0..."
2,3FH3.A,0.780822,0.013699,0.205479,alpha,"[-0.15787777530954808, -0.004312350444353311, ..."
3,3FHG.A,0.705314,0.0,0.294686,alpha,"[-0.31876067750300596, 0.02605013251621095, 0...."
4,3FYR.B,0.72973,0.0,0.27027,alpha,"[-0.005325594758416744, 0.052406518898428754, ..."


## Basic dataset information and settings

In [5]:
# Settings shared by the classifier cells below.
label = 'foldType'
testFraction = 0.3
seed = 123

# Length of the feature vector, taken from the first row of the dataset.
vector = data.first()["features"]
featureCount = len(vector)
print(f"Feature count    : {featureCount}")

# Number of distinct values found in the label column.
distinctLabels = data.select(label).distinct()
classCount = int(distinctLabels.count())
print(f"Class count    : {classCount}")

# Size and per-class row counts before balancing.
unbalancedSize = data.count()
print(f"Dataset size (unbalanced)    : {unbalancedSize}")
data.groupby(label).count().show(classCount)

# NOTE(review): the third argument is presumably a random seed, and it differs
# from `seed` above - confirm against datasetBalancer.downsample.
data = datasetBalancer.downsample(data, label, 1)

# Size and per-class row counts after balancing.
balancedSize = data.count()
print(f"Dataset size (balanced)  : {balancedSize}")
data.groupby(label).count().show(classCount)

Feature count    : 50
Class count    : 2
Dataset size (unbalanced)    : 498
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  119|
|   alpha|  379|
+--------+-----+

Dataset size (balanced)  : 244
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  119|
|   alpha|  125|
+--------+-----+



## Decision Tree Classifier

In [6]:
# Fit a decision tree (default parameters) through the shared classifier wrapper.
dtc = DecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
metrics = mcc.fit(data)

# Print each returned entry as a tab-separated "name<TAB>value" line.
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	92	33
beta	79	40

Sample predictions: DecisionTreeClassifier
+----------------+-----------+-----------+----------+--------+--------------------+------------+-------------+-----------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|rawPrediction|probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+-------------+-----------+----------+--------------+
|          5MUN.A|0.032967035|  0.5769231| 0.3901099|    beta|[-0.2010753329213...|         1.0|   [0.0,48.0]|  [0.0,1.0]|       1.0|          beta|
|          4L1D.C|0.027522936|  0.4587156|0.51376146|    beta|[-0.5289961680112...|         1.0|   [0.0,17.0]|  [0.0,1.0]|       1.0|          beta|
|          2GFN.B| 0.78238344|        0.0|0.21761657|   alpha|[-0.3541173004128...|         0.0|   [53.0,0.0]|  [1.0,0.0]|       0.0|         alpha|
|          3BS7.B|  

## Random Forest Classifier

In [7]:
# Fit a random forest (default parameters) through the shared classifier wrapper.
rfc = RandomForestClassifier()
mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
metrics = mcc.fit(data)

# Print each returned entry as a tab-separated "name<TAB>value" line.
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	92	33
beta	79	40

Sample predictions: RandomForestClassifier
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          5MUN.A|0.032967035|  0.5769231| 0.3901099|    beta|[-0.2010753329213...|         1.0|[6.41263736263736...|[0.32063186813186...|       1.0|          beta|
|          4L1D.C|0.027522936|  0.4587156|0.51376146|    beta|[-0.5289961680112...|         1.0|[8.95020212704894...|[0.44751010635244...|       1.0|          beta|
|          2GFN.B| 0.78238344|        0.0|0.21761657|   alpha|[-0.3541173004128...|      

## Logistic Regression Classifier

In [8]:
# Fit a logistic regression (default parameters) through the shared classifier wrapper.
lr = LogisticRegression()
mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
metrics = mcc.fit(data)

# Print each returned entry as a tab-separated "name<TAB>value" line.
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	92	33
beta	79	40

Sample predictions: LogisticRegression
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          5MUN.A|0.032967035|  0.5769231| 0.3901099|    beta|[-0.2010753329213...|         1.0|[83.9875027392862...|[1.0,3.3472720527...|       0.0|         alpha|
|          4L1D.C|0.027522936|  0.4587156|0.51376146|    beta|[-0.5289961680112...|         1.0|[-332.03466178834...|[6.29765093113241...|       1.0|          beta|
|          2GFN.B| 0.78238344|        0.0|0.21761657|   alpha|[-0.3541173004128...|         0

## Simple Multilayer Perceptron Classifier

In [9]:
# Network shape: one input per feature, a 10-unit middle layer, one output per class.
layers = [featureCount, 10, classCount]
mpc = (
    MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234)
    .setMaxIter(200)
)
mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
metrics = mcc.fit(data)

# Print each returned entry as a tab-separated "name<TAB>value" line.
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	92	33
beta	79	40

Sample predictions: MultilayerPerceptronClassifier
+----------------+-----------+-----------+----------+--------+--------------------+------------+----------+--------------+
|structureChainId|      alpha|       beta|      coil|foldType|            features|indexedLabel|prediction|predictedLabel|
+----------------+-----------+-----------+----------+--------+--------------------+------------+----------+--------------+
|          5MUN.A|0.032967035|  0.5769231| 0.3901099|    beta|[-0.2010753329213...|         1.0|       1.0|          beta|
|          4L1D.C|0.027522936|  0.4587156|0.51376146|    beta|[-0.5289961680112...|         1.0|       1.0|          beta|
|          2GFN.B| 0.78238344|        0.0|0.21761657|   alpha|[-0.3541173004128...|         0.0|       0.0|         alpha|
|          3BS7.B|  0.7866667|        0.0|0.21333334|   alpha|[-0.5531218039815...|         0.0|       0.0|         alpha|
|          4M5E.A|  0.6296296|0.014814815|0.3

## Terminate Spark

In [10]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()