# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from a Parquet file. It must contain a feature vector column named "features" and a classification column.

## Imports

In [5]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassifier

## Configure Spark Session

In [6]:
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [7]:
# Load the pre-computed feature dataset and cache it, since it is counted and
# previewed right away and then filtered by the following cells.
parquetFile = './features/'
data = spark.read.parquet(parquetFile).cache()

print(f"Total number of data: {data.count()}")
# Preview five rows only: applying limit() before toPandas() makes Spark send
# just those rows to the driver instead of converting the whole dataset.
data.limit(5).toPandas()

Total number of data: 10660


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RXQ.D,0.662722,0.065089,0.272189,alpha+beta,"[0.03355525744721125, 0.2672737870460092, 0.44..."
1,1RYL.B,0.550336,0.154362,0.295302,alpha+beta,"[-0.03064694956326521, -0.019814661751692015, ..."
2,1RYO.A,0.367284,0.212963,0.419753,alpha+beta,"[0.13665451809901036, 0.06933704541390476, 0.3..."
3,1RYP.E,0.35124,0.260331,0.38843,alpha+beta,"[-0.19166820620754696, -0.035750856561223, 0.3..."
4,1RYP.N,0.321888,0.313305,0.364807,alpha+beta,"[0.12672561553009817, 0.018070112488328897, 0...."


## Select only alpha and beta foldType

In [8]:
# Keep only the rows whose foldType is one of the selected classes.
# Append further fold types (e.g. 'other') to this list to classify more classes.
selectedFoldTypes = ['alpha', 'beta']
data = data.where(data.foldType.isin(selectedFoldTypes))

print(f"Total number of data: {data.count()}")
# Preview five rows only: applying limit() before toPandas() makes Spark send
# just those rows to the driver instead of converting the whole dataset.
data.limit(5).toPandas()

Total number of data: 2584


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RZ4.A,0.676056,0.028169,0.295775,alpha,"[-0.08727558182345496, 0.07827300824017988, 0...."
1,1RZH.L,0.619217,0.035587,0.345196,alpha,"[-0.034122437554677684, -0.08903528297419792, ..."
2,1RZH.M,0.634551,0.039867,0.325581,alpha,"[-0.141879534431021, 0.019542623172800234, 0.5..."
3,1S0P.B,0.755682,0.011364,0.232955,alpha,"[0.31434613252324717, 0.04741660307294556, 0.3..."
4,1S2X.A,0.772222,0.0,0.227778,alpha,"[0.2737482454332455, 0.09691677558712843, -0.0..."


## Basic dataset information and settings

In [9]:
# Settings shared by the classifier cells below.
label = 'foldType'     # column that holds the class to predict
testFraction = 0.1     # handed to SparkMultiClassClassifier together with the seed
seed = 123

# Every row carries one feature vector; its length is the feature count.
featureCount = len(data.first()["features"])
print(f"Feature count    : {featureCount}")

# Number of distinct values in the label column.
classCount = data.select(label).distinct().count()
print(f"Class count    : {classCount}")

# Class distribution before balancing.
print(f"Dataset size (unbalanced)    : {data.count()}")
data.groupby(label).count().show(classCount)

# Balance the classes by downsampling.
# NOTE(review): the third argument (1) is presumably the sampling seed -- confirm against mmtfPyspark.
data = datasetBalancer.downsample(data, label, 1)

# Class distribution after balancing.
print(f"Dataset size (balanced)  : {data.count()}")
data.groupby(label).count().show(classCount)

Feature count    : 50
Class count    : 2
Dataset size (unbalanced)    : 2584
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  660|
|   alpha| 1924|
+--------+-----+

Dataset size (balanced)  : 1323
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  660|
|   alpha|  663|
+--------+-----+



## Decision Tree Classifier

In [10]:
# Fit a decision tree with default parameters on the balanced dataset, then
# print every key/value pair returned by fit(), one per line.
dtc = DecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	615	48
beta	598	62

Sample predictions: DecisionTreeClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+-------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+-------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064|0.50537634| 0.4623656|    beta|[0.17156204006592...|         1.0|  [16.0,76.0]|[0.17391304347826...|       1.0|          beta|
|          5LS7.A|        0.0|      0.64|      0.36|    beta|[-0.6127254480496...|         1.0|  [16.0,76.0]|[0.17391304347826...|       1.0|          beta|
|          3SGR.C|        0.0| 0.8333333|0.16666667|    beta|[0.19314044434577...|         1.0|   [29.0,0.0]|           [1.0,0.

## Random Forest Classifier

In [11]:
# Fit a random forest with default parameters on the balanced dataset, then
# print every key/value pair returned by fit(), one per line.
rfc = RandomForestClassifier()
mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	615	48
beta	598	62

Sample predictions: RandomForestClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064|0.50537634| 0.4623656|    beta|[0.17156204006592...|         1.0|[6.67941349880415...|[0.33397067494020...|       1.0|          beta|
|          5LS7.A|        0.0|      0.64|      0.36|    beta|[-0.6127254480496...|         1.0|[12.5719984003061...|[0.62859992001530...|       0.0|         alpha|
|          3SGR.C|        0.0| 0.8333333|0.16666667|    beta|[0.19314044434577...|         1

## Logistic Regression Classifier

In [12]:
# Fit a logistic regression with default parameters on the balanced dataset,
# then print every key/value pair returned by fit(), one per line.
lr = LogisticRegression()
mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	615	48
beta	598	62

Sample predictions: LogisticRegression
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3KHQ.A|0.032258064|0.50537634| 0.4623656|    beta|[0.17156204006592...|         1.0|[-0.9318006643910...|[0.28255954157708...|       1.0|          beta|
|          5LS7.A|        0.0|      0.64|      0.36|    beta|[-0.6127254480496...|         1.0|[0.10762651598716...|[0.52688068637629...|       0.0|         alpha|
|          3SGR.C|        0.0| 0.8333333|0.16666667|    beta|[0.19314044434577...|         1.0|[

## Simple Multilayer Perceptron Classifier

In [13]:
# Network shape: one input node per feature, two hidden layers of 32 nodes,
# and one output node per class.
layers = [featureCount, 32, 32, classCount]
mpc = (
    MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234)
    .setMaxIter(100)
)

# Fit on the balanced dataset, then print every key/value pair returned by
# fit(), one per line.
mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	615	48
beta	598	62

Sample predictions: MultilayerPerceptronClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|          3KHQ.A|0.032258064|0.50537634| 0.4623656|    beta|[0.17156204006592...|         1.0|       1.0|          beta|
|          5LS7.A|        0.0|      0.64|      0.36|    beta|[-0.6127254480496...|         1.0|       0.0|         alpha|
|          3SGR.C|        0.0| 0.8333333|0.16666667|    beta|[0.19314044434577...|         1.0|       1.0|          beta|
|          3IT5.B|0.022099448| 0.3480663|0.62983423|    beta|[0.01785488943029...|         1.0|       1.0|          beta|
|          3PDG.A|        0.0| 0.5934066| 0.406593

## Terminate Spark

In [None]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()