# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from a Parquet file and must contain a feature vector column named "features" and a classification (label) column.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassifier

## Configure Spark Session

In [2]:
# Create (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [3]:
# NOTE(review): the directory name is spelled 'intput_features' — confirm it
# matches the folder on disk before "fixing" the spelling.
parquetFile = './intput_features/'

# Cache the dataset because it is counted and previewed several times below.
data = spark.read.parquet(parquetFile).cache()

print(f"Total number of data: {data.count()}")

# Preview only the first rows. Applying limit() on the Spark side avoids
# collecting the entire dataset to the driver just to display five records.
data.limit(5).toPandas()

Total number of data: 9567


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RXQ.D,0.662722,0.065089,0.272189,other,"[0.5052433708514281, 0.39907651862371607, -0.3..."
1,1RYL.B,0.550336,0.154362,0.295302,alpha+beta,"[0.2862339211538075, 0.3137033899168832, -0.03..."
2,1RYO.A,0.367284,0.212963,0.419753,alpha+beta,"[0.29367011161578216, 0.35279737824311286, -0...."
3,1RZ3.A,0.423913,0.228261,0.347826,alpha+beta,"[0.271732849823311, 0.30806871705222877, -0.09..."
4,1RZ4.A,0.676056,0.028169,0.295775,alpha,"[0.3562608102709055, 0.5141948204528954, -0.28..."


## Select only alpha and beta foldType

In [4]:
# Keep only the records labelled 'alpha' or 'beta' (binary classification).
# Add 'other' to the list below to turn this into a three-class problem.
data = data.where(data.foldType.isin('alpha', 'beta'))

print(f"Total number of data: {data.count()}")

# Preview only the first rows. Applying limit() on the Spark side avoids
# collecting the entire filtered dataset to the driver for a five-row display.
data.limit(5).toPandas()

Total number of data: 2390


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RZ4.A,0.676056,0.028169,0.295775,alpha,"[0.3562608102709055, 0.5141948204528954, -0.28..."
1,1RZH.L,0.619217,0.035587,0.345196,alpha,"[0.12472582102221037, 0.08776423343058143, -0...."
2,1RZH.M,0.634551,0.039867,0.325581,alpha,"[0.1600593357754786, 0.1681052131085273, -0.18..."
3,1S0P.B,0.755682,0.011364,0.232955,alpha,"[0.3622322499432734, 0.47637778217771226, -0.1..."
4,1S2X.A,0.772222,0.0,0.227778,alpha,"[0.32439254217031527, 0.4800257507938801, 0.05..."


## Basic dataset information and setting

In [5]:
# Settings shared by all classifiers below.
label = 'foldType'      # name of the classification column
testFraction = 0.1      # passed to SparkMultiClassClassifier as the test fraction
seed = 123              # passed to SparkMultiClassClassifier as the seed

# Length of the feature vector, read from the first record.
featureCount = len(data.first()["features"])
print(f"Feature count    : {featureCount}")

# Number of distinct values in the label column.
classCount = data.select(label).distinct().count()
print(f"Class count    : {classCount}")

# Class distribution before balancing.
print(f"Dataset size (unbalanced)    : {data.count()}")
data.groupby(label).count().show()

# Balance the classes by downsampling.
# NOTE(review): the literal 1 is presumably the sampling seed and differs from
# `seed` above — confirm against datasetBalancer.downsample.
data = datasetBalancer.downsample(data, label, 1)

# Class distribution after balancing.
print(f"Dataset size (balanced)  : {data.count()}")
data.groupby(label).count().show()

Feature count    : 50
Class count    : 2
Dataset size (unbalanced)    : 2390
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  626|
|   alpha| 1764|
+--------+-----+

Dataset size (balanced)  : 1266
+--------+-----+
|foldType|count|
+--------+-----+
|    beta|  626|
|   alpha|  640|
+--------+-----+



## Random Forest Classifier

In [6]:
# Fit a random forest with default hyperparameters and print every metric
# returned by the classifier wrapper as a tab-separated "name<TAB>value" line.
rfc = RandomForestClassifier()
mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	573	67
beta	567	59

Sample predictions: RandomForestClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3HOL.A| 0.03168317| 0.4970297|0.47128713|    beta|[0.52168207759712...|         1.0|[1.64161633738838...|[0.08208081686941...|       1.0|          beta|
|          4NBX.A|        0.0|0.48951048| 0.5104895|    beta|[0.52165241501438...|         1.0|[3.39452362189721...|[0.16972618109486...|       1.0|          beta|
|          5F6L.B| 0.02247191|0.46629214|0.51123595|    beta|[0.38922849420682...|         1

## Logistic Regression Classifier

In [7]:
# Fit a logistic regression with default hyperparameters and print every metric
# returned by the classifier wrapper as a tab-separated "name<TAB>value" line.
lr = LogisticRegression()
mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	573	67
beta	567	59

Sample predictions: LogisticRegression
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3HOL.A| 0.03168317| 0.4970297|0.47128713|    beta|[0.52168207759712...|         1.0|[-2.7278323810402...|[0.06135087074178...|       1.0|          beta|
|          4NBX.A|        0.0|0.48951048| 0.5104895|    beta|[0.52165241501438...|         1.0|[-6.8137665640123...|[0.00109734186670...|       1.0|          beta|
|          5F6L.B| 0.02247191|0.46629214|0.51123595|    beta|[0.38922849420682...|         1.0|[

## Simple Multilayer Perceptron Classifier

In [8]:
# Network topology: one input node per feature, two hidden layers of 32 nodes,
# and one output node per class.
layers = [featureCount, 32, 32, classCount]

# Same configuration as the chained setters, given as constructor keywords.
mpc = MultilayerPerceptronClassifier(
    layers=layers,
    blockSize=128,
    seed=1234,
    maxIter=100,
)

# Fit the network and print every metric returned by the classifier wrapper
# as a tab-separated "name<TAB>value" line.
mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha	573	67
beta	567	59

Sample predictions: MultilayerPerceptronClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|          3HOL.A| 0.03168317| 0.4970297|0.47128713|    beta|[0.52168207759712...|         1.0|       1.0|          beta|
|          4NBX.A|        0.0|0.48951048| 0.5104895|    beta|[0.52165241501438...|         1.0|       1.0|          beta|
|          5F6L.B| 0.02247191|0.46629214|0.51123595|    beta|[0.38922849420682...|         1.0|       1.0|          beta|
|          5E9P.A|0.034883723|0.44186047| 0.5232558|    beta|[0.24919270079802...|         1.0|       1.0|          beta|
|          5SV5.A|        0.0| 0.5904762|0.4095238

## Terminate Spark

In [9]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` or `data` cannot be re-run afterwards without recreating the session.
spark.stop()