# Solution 1
Apply your skills to classify protein fold types (the `foldType` column) with a Decision Tree Classifier.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit

## Configure Spark Session

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Solution-1")
    .getOrCreate()
)

## TODO-1: Read in data from parquet file

In [3]:
# Location of the Parquet input, relative to the notebook's working directory.
parquetFile = './input_features/'

# Read the Parquet data and mark the DataFrame as cached, since the
# following cells call count() and other actions on it repeatedly.
data = (
    spark.read
    .parquet(parquetFile)
    .cache()
)

## TODO-2: Select alpha, beta, alpha+beta fold types

In [4]:
# Keep only rows whose foldType is one of the three classes of interest.
# Each comparison is parenthesised because `|` binds tighter than `==`.
fold_col = data['foldType']
data = data.filter(
    (fold_col == 'alpha')
    | (fold_col == 'beta')
    | (fold_col == 'alpha+beta')
)
print(f"Total number of data: {data.count()}")

Total number of data: 14443


## TODO-3: Downsample data

In [5]:
# Name of the column holding the class to predict.
label = 'foldType'

# Balance the classes by downsampling.
# NOTE(review): the third argument (1) is presumably the random seed — confirm
# against the datasetBalancer.downsample signature.
data = datasetBalancer.downsample(data, label, 1)
balanced_size = data.count()
print(f"Dataset size (balanced)  : {balanced_size}")

# Show the number of rows per class after balancing.
class_counts = data.groupby(label).count()
class_counts.show()

Dataset size (balanced)  : 3777
+----------+-----+
|  foldType|count|
+----------+-----+
|alpha+beta| 1290|
|      beta| 1253|
|     alpha| 1234|
+----------+-----+



## TODO-4: Decision Tree Classifier with PySpark

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier

# Spark ML decision tree with its default hyper-parameters.
dtc = DecisionTreeClassifier()

# NOTE(review): 0.1 is passed positionally; presumably the test fraction
# (the sklearn cell passes testFraction=0.1 by keyword) — confirm.
mcc = SparkMultiClassClassifier(dtc, label, 0.1)

# fit() returns a mapping of metric name -> value; print one pair per line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha+beta	1150	140
beta	1128	125
alpha	1099	135

Sample predictions: DecisionTreeClassifier
+----------------+----------+----------+----------+----------+--------------------+------------+------------------+--------------------+----------+--------------+
|structureChainId|     alpha|      beta|      coil|  foldType|            features|indexedLabel|     rawPrediction|         probability|prediction|predictedLabel|
+----------------+----------+----------+----------+----------+--------------------+------------+------------------+--------------------+----------+--------------+
|          1P1M.A|0.36386138|0.23514852| 0.4009901|alpha+beta|[-0.0362324150647...|         0.0| [166.0,15.0,52.0]|[0.71244635193133...|       0.0|    alpha+beta|
|          2HJ1.B|      0.15|      0.35|       0.5|alpha+beta|[0.10243093885947...|         0.0| [113.0,36.0,43.0]|[0.58854166666666...|       0.0|    alpha+beta|
|          2XED.B| 0.4512195|  0.199187| 0.3495935|alpha+beta|[0.11892006

## BONUS: Decision Tree Classifier with sklearn

In [7]:
# Imported under an alias so that the Spark ML DecisionTreeClassifier imported
# earlier in this notebook is not silently replaced by the sklearn class
# (which would break the Spark cell if cells are re-run out of order).
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier

# Convert the Spark DataFrame to a pandas DataFrame for mltoolkit/sklearn.
df = data.toPandas()

# Fix random_state so the fitted tree is reproducible: sklearn randomly
# permutes the features at each split, so an unseeded tree can differ
# between runs.
# NOTE(review): whether mltoolkit's train/test split is itself seeded is not
# visible here — confirm if fully reproducible metrics are required.
dtc = SklearnDecisionTreeClassifier(random_state=1)
mcc = mltoolkit.MultiClassClassifier(dtc, 'foldType', testFraction=0.1)

# fit() returns a mapping of metric name -> value; print one pair per line.
metrics = mcc.fit(df)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test

alpha+beta	1174	116

beta	1131	122

alpha	1094	140

Total time taken: 0.4359261989593506

Methods	DecisionTreeClassifier
F Score	0.7575189308337245
Accuracy	0.7566137566137566
Precision	0.7588633356499631
Recall	0.7566137566137566
False Positive Rate	0.12142492970470631
True Positive Rate	0.7545775121806778
	
Confusion Matrix
['alpha+beta' 'beta' 'alpha']
[[ 81  14  21]
 [ 18  96   8]
 [ 23   8 109]]


In [8]:
# Stop the Spark session created in the "Configure Spark Session" cell.
spark.stop()