# Solution 1
Apply your skills to classify protein fold types (the `foldType` column) with a Decision Tree Classifier.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit

## Configure Spark Session

In [2]:
# Build (or reuse, if one already exists) a Spark session that runs locally;
# "local[*]" asks Spark to use as many worker threads as there are cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifierProblemset")
    .getOrCreate()
)

## TODO-1: Read in data from parquet file

In [3]:
# Path of the parquet dataset to load (relative to the notebook's working directory).
parquetFile = './intput_features/'

# Read the dataset into a Spark DataFrame and cache it, since the following
# cells run several actions (count, groupBy, fit) on data derived from it.
data = (
    spark.read
    .parquet(parquetFile)
    .cache()
)

## TODO-2: Select alpha, beta, alpha+beta foldtypes

In [5]:
# Keep only rows whose foldType is one of the three classes of interest.
# A membership test is equivalent to OR-ing the three equality comparisons.
data = data.filter(data.foldType.isin('alpha', 'beta', 'alpha+beta'))

# Report how many rows survive the filter.
print(f"Total number of data: {data.count()}")

Total number of data: 8514


## TODO-3: Downsample data

In [6]:
# Name of the column holding the class to predict.
label = 'foldType'

# Balance the classes by downsampling.
# NOTE(review): the third argument (1) is presumably the random seed used for
# sampling -- confirm against the mmtfPyspark datasetBalancer documentation.
data = datasetBalancer.downsample(data, label, 1)

# Overall size of the balanced dataset ...
print(f"Dataset size (balanced)  : {data.count()}")

# ... and the number of rows per class.
data.groupBy(label).count().show()

Dataset size (balanced)  : 1910
+----------+-----+
|  foldType|count|
+----------+-----+
|alpha+beta|  644|
|      beta|  626|
|     alpha|  640|
+----------+-----+



## TODO-4: Decision Tree Classifier with PySpark

In [7]:
from pyspark.ml.classification import DecisionTreeClassifier

# Wrap a default Spark ML decision tree in the mmtfPyspark multi-class helper.
# NOTE(review): 0.1 is presumably the test fraction (the sklearn cell passes
# testFraction=0.1 by keyword) -- confirm against SparkMultiClassClassifier.
dtc = DecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label, 0.1)

# fit() returns a mapping of metric name -> value.
metrics = mcc.fit(data)

# Print one "name<TAB>value" line per metric.
for k, v in metrics.items():
    print(f"{k}\t{v}")


 Class	Train	Test
alpha+beta	573	71
alpha	583	57
beta	550	76

Sample predictions: DecisionTreeClassifier
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|structureChainId|     alpha|      beta|      coil|  foldType|            features|indexedLabel|    rawPrediction|         probability|prediction|predictedLabel|
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|          2C0C.A|0.33994335|0.26062322|0.39943343|alpha+beta|[0.06918420773863...|         0.0|[220.0,66.0,64.0]|[0.62857142857142...|       0.0|    alpha+beta|
|          4AR9.A|       0.5|0.16836734|0.33163264|alpha+beta|[0.42860078792450...|         0.0|    [0.0,2.0,3.0]|       [0.0,0.4,0.6]|       2.0|          beta|
|          4YQD.A| 0.4262295|0.14754099| 0.4262295|alpha+beta|[0.02085487933649...| 

## BONUS: Decision Tree Classifier with sklearn

In [8]:
# NOTE: this rebinds the name DecisionTreeClassifier to the scikit-learn class,
# replacing the pyspark.ml class imported by the Spark cell earlier.
from sklearn.tree import DecisionTreeClassifier

# Convert the Spark DataFrame to pandas before handing it to the
# mltoolkit (scikit-learn based) helper.
df = data.toPandas()

# Seed the tree: scikit-learn randomly permutes the features at every split,
# so without a fixed random_state the fitted tree (and therefore the metrics
# printed below) can change from one run to the next.
# NOTE(review): mltoolkit.MultiClassClassifier may also split train/test
# randomly; check whether it exposes its own seed for full reproducibility.
dtc = DecisionTreeClassifier(random_state=1)
mcc = mltoolkit.MultiClassClassifier(dtc, 'foldType', testFraction=0.1)

# fit() returns a mapping of metric name -> value.
metrics = mcc.fit(df)

# Print one "name<TAB>value" line per metric.
for k, v in metrics.items():
    print(f"{k}\t{v}")


 Class	Train	Test

alpha+beta	578	66

beta	563	63

alpha	578	62

Total time taken: 0.11974024772644043

Methods	DecisionTreeClassifier
F Score	0.5818890986967503
Accuracy	0.581151832460733
Precision	0.5827860505378417
Recall	0.581151832460733
False Positive Rate	0.20999596253229977
True Positive Rate	0.5827631150211795
	
Confusion Matrix
['alpha+beta' 'beta' 'alpha']
[[32 14 20]
 [15 42  6]
 [20  5 37]]


In [9]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()