# Solution 1
Apply your skills to classify protein foldType with a Decision Tree Classifier.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit

## Configure Spark Session

In [2]:
# Create (or reuse) a Spark session for this notebook. The "local[*]" master
# URL runs Spark locally rather than on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifierProblemset")
    .getOrCreate()
)

## TODO-1: Read in data from parquet file

In [3]:
# Path handed to the parquet reader.
# NOTE(review): the folder name is spelled "intput_features" — confirm it
# matches the directory on disk before changing it.
parquetFile = './intput_features/'

# Read the parquet data and request caching; the resulting DataFrame is
# reused by the following cells.
data = (
    spark.read
    .parquet(parquetFile)
    .cache()
)

## TODO-2: Select the alpha, beta, and alpha+beta foldTypes

In [4]:
# Keep only the rows whose foldType is one of the three classes of interest.
selectedFoldTypes = ['alpha', 'beta', 'alpha+beta']
data = data.where(data.foldType.isin(selectedFoldTypes))

# Report how many rows survive the filter.
print(f"Total number of data: {data.count()}")

Total number of data: 7442


## TODO-3: Downsample data

In [5]:
# Name of the column that holds the class label.
label = 'foldType'

# Balance the classes by downsampling. The third positional argument (1) is
# passed through unchanged — presumably the random seed; confirm against
# mmtfPyspark's datasetBalancer.
data = datasetBalancer.downsample(data, label, 1)
print(f"Dataset size (balanced)  : {data.count()}")

# Show the number of rows per class after balancing.
classCounts = data.groupBy(label).count()
classCounts.show()

Dataset size (balanced)  : 1902
+----------+-----+
|  foldType|count|
+----------+-----+
|alpha+beta|  636|
|      beta|  626|
|     alpha|  640|
+----------+-----+



## TODO-4: Decision Tree Classifier with PySpark

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier

# Wrap a default Spark decision tree in the multi-class helper. The third
# positional argument (0.1) is presumably the test fraction — the sklearn
# cell passes testFraction=0.1 by keyword; confirm against mmtfPyspark.
dtc = DecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label, 0.1)

# Fit on the balanced data, then print each returned metric as "name<TAB>value".
matrics = mcc.fit(data)
for metricName, metricValue in matrics.items():
    print(f"{metricName}\t{metricValue}")


 Class	Train	Test
alpha	583	57
alpha+beta	566	70
beta	550	76

Sample predictions: DecisionTreeClassifier
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|structureChainId|     alpha|      beta|      coil|  foldType|            features|indexedLabel|    rawPrediction|         probability|prediction|predictedLabel|
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|          2B0T.A|0.47891158|0.16734694| 0.3537415|alpha+beta|[0.41829884112932...|         1.0|  [80.0,42.0,8.0]|[0.61538461538461...|       0.0|         alpha|
|          4KMR.B| 0.4082397|0.17602997|0.41573033|alpha+beta|[0.40662341538856...|         1.0|    [9.0,1.0,2.0]|[0.75,0.083333333...|       0.0|         alpha|
|          5F9P.B|0.34177214|0.37130803|0.28691983|alpha+beta|[0.46898571924927...| 

## BONUS: Decision Tree Classifier with sklearn

In [7]:
# NOTE: this import rebinds the name DecisionTreeClassifier, which the
# PySpark cell bound to pyspark.ml.classification.DecisionTreeClassifier.
from sklearn.tree import DecisionTreeClassifier

# Convert the Spark DataFrame to a pandas DataFrame for the sklearn-based helper.
df = data.toPandas()

# Fix random_state so the fitted tree is reproducible: sklearn randomly
# permutes the features at each split (even with splitter="best"), so an
# unseeded tree can differ from run to run.
dtc = DecisionTreeClassifier(random_state=1)
mcc = mltoolkit.MultiClassClassifier(dtc, 'foldType', testFraction=0.1)

# Fit and print each returned metric as "name<TAB>value".
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")


 Class	Train	Test

alpha+beta	565	71

beta	571	55

alpha	575	65

Total time taken: 0.13357996940612793

Methods	DecisionTreeClassifier
F Score	0.6127026401916457
Accuracy	0.612565445026178
Precision	0.6129813662637034
Recall	0.612565445026178
False Positive Rate	0.19603952692187987
True Positive Rate	0.6139991463935126
	
Confusion Matrix
['alpha+beta' 'beta' 'alpha']
[[43 13 15]
 [10 35 10]
 [20  6 39]]


In [8]:
# Stop the Spark session created at the top of the notebook.
spark.stop()