# Solution 1
Apply your skills to classify the protein `foldType` with a Decision Tree Classifier.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit

## Configure Spark Session

In [2]:
# Build (or reuse) a Spark session running on a local master with 4 threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("datasetClassifierProblemset")
    .getOrCreate()
)

## TODO-1: Read in data from parquet file

In [3]:
# Location of the Parquet input read by this notebook.
# NOTE(review): 'intput' looks like a typo for 'input' -- confirm against the
# directory name on disk before changing it.
parquetFile = './intput_features/'

# Load the Parquet data and mark the resulting DataFrame for caching, since
# the cells below run several actions (count, groupby, fit) on it.
data = (
    spark.read
    .parquet(parquetFile)
    .cache()
)

## TODO-2: Select the alpha, beta, and alpha+beta fold types

In [4]:
# Keep only the rows whose foldType is one of the three classes of interest.
selected_fold_types = ['alpha', 'beta', 'alpha+beta']
data = data.where(data.foldType.isin(selected_fold_types))

print(f"Total number of data: {data.count()}")

Total number of data: 14443


## TODO-3: Downsample data

In [5]:
# Column that holds the class to predict; reused by the classifier cells below.
label = 'foldType'

# Balance the dataset by downsampling on the label column.
# NOTE(review): the third argument (1) is presumably the random seed --
# confirm against datasetBalancer.downsample.
data = datasetBalancer.downsample(data, label, 1)

balanced_size = data.count()
print(f"Dataset size (balanced)  : {balanced_size}")

# Show the number of rows per class after balancing.
data.groupby(label).count().show()

Dataset size (balanced)  : 3777
+----------+-----+
|  foldType|count|
+----------+-----+
|alpha+beta| 1290|
|      beta| 1253|
|     alpha| 1234|
+----------+-----+



## TODO-4: Decision Tree Classifier with PySpark

In [6]:
# Imported in this cell on purpose: the sklearn cell further down imports a
# class with the same name, so re-importing here keeps this cell correct even
# when it is re-run after that one.
from pyspark.ml.classification import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

# Fit and evaluate the Spark decision tree on `data`, predicting `label`.
# NOTE(review): 0.1 is presumably the test fraction (the sklearn cell passes
# testFraction=0.1 to its counterpart) -- confirm against
# SparkMultiClassClassifier.
mcc = SparkMultiClassClassifier(dtc, label, 0.1)
metrics = mcc.fit(data)

# Print each reported metric as "<name>\t<value>".
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha+beta	1150	140
beta	1128	125
alpha	1099	135

Sample predictions: DecisionTreeClassifier
+----------------+----------+----------+----------+----------+--------------------+------------+------------------+--------------------+----------+--------------+
|structureChainId|     alpha|      beta|      coil|  foldType|            features|indexedLabel|     rawPrediction|         probability|prediction|predictedLabel|
+----------------+----------+----------+----------+----------+--------------------+------------+------------------+--------------------+----------+--------------+
|          1P1M.A|0.36386138|0.23514852| 0.4009901|alpha+beta|[-0.2208177057692...|         0.0|[342.0,56.0,105.0]|[0.67992047713717...|       0.0|    alpha+beta|
|          2HJ1.B|      0.15|      0.35|       0.5|alpha+beta|[-0.1719372452062...|         0.0|[342.0,56.0,105.0]|[0.67992047713717...|       0.0|    alpha+beta|
|          2XED.B| 0.4512195|  0.199187| 0.3495935|alpha+beta|[-0.2095412

## BONUS: Decision Tree Classifier with sklearn

In [7]:
# Imported in this cell on purpose: this name shadows the PySpark class of the
# same name imported by the previous classifier cell.
from sklearn.tree import DecisionTreeClassifier

# Convert the Spark DataFrame to a pandas DataFrame for use with sklearn.
df = data.toPandas()

# Fix the estimator's seed so the fitted tree is reproducible across runs;
# without it sklearn draws a fresh random state on every fit.
# NOTE(review): the train/test split done inside mltoolkit may still vary
# between runs -- confirm whether MultiClassClassifier exposes its own seed.
dtc = DecisionTreeClassifier(random_state=1)
mcc = mltoolkit.MultiClassClassifier(dtc, 'foldType', testFraction=0.1)
metrics = mcc.fit(df)

# Print each reported metric as "<name>\t<value>".
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test

alpha+beta	1152	138

beta	1128	125

alpha	1119	115

Total time taken: 0.3240659236907959

Methods	DecisionTreeClassifier
F Score	0.73018623821153
Accuracy	0.7328042328042328
Precision	0.7316380681625885
Recall	0.7328042328042328
False Positive Rate	0.13359262153690984
True Positive Rate	0.737487922705314
	
Confusion Matrix
['alpha+beta' 'beta' 'alpha']
[[ 85  23  30]
 [ 12 105   8]
 [ 22   6  87]]


In [8]:
# Stop the Spark session now that the analysis is finished.
spark.stop()