# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from a Parquet file. It must contain a feature vector column named "features" and a classification column.

## Imports

In [1]:
import pandas as pd
import mltoolkit
from pyspark.sql import SparkSession
import numpy as np
import sklearn

## Configure Spark Session

In [2]:
# Obtain the SparkSession that the notebook uses to read the Parquet dataset.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [3]:
# Location of the Parquet dataset, relative to the notebook's working directory.
parquetFile = './features/'

# Read the dataset with Spark and cache it, then convert it to a pandas frame.
data = spark.read.parquet(parquetFile)
data = data.cache()
df = data.toPandas()

print(f"Total number of data: {df.shape[0]}")
df.head()

Total number of data: 10660


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1RXQ.D,0.662722,0.065089,0.272189,alpha+beta,"[0.03355525744721125, 0.2672737870460092, 0.44..."
1,1RYL.B,0.550336,0.154362,0.295302,alpha+beta,"[-0.03064694956326521, -0.019814661751692015, ..."
2,1RYO.A,0.367284,0.212963,0.419753,alpha+beta,"[0.13665451809901036, 0.06933704541390476, 0.3..."
3,1RYP.E,0.35124,0.260331,0.38843,alpha+beta,"[-0.19166820620754696, -0.035750856561223, 0.3..."
4,1RYP.N,0.321888,0.313305,0.364807,alpha+beta,"[0.12672561553009817, 0.018070112488328897, 0...."


## Select only alpha and beta foldType

In [4]:
# Keep only the rows whose foldType is exactly 'alpha' or 'beta'.
foldTypes = df['foldType']
df = df.loc[(foldTypes == 'alpha') | (foldTypes == 'beta')]

print(f"Total number of data: {df.shape[0]}")
df.head()

Total number of data: 2584


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
7,1RZ4.A,0.676056,0.028169,0.295775,alpha,"[-0.08727558182345496, 0.07827300824017988, 0...."
8,1RZH.L,0.619217,0.035587,0.345196,alpha,"[-0.034122437554677684, -0.08903528297419792, ..."
9,1RZH.M,0.634551,0.039867,0.325581,alpha,"[-0.141879534431021, 0.019542623172800234, 0.5..."
11,1S0P.B,0.755682,0.011364,0.232955,alpha,"[0.31434613252324717, 0.04741660307294556, 0.3..."
16,1S2X.A,0.772222,0.0,0.227778,alpha,"[0.2737482454332455, 0.09691677558712843, -0.0..."


## Convert features from strings to arrays of floats

In [5]:
# Convert every feature vector into a NumPy array of 64-bit floats.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the explicit dtype is used instead.
# `assign` builds a new frame rather than writing into the filtered slice of
# `df`, which avoids pandas' SettingWithCopyWarning.
df = df.assign(
    features=df.features.apply(lambda x: np.array(list(x), dtype=np.float64))
)

## Basic dataset information and setting

In [6]:
# Experiment settings: the classification column, the test fraction and the
# random seed.
label = 'foldType'
testFraction = 0.1
seed = 123

# Feature-vector length, taken from the first row.
# NOTE(review): assumes every row's vector has the same length -- confirm.
vector = df.features.iloc[0]
featureCount = len(vector)
print(f"Feature count    : {featureCount}")

# Number of distinct class labels; `label` is used instead of repeating the
# column name so the cell follows the setting above.
classCount = df[label].unique().size
print(f"Class count    : {classCount}\n")

print(f"Dataset size (unbalanced)    : {df.shape[0]}")
print(df[label].value_counts())

# Balance the classes; the counts printed below show the resulting class sizes.
# NOTE(review): whether `mltoolkit.downsample` samples reproducibly is not
# visible here -- confirm, and pass `seed` if it accepts one.
df = mltoolkit.downsample(df, label)
print(f"\nDataset size (balanced)  : {df.shape[0]}")
print(df[label].value_counts())

Feature count    : 50
Class count    : 2

Dataset size (unbalanced)    : 2584
alpha    1924
beta      660
Name: foldType, dtype: int64

Dataset size (balanced)  : 1320
beta     660
alpha    660
Name: foldType, dtype: int64


## Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator with the notebook-level `seed` (previously declared but
# never used) so the tree's own randomness repeats across runs.
clf = DecisionTreeClassifier(random_state=seed)

# Reuse `label` and `testFraction` from the settings cell instead of repeating
# the literals 'foldType' and 0.1.
# NOTE(review): the train/test split happens inside mltoolkit; if
# MultiClassClassifier accepts a seed, pass `seed` too for full reproducibility.
mcc = mltoolkit.MultiClassClassifier(clf, label, testFraction=testFraction)

# `fit` returns a mapping of metric name -> value; print one metric per line.
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")

print("Sample predictions: ")
mcc.prediction.head(10)


 Class	Train	Test

alpha	590	70

beta	598	62
Sample predictions: 

Total time taken: 0.07261228561401367
Methods	DecisionTreeClassifier
AUC	0.752995391705069
F Score	0.7241379310344828
Accuracy	0.7575757575757576
Precision	0.7777777777777778
Recall	0.6774193548387096
False Positive Rate	0.2222222222222222
True Positive Rate	0.7435897435897436
	
Confusion Matrix
['alpha' 'beta']
[[58 12]
 [20 42]]
Sample predictions: 


Unnamed: 0,index,structureChainId,alpha,beta,coil,foldType,features,indexedLabel,predictions
0,1524,3VJF.A,0.949495,0.0,0.050505,alpha,"[0.0035189819881821624, 0.06227715010524239, 0...",0,alpha
1,9972,4ZP0.A,0.844388,0.0,0.155612,alpha,"[-0.24062376186404083, -0.20206979960155536, 0...",0,alpha
2,1607,1ZB1.B,0.689373,0.024523,0.286104,alpha,"[0.25934675793923306, 0.20951197242788266, 0.1...",0,alpha
3,4437,3O0L.A,0.027523,0.605505,0.366972,beta,"[0.11011362290597176, 0.08628048640382183, 0.4...",1,beta
4,7374,1V8H.B,0.0,0.528302,0.471698,beta,"[0.2136856085524174, -0.15646473964992558, 0.5...",1,alpha
5,3132,5INB.B,0.0,0.181818,0.818182,beta,"[0.37228168236712617, -0.008470005459255643, 0...",1,alpha
6,3714,2O62.A,0.04461,0.591078,0.364312,beta,"[0.040869305206317326, 0.14896627264547413, 0....",1,beta
7,8280,2ZS0.C,0.741497,0.0,0.258503,alpha,"[-0.3354835135205202, 0.06408879712019881, 0.4...",0,alpha
8,1441,1NC7.D,0.025641,0.641026,0.333333,beta,"[-0.024241065375343724, 0.2986784902720773, 0....",1,beta
9,124,4ZBW.A,0.63388,0.021858,0.344262,alpha,"[0.1162826621599456, -0.04502631703431092, -0....",0,alpha


## Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Seed the estimator with the notebook-level `seed` (previously declared but
# never used) so the forest's own randomness repeats across runs.
clf = RandomForestClassifier(random_state=seed)

# Reuse `label` and `testFraction` from the settings cell instead of repeating
# the literals 'foldType' and 0.1.
# NOTE(review): the train/test split happens inside mltoolkit; if
# MultiClassClassifier accepts a seed, pass `seed` too for full reproducibility.
mcc = mltoolkit.MultiClassClassifier(clf, label, testFraction=testFraction)

# `fit` returns a mapping of metric name -> value; print one metric per line.
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")

print("Sample predictions: ")
mcc.prediction.head(10)


 Class	Train	Test

alpha	599	61

beta	589	71
Sample predictions: 

Total time taken: 0.07662343978881836
Methods	RandomForestClassifier
AUC	0.9258831678596167
F Score	0.861111111111111
Accuracy	0.8484848484848485
Precision	0.8493150684931506
Recall	0.8732394366197183
False Positive Rate	0.1506849315068493
True Positive Rate	0.847457627118644
	
Confusion Matrix
['alpha' 'beta']
[[50 11]
 [ 9 62]]
Sample predictions: 


Unnamed: 0,index,structureChainId,alpha,beta,coil,foldType,features,indexedLabel,predictions
0,4873,2WJR.A,0.0,0.70098,0.29902,beta,"[0.27214271129664264, 0.28306340715659895, 0.3...",1,beta
1,6768,1XL3.D,0.764706,0.0,0.235294,alpha,"[-0.06242364997064675, -0.11843158800543829, 0...",0,alpha
2,3716,2O8Q.A,0.040984,0.459016,0.5,beta,"[-0.02945921586868458, 0.0074145054878821046, ...",1,beta
3,4391,5EZU.B,0.0,0.686567,0.313433,beta,"[0.293177553685382, 0.21745463308285584, 0.147...",1,alpha
4,120,4Z9H.A,0.793939,0.0,0.206061,alpha,"[-0.06895020773204473, 0.42217526158604485, 0....",0,alpha
5,8055,4H5S.B,0.03,0.52,0.45,beta,"[0.48956452548503876, 0.44718771868385376, 0.0...",1,beta
6,7183,1X3K.A,0.756579,0.0,0.243421,alpha,"[0.34279883324803895, 0.04847837429468995, 0.3...",0,beta
7,10503,5AYQ.B,0.024793,0.545455,0.429752,beta,"[0.350287054033175, 0.3823989737493387, -0.019...",1,beta
8,5601,3ZDO.G,0.953125,0.0,0.046875,alpha,"[-0.24938593470457807, 0.1764082081885223, 0.2...",0,alpha
9,8944,5YAY.A,0.586066,0.008197,0.405738,alpha,"[-0.17084786460691248, -0.11424235563572438, 0...",0,alpha


## Logistic Regression Classifier

In [12]:
from sklearn.linear_model import LogisticRegression

# Pass the notebook-level `seed` (previously declared but never used) for
# consistency with the other classifier cells; scikit-learn only consults it
# for solvers that shuffle the data.
clf = LogisticRegression(random_state=seed)

# Reuse `label` and `testFraction` from the settings cell instead of repeating
# the literals 'foldType' and 0.1.
# NOTE(review): the train/test split happens inside mltoolkit; if
# MultiClassClassifier accepts a seed, pass `seed` too for full reproducibility.
mcc = mltoolkit.MultiClassClassifier(clf, label, testFraction=testFraction)

# `fit` returns a mapping of metric name -> value; print one metric per line.
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")

print("Sample predictions: ")
mcc.prediction.head(10)


 Class	Train	Test

alpha	597	63

beta	591	69
Sample predictions: 

Total time taken: 0.0392000675201416
Methods	LogisticRegression
AUC	0.9726247987117552
F Score	0.9428571428571428
Accuracy	0.9393939393939394
Precision	0.9295774647887324
Recall	0.9565217391304348
False Positive Rate	0.07042253521126761
True Positive Rate	0.9508196721311475
	
Confusion Matrix
['alpha' 'beta']
[[58  5]
 [ 3 66]]
Sample predictions: 


Unnamed: 0,index,structureChainId,alpha,beta,coil,foldType,features,indexedLabel,predictions
0,1476,1RFY.B,0.727273,0.0,0.272727,alpha,"[-0.04950529289441091, -0.1458300913656407, 0....",0,alpha
1,6689,2J73.B,0.0,0.578431,0.421569,beta,"[0.30379150622506057, -0.1085839520080709, 0.5...",1,beta
2,6376,4H4N.A,0.0,0.564516,0.435484,beta,"[0.720117015030348, 0.22107654584176614, 0.055...",1,beta
3,6420,4IL6.F,0.676471,0.0,0.323529,alpha,"[0.18515646096431848, -0.11571282509601477, 0....",0,alpha
4,1690,2DPF.D,0.0,0.441441,0.558559,beta,"[-0.07219405693671943, 0.21463602097508938, 0....",1,beta
5,7964,3NDZ.F,0.0,0.607477,0.392523,beta,"[0.21551894579293593, 0.249717125141958, 0.343...",1,beta
6,6361,4GEI.A,0.0,0.611111,0.388889,beta,"[0.27206953736209627, 0.049039513066461024, 0....",1,beta
7,2747,5F6L.B,0.022472,0.466292,0.511236,beta,"[0.2569615783302185, 0.0992076373099263, 0.393...",1,beta
8,9038,4YWA.D,0.024793,0.487603,0.487603,beta,"[-0.01494521099763612, -0.09798166562492648, 0...",1,beta
9,3445,1GWM.A,0.039216,0.503268,0.457516,beta,"[0.04370024243957902, 0.18374134100189332, 0.3...",1,beta


## Simple Multilayer Perceptron Classifier

In [19]:
from sklearn.neural_network import MLPClassifier

# Network shape: input width, two hidden layers of 32 units, output width.
layers = [featureCount, 32, 32, classCount]

# The hidden-layer sizes are taken from `layers` (everything between the input
# and output widths) so the shape is stated once; this evaluates to (32, 32).
# The notebook-level `seed` (previously declared but never used) is passed so
# the network's own randomness repeats across runs.
clf = MLPClassifier(
    solver='sgd',
    alpha=1e-5,
    hidden_layer_sizes=tuple(layers[1:-1]),
    random_state=seed,
)

# Reuse `label` and `testFraction` from the settings cell instead of repeating
# the literals 'foldType' and 0.1.
# NOTE(review): the train/test split happens inside mltoolkit; if
# MultiClassClassifier accepts a seed, pass `seed` too for full reproducibility.
mcc = mltoolkit.MultiClassClassifier(clf, label, testFraction=testFraction)

# `fit` returns a mapping of metric name -> value; print one metric per line.
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")

print("Sample predictions: ")
mcc.prediction.head(10)


 Class	Train	Test

alpha	589	71

beta	599	61
Sample predictions: 

Total time taken: 1.3699915409088135
Methods	MLPClassifier
AUC	0.9254213807434772
F Score	0.8527131782945736
Accuracy	0.8560606060606061
Precision	0.8088235294117647
Recall	0.9016393442622951
False Positive Rate	0.19117647058823528
True Positive Rate	0.90625
	
Confusion Matrix
['alpha' 'beta']
[[58 13]
 [ 6 55]]
Sample predictions: 




Unnamed: 0,index,structureChainId,alpha,beta,coil,foldType,features,indexedLabel,predictions
0,10308,4N7W.A,0.840391,0.006515,0.153094,alpha,"[0.02174834064603631, -0.06910717640425158, 0....",0,beta
1,3953,4G5A.B,0.040404,0.535354,0.424242,beta,"[0.4698894739017955, 0.03291584538029772, 0.39...",1,beta
2,2777,2HYJ.A,0.777202,0.0,0.222798,alpha,"[-0.39445917652545487, -0.11504761239597995, 0...",0,alpha
3,951,5NUV.A,0.0,0.54485,0.45515,beta,"[0.015260229783749524, 0.205864536107288, 0.36...",1,beta
4,2918,4Y9V.A,0.043118,0.46932,0.487562,beta,"[0.23106373748100267, 0.14468181522739215, 0.2...",1,beta
5,8684,4BFO.A,0.028302,0.537736,0.433962,beta,"[0.4678057109848375, 0.2935785559139082, 0.062...",1,beta
6,2554,3C9P.A,0.631148,0.04918,0.319672,alpha,"[0.06929258233104207, -0.15727230191963618, 0....",0,alpha
7,8314,3A5P.A,0.029126,0.563107,0.407767,beta,"[0.025798276301871224, 0.05934039329730191, 0....",1,beta
8,9080,5UOU.A,0.653333,0.013333,0.333333,alpha,"[-0.5292880434571117, -0.06241236063228412, 0....",0,alpha
9,2919,5T77.A,0.835118,0.0,0.164882,alpha,"[0.46545253950171167, 0.05344050549725188, 0.1...",0,beta


## Terminate Spark

In [20]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()