# Analyze the features

In [1]:
import pandas as pd
path = ''
df = pd.read_csv(f'{path}train_features_analysis.csv')

In [2]:
df.head()

Unnamed: 0,TCR,epitope,binding,TCR_AF1,TCR_AF2,TCR_AF3,TCR_AF4,TCR_AF5,TCR_BLOSUM1,TCR_BLOSUM2,...,TCR_molecular_weight,TCR_hydrophobicity,TCR_aromaticity,TCR_isoelectric_point,TCR_instability_index,epitope_molecular_weight,epitope_hydrophobicity,epitope_aromaticity,epitope_isoelectric_point,epitope_instability_index
0,"tensor([[[ 0.0430, 0.1207, -0.0356, ..., -0....","tensor([[[ 0.1032, 0.1382, 0.0792, ..., -0....",1,-0.076129,0.297741,-0.776977,-0.069318,-0.651857,0.145714,0.256429,...,1640.6832,-0.65,0.285714,4.050028,37.764286,885.0627,-0.1,0.0,10.002737,39.955556
1,"tensor([[[ 0.0326, 0.1168, 0.0275, ..., -0....","tensor([[[ 1.2764e-01, 2.0162e-01, 4.4363e-0...",1,-0.063929,0.489718,-0.399876,0.035779,-0.176357,0.203846,0.253846,...,1445.5112,-0.746154,0.230769,5.239896,18.515385,1059.2783,0.922222,0.222222,8.59088,71.901111
2,"tensor([[[ 0.0817, 0.1058, 0.0129, ..., -0....","tensor([[[ 0.1032, 0.1382, 0.0792, ..., -0....",1,-0.160409,0.424326,-0.480501,0.325677,-0.269915,0.094286,0.04,...,1484.5423,-0.271429,0.214286,4.050028,13.9,885.0627,-0.1,0.0,10.002737,39.955556
3,"tensor([[[ 0.0789, 0.1837, 0.0413, ..., -0....","tensor([[[ 0.0269, 0.1638, 0.0375, ..., -0....",1,0.260402,-0.120213,-0.19196,0.167132,0.015942,0.317692,-0.027692,...,1627.7823,-1.092308,0.153846,8.24952,109.623077,966.1732,2.266667,0.222222,5.525,20.855556
4,"tensor([[[ 0.1056, 0.1566, 0.0287, ..., -0....","tensor([[[ 0.0401, 0.1326, 0.0532, ..., -0....",1,-0.486168,0.320122,-0.851153,0.615578,-0.419468,-0.069333,-0.258667,...,1484.6715,0.84,0.133333,4.050028,67.506667,1293.3839,-0.918182,0.181818,6.743628,12.890909


In [3]:
df = df.drop(columns=['TCR', 'epitope'])

## Random Forest and feature importance

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = df.drop('binding', axis=1)
y = df['binding']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestClassifier(n_estimators = 200, random_state=33)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9625


In [5]:
# feature importance
importances = rf.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': X.columns, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df[:20])

                      Feature  Gini Importance
116          epitope_BLOSUM10         0.022946
185             epitope_VHSE1         0.017300
198            epitope_VSTPV6         0.016901
126               epitope_KF1         0.016291
199                epitope_Z1         0.015695
196            epitope_VSTPV4         0.014535
166               epitope_ST6         0.013963
132               epitope_KF7         0.013776
187             epitope_VHSE3         0.012942
197            epitope_VSTPV5         0.012822
151           epitope_ProtFP3         0.012822
170            epitope_SVGER2         0.012758
155           epitope_ProtFP7         0.012015
209  epitope_molecular_weight         0.011881
143                epitope_E5         0.011046
110           epitope_BLOSUM4         0.010499
195            epitope_VSTPV3         0.009942
159               epitope_SV3         0.009516
186             epitope_VHSE2         0.009492
175            epitope_SVGER7         0.009476


High Gini Importance: Indicates that the feature is significant in predicting the target variable. Features with higher Gini importance contribute more to model accuracy.

`epitope_BLOSUM10`, `epitope_VHSE1`, `epitope_VSTPV6`, `epitope_KF1`, `epitope_Z1`,  are among the most important based on Gini importance.

## Variance Analysis

In [6]:
variances = df.var()
var_df = variances.reset_index()
var_df.columns = ['descriptors', 'variance']

# Filter out descriptors with low variance
var_df = var_df.query('variance > 1') 
print(var_df.sort_values(by=['variance'], ascending=False))

                   descriptors      variance
205       TCR_molecular_weight  47308.890621
210   epitope_molecular_weight  18359.386069
214  epitope_instability_index   1453.671757
209      TCR_instability_index    652.029594
213  epitope_isoelectric_point      6.171623
147              epitope_PRIN1      5.930544
176             epitope_SVGER7      4.795772
148              epitope_PRIN2      4.440577
179            epitope_SVGER10      3.845924
208      TCR_isoelectric_point      3.024753
77                 TCR_SVGER10      2.572540
150            epitope_ProtFP1      2.461494
170             epitope_SVGER1      2.420549
74                  TCR_SVGER7      2.391468
151            epitope_ProtFP2      2.252743
181                 epitope_T1      1.877131
45                   TCR_PRIN1      1.793627
46                   TCR_PRIN2      1.541911
68                  TCR_SVGER1      1.407774


High Variance: Indicates that the feature has a wide range of values and can be more informative.

`TCR_molecular_weight`, `epitope_molecular_weight`, `epitope_instability_index` and `TCR_instability_index` are high in variance, indicating that these features may carry significant information

## Variance Inflation Factor (VIF)

In [7]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = df.drop(columns=['binding'])

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data.sort_values(by=['VIF'])[:20])

  vif = 1. / (1. - r_squared_i)


                       feature        VIF
208      TCR_instability_index   1.872116
213  epitope_instability_index   2.449817
204       TCR_molecular_weight   3.351866
209   epitope_molecular_weight   4.070197
207      TCR_isoelectric_point   9.126968
212  epitope_isoelectric_point  22.399298
206            TCR_aromaticity        inf
134                epitope_KF9        inf
135               epitope_KF10        inf
136            epitope_MSWHIM1        inf
137            epitope_MSWHIM2        inf
138            epitope_MSWHIM3        inf
139                 epitope_E1        inf
140                 epitope_E2        inf
141                 epitope_E3        inf
142                 epitope_E4        inf
143                 epitope_E5        inf
144                epitope_PD1        inf
145                epitope_PD2        inf
146              epitope_PRIN1        inf


- VIF < 5: Generally considered acceptable.
- VIF between 5–10: Indicates moderate multicollinearity.
- VIF > 10: Suggests high multicollinearity; these features should be considered for removal or adjustment.

Just `TCR_instability_index`, `epitope_instability_index`, `TCR_molecular_weight`, `epitope_molecular_weight`, `TCR_isoelectric_point` and `epitope_isoelectric_point` have a VIF value the others has an infinit value. Maybe we need to drop some of the features to reduce mutlicollinearity.


## VIF with a high variance features

In [8]:
X = df[var_df['descriptors']]

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data.sort_values(by=['VIF']))

                      feature         VIF
18  epitope_instability_index    1.785308
15      TCR_instability_index    2.800042
0                   TCR_PRIN1    4.472529
1                   TCR_PRIN2   14.986066
14      TCR_isoelectric_point   22.257625
17  epitope_isoelectric_point   27.688107
13       TCR_molecular_weight   62.760359
3                  TCR_SVGER7   65.545048
7             epitope_ProtFP1   75.574871
11            epitope_SVGER10   86.481374
10             epitope_SVGER7   88.293459
5               epitope_PRIN1   99.598936
4                 TCR_SVGER10  126.014853
16   epitope_molecular_weight  145.370965
9              epitope_SVGER1  158.178729
2                  TCR_SVGER1  254.096190
8             epitope_ProtFP2  289.717506
6               epitope_PRIN2  333.367345
12                 epitope_T1  410.318766


Only `epitope_instability_index`, `TCR_instability_index` and `TCR_PRIN1` has a smaller value than 10

## VIF with the Physicochemical features from BioPython

In [9]:
X = df[['TCR_molecular_weight', 'epitope_molecular_weight', 'TCR_hydrophobicity', 'epitope_hydrophobicity', 'TCR_aromaticity', 'epitope_aromaticity', 'TCR_isoelectric_point', 'epitope_isoelectric_point', 'TCR_instability_index', 'epitope_instability_index']]

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data.sort_values(by=['VIF']))

                     feature        VIF
2         TCR_hydrophobicity   1.214408
3     epitope_hydrophobicity   1.555622
9  epitope_instability_index   1.555990
8      TCR_instability_index   2.251413
5        epitope_aromaticity   3.385429
4            TCR_aromaticity   7.583196
7  epitope_isoelectric_point  10.914611
6      TCR_isoelectric_point  11.701577
0       TCR_molecular_weight  42.279470
1   epitope_molecular_weight  50.986218


They all have a low vif value except `TCR_molecular_weight` and `epitope_molecular_weight`, but these have a very high variance value. However, the weight seems to be dependent on the other features. So we will drop them in favor to have a more stable model.

## VIF with the Random Forest Feature importance

In [10]:
X = df[feature_imp_df[:10]['Feature']]

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data.sort_values(by=['VIF']))

            feature         VIF
3       epitope_KF1    1.653683
7       epitope_KF7    1.753344
0  epitope_BLOSUM10    2.519118
9    epitope_VSTPV5    3.135549
5    epitope_VSTPV4    6.211116
6       epitope_ST6   18.423876
8     epitope_VHSE3   19.364298
2    epitope_VSTPV6   21.007194
1     epitope_VHSE1   76.965062
4        epitope_Z1  107.865807


## VIF with BioPython & Random Forest Feature importance

In [13]:
X = df[['epitope_KF7', 'TCR_KF7', 'epitope_KF1', 'TCR_KF1','TCR_hydrophobicity', 'TCR_aromaticity',
       'TCR_isoelectric_point', 'TCR_instability_index','epitope_hydrophobicity','epitope_aromaticity',
        'epitope_isoelectric_point','epitope_instability_index']]

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data.sort_values(by=['VIF']))

                      feature       VIF
4          TCR_hydrophobicity  1.293533
0                 epitope_KF7  1.333976
3                     TCR_KF1  1.368396
1                     TCR_KF7  1.455007
11  epitope_instability_index  1.531864
2                 epitope_KF1  1.720917
8      epitope_hydrophobicity  1.803038
7       TCR_instability_index  2.286762
9         epitope_aromaticity  2.616315
5             TCR_aromaticity  6.149341
6       TCR_isoelectric_point  9.049608
10  epitope_isoelectric_point  9.635145


Good all VIF values are below 10

## Retrain Random Forest with reduced features

In [12]:
X = df[['epitope_KF7', 'TCR_KF7', 'epitope_KF1', 'TCR_KF1','TCR_hydrophobicity', 'TCR_aromaticity',
       'TCR_isoelectric_point', 'TCR_instability_index','epitope_hydrophobicity','epitope_aromaticity',
        'epitope_isoelectric_point','epitope_instability_index']]
y = df['binding']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestClassifier(n_estimators = 200, random_state=33)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# feature importance
importances = rf.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': X.columns, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df[:20])

Accuracy: 0.95
                      Feature  Gini Importance
2                 epitope_KF1         0.191727
0                 epitope_KF7         0.179264
8      epitope_hydrophobicity         0.111241
11  epitope_instability_index         0.108909
10  epitope_isoelectric_point         0.100901
4          TCR_hydrophobicity         0.051738
9         epitope_aromaticity         0.048744
1                     TCR_KF7         0.046407
7       TCR_instability_index         0.045073
3                     TCR_KF1         0.042681
6       TCR_isoelectric_point         0.040255
5             TCR_aromaticity         0.033060


The Accuracy is just slighty worse, but can considered as the same

# Conclusion

We will take the following physicochemical features:

- `TCR_KF7`
- `TCR_KF1`
- `TCR_hydrophobicity`
- `TCR_aromaticity`
- `TCR_isoelectric_point`
- `TCR_instability_index`
- `epitope_KF7`
- `epitope_KF1`
- `epitope_hydrophobicity`
- `epitope_aromaticity`
- `epitope_isoelectric_point`
- `epitope_instability_index`