In [1]:
import numpy as np
import pandas as pd

In [2]:
original = pd.read_csv("../novozymes-enzyme-stability-prediction/train.csv")
original.describe()

Unnamed: 0,seq_id,pH,tm
count,31390.0,31104.0,31390.0
mean,15694.5,6.892339,49.147337
std,9061.656811,1.612225,14.010089
min,0.0,1.99,-1.0
25%,7847.25,7.0,42.1
50%,15694.5,7.0,48.0
75%,23541.75,7.0,53.8
max,31389.0,64.9,130.0


In [3]:
original

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5
...,...,...,...,...,...
31385,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,doi.org/10.1038/s41592-020-0801-4,51.8
31386,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,doi.org/10.1038/s41592-020-0801-4,37.2
31387,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,doi.org/10.1038/s41592-020-0801-4,64.6
31388,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.7


In [4]:
original.isnull().sum(axis=0)

seq_id                 0
protein_sequence       0
pH                   286
data_source         3347
tm                     0
dtype: int64

In [5]:
# Will take 3-5 seconds to run
def load_fixed_train_df(original_train_file_path="../novozymes-enzyme-stability-prediction/train.csv",
                        update_file_path="../novozymes-enzyme-stability-prediction/train_updates_20220929.csv",
                        was_fixed_col=False):
    def _fix_tm_ph(_row, update_map):
        update_vals = update_map.get(_row["seq_id"], None)
        if update_vals is not None:
            _row["tm"] = update_vals["tm"]
            _row["pH"] = update_vals["pH"]
        return _row

    # Load dataframes
    _df = pd.read_csv(original_train_file_path)
    _updates_df = pd.read_csv(update_file_path)

    # Identify which sequence ids need to have the tm and pH values changed and create a dictionary mapping 
    seqid_2_phtm_update_map = _updates_df[~pd.isna(_updates_df["pH"])].groupby("seq_id")[["pH", "tm"]].first().to_dict("index")

    # Identify the sequence ids that will be dropped due to data quality issues
    bad_seqids = _updates_df[pd.isna(_updates_df["pH"])]["seq_id"].to_list()

    # Fix bad sequence ids
    _df = _df[~_df["seq_id"].isin(bad_seqids)].reset_index(drop=True)

    # Fix pH and tm swaparoo
    _df = _df.apply(lambda x: _fix_tm_ph(x, seqid_2_phtm_update_map), axis=1)

    # Add in a bool to track if a row was fixed or not (tm/ph swap will look the same as bad data)
    if was_fixed_col: _df["was_fixed"] = _df["seq_id"].isin(bad_seqids+list(seqid_2_phtm_update_map.keys()))

    return _df


protein_dataset = load_fixed_train_df()
protein_dataset

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5
...,...,...,...,...,...
28976,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,doi.org/10.1038/s41592-020-0801-4,51.8
28977,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,doi.org/10.1038/s41592-020-0801-4,37.2
28978,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,doi.org/10.1038/s41592-020-0801-4,64.6
28979,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.7


In [6]:
protein_dataset.isnull().sum(axis=0)

seq_id                0
protein_sequence      0
pH                  286
data_source         980
tm                    0
dtype: int64

In [7]:
protein_dataset['protein_sequence_length'] = protein_dataset['protein_sequence'].str.len()
protein_dataset

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,protein_sequence_length
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7,341
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5,286
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5,497
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2,265
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5,1451
...,...,...,...,...,...,...
28976,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,doi.org/10.1038/s41592-020-0801-4,51.8,549
28977,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,doi.org/10.1038/s41592-020-0801-4,37.2,469
28978,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,doi.org/10.1038/s41592-020-0801-4,64.6,128
28979,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.7,593


In [8]:
protein_dataset = protein_dataset.drop(columns=['seq_id', 'data_source'], errors='ignore')
protein_dataset

Unnamed: 0,protein_sequence,pH,tm,protein_sequence_length
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,286
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,497
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,265
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,1451
...,...,...,...,...
28976,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,549
28977,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,469
28978,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,128
28979,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,593


In [9]:
protein_dataset.describe()

Unnamed: 0,pH,tm,protein_sequence_length
count,28695.0,28981.0,28981.0
mean,6.872467,51.360005,450.468617
std,0.793184,12.056717,415.159049
min,1.99,25.1,5.0
25%,7.0,43.6,212.0
50%,7.0,48.8,351.0
75%,7.0,54.6,537.0
max,11.0,130.0,8798.0


In [10]:
from sklearn.model_selection import train_test_split
feature_columns = ["protein_sequence"]
target_columns = "tm"
X_train, X_test, y_train, y_test = train_test_split(protein_dataset[feature_columns], protein_dataset[target_columns], test_size=0.2, random_state=42)

In [13]:
from gensim import utils

class MyDataset:
    # creatign a wrapper to process a sentence
    def __iter__(self):
        for line in X_train['protein_sequence'].values.tolist():
            yield [*line]

In [14]:
import gensim.models
protein_seqs = MyDataset()
# training word2vec for our own dataset
gensim_model = gensim.models.Word2Vec(sentences=protein_seqs, vector_size=20, sg=0)

In [15]:
def getAverageVectors(sentence):
    tokens = [*sentence] # split sentence to tokens on space
    result = np.zeros(20) # start with zero result

    for token in tokens:
        # try except block helps in ignoring words that are not present in word2Vec model
        try: 
            result = np.add(result, gensim_model.wv[token])
        except:
            pass
    # find avgerage of the result
    result = result / len(tokens)
    return ','.join(map(str, result))

def getNVectors(sentence, n=10000):
    tokens = [*sentence] # split sentence to tokens on space
    result = [] # result is initialized to an empty list
    count = 0
    for token in tokens:
        # try except block helps in ignoring words that are not present in word2Vec model
        try:
            word_vec = np.array(gensim_model.wv[token], np.float32) # caculate word2vec for each token
            result.append(word_vec)
            count += 1
        except:
            pass
        if count == n:
            break
    # check if the result list is less than the limit, if yes append zero vectors till limit is reached
    while len(result) < n:
        result.append(np.zeros((20, ), dtype=np.float32))
    return result

In [16]:
gensim_xtrain = X_train.copy()
gensim_xtrain['protein_sequence'] = gensim_xtrain['protein_sequence'].apply(lambda x: getAverageVectors(x))
gensim_xtrain = gensim_xtrain['protein_sequence'].str.split(',', -1, expand=True)
gensim_xtrain

  gensim_xtrain = gensim_xtrain['protein_sequence'].str.split(',', -1, expand=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
5653,-0.2696378109053717,0.09805583658519092,0.01150413222510267,-0.005185535420906973,-0.03345716650690044,0.10557636383895737,0.21592983309817207,0.06311751974679462,-0.14773012429207294,-0.06357832227336443,0.18605571233474458,-0.05756680271378509,0.011874853159292592,0.015977486070279066,0.005246550146792386,0.09122984325261535,0.20528567816038523,0.09980698725258982,-0.14310468536977833,0.1500183685540079
15912,-0.14775125989142587,0.1575423295468369,-0.07776257007068525,0.05058268586825291,0.013512459184236294,0.055842117360294236,0.16910413904086724,0.16264204074839136,-0.0734168261105062,-0.022972932953283696,0.22838654908879252,-0.1771580171151427,0.13423394271671402,0.052700176119988916,-0.13635066039744914,0.18447946303815296,0.25645581048108845,0.256197237054999,-0.10772624283152468,0.12331791106761425
20717,-0.100396723475208,0.17050315607925448,-0.14236801088343376,0.10839346351584568,0.039826411973757185,-0.003442858143982457,0.18992380631893657,0.1554899314513146,-0.17298596465991714,-0.008787920776321654,0.22760326362983943,-0.17773619446605804,0.13099885267852726,0.09913970843931665,-0.2211279303881793,0.14096143205828296,0.1966210352730448,0.2667332783738291,-0.0960298607951215,-0.016968561250707427
10385,-0.2141049335484451,0.13690954171342112,0.03233160891985826,-0.0636696484739666,-0.042047841982168835,0.12707411854838332,0.26142378332419586,0.05510667141127048,-0.22657045835660675,-0.06278570632037238,0.25530288682061403,-0.048122306243847994,0.03950785547179186,0.03096301843659521,0.15416942284268847,0.02137046017931343,0.08397062318031229,0.08058495179744764,-0.11890744683095965,0.1444154491195571
17694,-0.24132379472746954,0.062210773527212854,-0.04168340375053359,0.055440494827516784,0.032485496424711664,0.02388528157127919,0.1847283885210425,0.15832598025163452,-0.09565261251978822,-0.0007498761398492607,0.1600160776586323,-0.11967992545156689,-0.0021384929030478657,0.1097495265264105,-0.12896274546427386,0.19805967946956446,0.25109626045348227,0.2728780815234551,-0.0849179530782359,0.1086316303237454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,-0.23437280752337897,0.15575256863752238,0.04103358472769077,-0.08659210338042332,0.033203579015456716,-0.15758765261333724,0.2932122719517121,0.20355091398152023,-0.24121321563537304,-0.024440353146682563,0.3514813353465154,-0.15684913635253905,0.06311643057717727,0.1487101139070896,0.16482012445536945,0.04798493842665966,0.03215716024430899,0.1638949554700118,-0.21482106689077157,0.1619028696647057
5390,-0.20508295442370406,0.11003836010111473,-0.03639796283657113,-0.011595844414578864,-0.03056717347858631,0.09300365150171186,0.241751955328397,0.08194486763193665,-0.1200049661818039,-0.04987766481206629,0.23001671266035142,-0.10868848404701216,0.04115741333576391,0.08957091000222836,0.001133469578596842,0.1649938427270895,0.17447444105363755,0.198132973209203,-0.06726004862148001,0.15318220889711953
860,-0.14693176396483298,0.06872755205441339,-0.09171586833602073,0.0582526473876308,0.0497778360631463,-0.03400674020214116,0.22203013442602812,0.12967240124867827,-0.18760071382218715,-0.031571339851007886,0.2170501963180654,-0.14207553501222647,0.0514813990715672,0.14819480316603886,-0.18802634601426474,0.1674069447187232,0.15644781475558,0.29272664458143943,-0.08533082998850766,-0.012434752080954757
15795,-0.13032351489296687,0.10853697477411249,-0.08084466676295538,-0.001005686039453391,0.025343836176189335,-0.23035588161967704,0.27029595685932967,0.14682885876169463,-0.22152105913250336,0.013841349572883991,0.2943427073711911,-0.22275424087034226,0.07326730597484753,0.20591244005993392,-0.09663828909981158,0.19429187505002252,0.09249013849854269,0.35310034941118484,-0.11043472633116189,0.018590344860732622


In [17]:
from sklearn.impute import SimpleImputer

imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer1.fit(gensim_xtrain)
new_train = pd.DataFrame(imputer1.transform(gensim_xtrain), columns=gensim_xtrain.columns)

In [18]:
gensim_xtrain.isnull().values.any()

False

In [19]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(gensim_xtrain, y_train)
reg.score(gensim_xtrain, y_train)

0.24578621208216245