In [2]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

# Locations of the train/test CSV files on the mounted Google Drive.
train_path = '/content/drive/MyDrive/train.csv'
test_path = '/content/drive/MyDrive/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
print("Training Data Info:")
train_df.info()
print("\nTraining Data Description:")
print(train_df.describe())

print("\nTesting Data Info:")
test_df.info()

print("\nFirst 5 rows of Training Data:")
print(train_df.head())
print("\nFirst 5 rows of Testing Data:")
print(test_df.head())
# Per-column count of non-missing (not NaN/null) values in the training data
print(train_df.notna().sum())
# Per-column count of missing (NaN/null) values in the test data
print(test_df.isna().sum())
# Per-column count of missing (NaN/null) values in the training data
print(train_df.isna().sum())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8148 entries, 0 to 8147
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        8148 non-null   int64  
 1   sequence  8148 non-null   object 
 2   target    8148 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 191.1+ KB
None

Training Data Description:
                 id       target
count   8148.000000  8148.000000
mean    5033.456554    52.302745
std     2924.877222     4.738780
min        0.000000    38.520141
25%     2500.750000    48.608375
50%     5020.500000    51.620779
75%     7555.750000    55.496006
max    10091.000000    69.875198

Testing Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1945 entries, 0 to 1944
Data columns (total 2 columns):
 #   Column    Non-Null Count 

In [3]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


vocab = "ARNDCQEGHILKMFPSTWYVU"

def one_hot_encode_seq(seq, vocab=vocab, max_length=None):
    """Return the flattened one-hot matrix of a sequence.

    The matrix has one row per sequence position and one column per character
    of ``vocab``; it is flattened row by row into a single vector.

    Args:
        seq: Sequence to encode, one symbol per character.
        vocab: String whose character positions define the column order.
        max_length: Number of rows in the matrix. Shorter sequences leave the
            trailing rows at zero, longer ones are cut off. ``None`` means
            ``len(seq)``.

    Returns:
        1-D integer array of ``max_length * len(vocab)`` entries. A symbol
        that is not in ``vocab`` leaves its row all zeros.
    """
    n_rows = len(seq) if max_length is None else max_length
    column_of = dict(zip(vocab, range(len(vocab))))
    grid = np.zeros((n_rows, len(vocab)), dtype=int)

    # Collect the (row, column) cells to switch on, then set them in one go.
    hits = [
        (row, column_of[symbol])
        for row, symbol in enumerate(seq[:n_rows])
        if symbol in column_of
    ]
    if hits:
        rows, cols = zip(*hits)
        grid[list(rows), list(cols)] = 1
    return grid.ravel()

def frequency_encode_seq(seq, vocab=vocab):
    """Return the relative frequency of every vocabulary symbol in ``seq``.

    Args:
        seq: Sequence to summarize, one symbol per character.
        vocab: String whose character positions define the output order.

    Returns:
        1-D float array of length ``len(vocab)``. Each entry is the number of
        occurrences of that symbol divided by ``len(seq)``. Symbols that are
        not in ``vocab`` are not counted but still contribute to the length.
        An empty sequence yields all zeros.
    """
    slot_of = dict(zip(vocab, range(len(vocab))))
    freqs = np.zeros(len(vocab), dtype=float)
    for symbol in seq:
        slot = slot_of.get(symbol)
        if slot is not None:
            freqs[slot] += 1

    total = len(seq)
    if total > 0:
        # Turn raw counts into fractions of the full sequence length.
        freqs /= total
    return freqs

def encode_dataset(df, encoding_func, max_length=None):
    """Encode every entry of ``df['sequence']`` and stack the results.

    Args:
        df: Object whose ``'sequence'`` item is an iterable of sequences
            (a DataFrame with a ``sequence`` column in this notebook).
        encoding_func: Callable taking ``(seq, vocab)`` and, optionally, a
            ``max_length`` parameter. The module-level ``vocab`` is always
            passed as the second argument.
        max_length: Forwarded to ``encoding_func`` only when that callable
            declares a parameter named ``max_length``.

    Returns:
        ``numpy`` array built from the per-sequence encodings, one row per
        sequence.
    """
    import inspect  # local import: only needed to inspect the encoder

    # Look at the declared parameters rather than ``__code__.co_varnames``:
    # co_varnames also lists local variables (a false positive for an encoder
    # that merely uses a local called ``max_length``) and is missing on
    # callables such as functools.partial objects or callable instances.
    try:
        takes_max_length = 'max_length' in inspect.signature(encoding_func).parameters
    except (TypeError, ValueError):
        # No introspectable signature: fall back to the two-argument call.
        takes_max_length = False

    sequences = df['sequence']
    if takes_max_length:
        # Keyword form so the value reaches ``max_length`` whatever its position.
        rows = [encoding_func(seq, vocab, max_length=max_length) for seq in sequences]
    else:
        rows = [encoding_func(seq, vocab) for seq in sequences]
    return np.array(rows)

# Row count used by the one-hot encoder: the longest sequence found in either
# the training or the test frame, so both matrices get the same width.
max_length = max(
    frame['sequence'].str.len().max() for frame in (train_df, test_df)
)

# Flattened one-hot features (train first, then test).
train_features_one_hot, test_features_one_hot = (
    encode_dataset(frame, one_hot_encode_seq, max_length)
    for frame in (train_df, test_df)
)

# Per-symbol frequency features (train first, then test).
train_features_freq, test_features_freq = (
    encode_dataset(frame, frequency_encode_seq)
    for frame in (train_df, test_df)
)

# Regression targets as a plain NumPy array.
train_targets = train_df['target'].to_numpy()

In [8]:
# Linear regression on the flattened one-hot features; its test predictions
# are kept for later use.
model_one_hot = LinearRegression().fit(train_features_one_hot, train_targets)
train_predictions_one_hot = model_one_hot.predict(train_features_one_hot)
test_predictions_one_hot = model_one_hot.predict(test_features_one_hot)

# Same estimator on the frequency features (only training predictions are made).
model_freq = LinearRegression().fit(train_features_freq, train_targets)
train_predictions_freq = model_freq.predict(train_features_freq)

# Both scores are measured on the data the models were fitted on.
rmse_one_hot, rmse_freq = (
    np.sqrt(mean_squared_error(train_targets, fitted))
    for fitted in (train_predictions_one_hot, train_predictions_freq)
)

print("RMSE with One-Hot Encoding:", rmse_one_hot)
print("RMSE with Frequency Encoding:", rmse_freq)

RMSE with One-Hot Encoding: 0.5663342573821032
RMSE with Frequency Encoding: 4.34529999116271


In [10]:
# Two-column frame: the test ids next to the one-hot linear-regression predictions.
submission_df = test_df[['id']].assign(target=test_predictions_one_hot)

# Written without the index so the file holds only the id and target columns.
submission_df.to_csv('/content/drive/MyDrive/linear_regression_prediction.csv', index=False)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest on the full (un-reduced) one-hot feature matrix.
# n_jobs=-1 fits the trees on all available cores, matching the other
# random-forest cells in this notebook; the recorded single-threaded run of
# this cell ended in a KeyboardInterrupt. The seed is unchanged.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model_rf.fit(train_features_one_hot, train_targets)
train_predictions_rf = model_rf.predict(train_features_one_hot)
test_predictions_rf = model_rf.predict(test_features_one_hot)

# Error on the training data itself (no held-out set is used here).
rmse_rf = np.sqrt(mean_squared_error(train_targets, train_predictions_rf))
print("Training RMSE with RandomForest:", rmse_rf)

# Test ids paired with the forest's predictions, saved without the index.
submission_df_rf = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_rf
})
submission_df_rf.to_csv('/content/drive/MyDrive/withoutPCA_prediction_rf.csv', index=False)
print("RandomForest CSV file has been created and saved.")


KeyboardInterrupt: 

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA


# Project the one-hot features onto 200 principal components. The projection
# is fitted on the training matrix only and then applied to the test matrix.
# random_state is fixed because PCA's default solver choice may be a
# randomized SVD; without a seed the components (and everything downstream)
# could differ between runs even though the forest below is seeded.
pca = PCA(n_components=200, random_state=42)
train_features_pca = pca.fit_transform(train_features_one_hot)
test_features_pca = pca.transform(test_features_one_hot)

# Small, shallow forest: 50 trees of depth <= 5, fitted on all cores.
model_rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)
model_rf.fit(train_features_pca, train_targets)

# Error on the training data itself (no held-out set is used here).
train_predictions_rf = model_rf.predict(train_features_pca)
rmse_rf = np.sqrt(mean_squared_error(train_targets, train_predictions_rf))
print("Training RMSE with RandomForest:", rmse_rf)

# Test ids paired with the forest's predictions, saved without the index.
test_predictions_rf = model_rf.predict(test_features_pca)
submission_df_rf = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_rf
})
submission_df_rf.to_csv('/content/drive/MyDrive/estimators50_PCAprediction_rf.csv', index=False)
print("RandomForest CSV file has been created and saved.")

Training RMSE with RandomForest: 3.9445961902588302
RandomForest CSV file has been created and saved.


In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA


# Project the one-hot features onto 200 principal components. The projection
# is fitted on the training matrix only and then applied to the test matrix.
# random_state is fixed because PCA's default solver choice may be a
# randomized SVD; without a seed the components (and everything downstream)
# could differ between runs even though the forest below is seeded.
pca = PCA(n_components=200, random_state=42)
train_features_pca = pca.fit_transform(train_features_one_hot)
test_features_pca = pca.transform(test_features_one_hot)

# Larger, deeper forest: 100 trees of depth <= 10, fitted on all cores.
model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
model_rf.fit(train_features_pca, train_targets)

# Error on the training data itself (no held-out set is used here).
train_predictions_rf = model_rf.predict(train_features_pca)
rmse_rf = np.sqrt(mean_squared_error(train_targets, train_predictions_rf))
print("Training RMSE with RandomForest:", rmse_rf)

# Test ids paired with the forest's predictions, saved without the index.
test_predictions_rf = model_rf.predict(test_features_pca)
submission_df_rf = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_rf
})
submission_df_rf.to_csv('/content/drive/MyDrive/estimators100_PCAprediction_rf.csv', index=False)
print("RandomForest CSV file has been created and saved.")

Training RMSE with RandomForest: 2.7864261661655534
RandomForest CSV file has been created and saved.


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees (100 stages, depth 3, learning rate 0.1) fitted on
# the raw one-hot feature matrix.
model_gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(train_features_one_hot, train_targets)

train_predictions_gb = model_gb.predict(train_features_one_hot)
test_predictions_gb = model_gb.predict(test_features_one_hot)

# Error on the training data itself.
rmse_gb = np.sqrt(mean_squared_error(train_targets, train_predictions_gb))
print("Training RMSE with Gradient Boosting:", rmse_gb)

# Test ids paired with the boosted-model predictions, saved without the index.
submission_df_gb = test_df[['id']].assign(target=test_predictions_gb)
submission_df_gb.to_csv('/content/drive/MyDrive/prediction_gb.csv', index=False)
print("Gradient Boosting CSV file has been created and saved to '/content/drive/MyDrive/prediction_gb.csv'")


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat between runs; 42 is the seed the tree models in this
# notebook use as well.
tf.keras.utils.set_random_seed(42)

# Standardise each one-hot column; the scaler is fitted on the training
# matrix only and re-used for the test matrix.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features_one_hot)
test_features_scaled = scaler.transform(test_features_one_hot)

# Fully connected regressor: 128 -> 64 -> 32 ReLU units, then one linear output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(train_features_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # no activation function: unbounded regression output
])

model.compile(optimizer='adam', loss='mean_squared_error')

# validation_split=0.2 holds part of the training rows out of the weight
# updates and reports the loss on them after every epoch.
history = model.fit(train_features_scaled, train_targets, epochs=50, batch_size=32, validation_split=0.2)

# predict() returns a column of shape (n_samples, 1); it is flattened so the
# metric compares two 1-D vectors. Note the score covers every training row,
# including the ones held out for validation above.
train_predictions_nn = model.predict(train_features_scaled)
rmse_nn = np.sqrt(mean_squared_error(train_targets, train_predictions_nn.ravel()))
print("Training RMSE with Neural Network:", rmse_nn)

test_predictions_nn = model.predict(test_features_scaled)

submission_df_nn = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_nn.flatten()  # convert 2D array to 1D
})
submission_df_nn.to_csv('/content/drive/MyDrive/50epoch_prediction_nn.csv', index=False)
print("Neural Network CSV file has been created and saved.")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training RMSE with Neural Network: 16.680073504236123
Neural Network CSV file has been created and saved.


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat between runs; 42 is the seed the tree models in this
# notebook use as well.
tf.keras.utils.set_random_seed(42)

# Standardise each one-hot column; the scaler is fitted on the training
# matrix only and re-used for the test matrix.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features_one_hot)
test_features_scaled = scaler.transform(test_features_one_hot)


# Fully connected regressor: 128 -> 64 -> 32 ReLU units, then one linear output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(train_features_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
# validation_split=0.2 holds part of the training rows out of the weight
# updates and reports the loss on them after every epoch.
history = model.fit(train_features_scaled, train_targets, epochs=300, batch_size=32, validation_split=0.2)
# predict() returns a column of shape (n_samples, 1); it is flattened so the
# metric compares two 1-D vectors. Note the score covers every training row,
# including the ones held out for validation above.
train_predictions_nn = model.predict(train_features_scaled)
rmse_nn = np.sqrt(mean_squared_error(train_targets, train_predictions_nn.ravel()))
print("Training RMSE with Neural Network:", rmse_nn)

test_predictions_nn = model.predict(test_features_scaled)

# Test ids paired with the flattened network predictions, saved without the index.
submission_df_nn = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_nn.flatten()
})
submission_df_nn.to_csv('/content/drive/MyDrive/300epoch_prediction_nn.csv', index=False)
print("Neural Network CSV file has been created and saved.")

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat between runs; 42 is the seed the tree models in this
# notebook use as well.
tf.keras.utils.set_random_seed(42)

# Standardise each one-hot column; the scaler is fitted on the training
# matrix only and re-used for the test matrix.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features_one_hot)
test_features_scaled = scaler.transform(test_features_one_hot)

# Fully connected regressor: 128 -> 64 -> 32 ReLU units, then one linear output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(train_features_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])


model.compile(optimizer='adam', loss='mean_squared_error')
# validation_split=0.2 holds part of the training rows out of the weight
# updates and reports the loss on them after every epoch.
history = model.fit(train_features_scaled, train_targets, epochs=125, batch_size=32, validation_split=0.2)
# predict() returns a column of shape (n_samples, 1); it is flattened so the
# metric compares two 1-D vectors. Note the score covers every training row,
# including the ones held out for validation above.
train_predictions_nn = model.predict(train_features_scaled)
rmse_nn = np.sqrt(mean_squared_error(train_targets, train_predictions_nn.ravel()))
print("Training RMSE with Neural Network:", rmse_nn)
test_predictions_nn = model.predict(test_features_scaled)

# Test ids paired with the flattened network predictions, saved without the index.
submission_df_nn = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions_nn.flatten()
})
submission_df_nn.to_csv('/content/drive/MyDrive/125_epoch_prediction_nn.csv', index=False)
print("Neural Network CSV file has been created and saved.")

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78