In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

# Load the tab-separated feature table used for modelling.
data_path = r"/content/lp_data_he_for_model.tsv"
df = pd.read_csv(data_path, sep='\t')

# Regression target, referenced by name so the cell does not depend on the
# column order of the TSV (the original relied on it being the first column).
target_col = 'hurst_exponent'

# Drop rows where hurst_exponent is NA
df = df.dropna(subset=[target_col])
print("Number of rows after dropping NA:", len(df))

# Drop rows whose sequence_length is 0. Only this one column is checked; the
# message below treats such rows as proteins with zero-valued properties.
df = df[df['sequence_length'] != 0]
print("Number of rows after dropping proteins that have zero values for their properties:", len(df))

# Drop rows with very low Hurst exponent (threshold: strictly greater than 0.1)
df = df[df[target_col] > 0.1]
print("Number of rows after dropping low HE:", len(df))

# Separating the features (x) from the output: hurst_exponent (y)
x = df.drop(columns=[target_col])  # Features: every column except the target
y = df[target_col]                 # Output: hurst_exponent

# Data preparation for training and testing (80/20 split, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardizing the features: statistics are learned from the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)



Number of rows after dropping NA: 9264
Number of rows after dropping proteins that have zero values for their properties: 2992
Number of rows after dropping low HE: 2146


In [None]:
def _tune_and_report(label, estimator, param_grid):
    """Grid-search an estimator, score it on the held-out split, and print a report.

    Reads x_train_scaled, y_train, x_test_scaled and y_test from the notebook
    namespace.

    Parameters
    ----------
    label : str
        Heading printed before the results.
    estimator : sklearn regressor
        Unfitted estimator handed to GridSearchCV.
    param_grid : dict
        Hyperparameter grid to search exhaustively.

    Returns
    -------
    tuple
        (fitted GridSearchCV, best estimator, test-set predictions).
    """
    # n_jobs=-1 evaluates candidates in parallel; it only affects wall-clock time.
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='r2', n_jobs=-1)
    search.fit(x_train_scaled, y_train)
    best = search.best_estimator_
    y_pred = best.predict(x_test_scaled)
    print(f"{label}:")
    print("Best Params:", search.best_params_)
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    return search, best, y_pred


# Every estimator below gets random_state=42 so that re-running the cell
# reproduces the same selected parameters and scores.

# Decision Tree Regressor with Hyperparameter Tuning
dt_param_grid = {
    'max_depth': [None, 5, 10, 14, 20],
    'min_samples_split': [2, 5, 10]
}
dt_grid_search, best_dt, y_pred_dt = _tune_and_report(
    "Decision Tree", DecisionTreeRegressor(random_state=42), dt_param_grid
)

# Random Forest Regressor with Hyperparameter Tuning
rf_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search, best_rf, y_pred_rf = _tune_and_report(
    "Random Forest", RandomForestRegressor(random_state=42), rf_param_grid
)

# Gradient Boosting Regressor with Hyperparameter Tuning
gb_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}
gb_grid_search, best_gb, y_pred_gb = _tune_and_report(
    "Gradient Boosting", GradientBoostingRegressor(random_state=42), gb_param_grid
)

Decision Tree:
Best Params: {'max_depth': 5, 'min_samples_split': 10}
R^2 Score: -0.06184912088364025
Mean Squared Error: 0.011431941907096763
Random Forest:
Best Params: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
R^2 Score: 0.00031379681790022484
Mean Squared Error: 0.010762691587100009


KeyboardInterrupt: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares fitted on x_train (the unscaled training features).
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(x_train, y_train)
model  # last expression, so the fitted estimator is still displayed as cell output

In [None]:
# Score the fitted linear model on the held-out split.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

for line in (f"Mean Squared Error: {mse}", f"R^2 Score: {r2}"):
    print(line)

Mean Squared Error: 0.010542274323116834
R^2 Score: 0.020787123212501424


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load and clean the data
file_path = '/content/lp_data_he_for_model.tsv'  # Update with the correct path
data = pd.read_csv(file_path, sep='\t')
# Keep only rows with no exact-zero entry in any column, then require a target value.
data_cleaned = data[(data != 0).all(axis=1)]
data_cleaned = data_cleaned.dropna(subset=['hurst_exponent'])

# Split data into features and target
X = data_cleaned.drop(columns=['hurst_exponent'])
y = data_cleaned['hurst_exponent']

# Split BEFORE scaling so the test rows never contribute to the scaler's
# mean/variance (fitting the scaler on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features using statistics learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix transformed with the same train-fitted scaler. Later cells split
# X_scaled with the same test_size/random_state and therefore get these rows.
X_scaled = scaler.transform(X)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the whole training split, so no second fit() is needed.
best_model = grid_search.best_estimator_

# Make predictions and evaluate the model
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Optional: Cross-validation score for further reliability.
# NOTE(review): this runs over all rows, including the held-out test rows,
# with hyperparameters tuned on the training split - read it as a sanity check.
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='r2', n_jobs=-1)
print("Cross-validated R^2 Scores:", cv_scores)
print("Average Cross-validated R^2 Score:", cv_scores.mean())


KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Ridge regression with alpha=1.0.
ridge_model = Ridge(alpha=1.0)

# 5-fold cross-validated R^2 over X_scaled / y
cv_scores_ridge = cross_val_score(estimator=ridge_model, X=X_scaled, y=y, scoring='r2', cv=5)
print("Ridge Regression Cross-validated R^2 Scores:", cv_scores_ridge)
print("Ridge Regression Average Cross-validated R^2 Score:", cv_scores_ridge.mean())

# Fit on the training split, then predict and score the test split
y_pred_ridge = ridge_model.fit(X_train, y_train).predict(X_test)
mse_ridge, r2_ridge = (
    mean_squared_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)

print(f"Ridge Regression Mean Squared Error: {mse_ridge:.4f}")
print(f"Ridge Regression R^2 Score: {r2_ridge:.4f}")


Ridge Regression Cross-validated R^2 Scores: [0.01761248 0.01783787 0.02249808 0.00072289 0.03072812]
Ridge Regression Average Cross-validated R^2 Score: 0.017879886700139003
Ridge Regression Mean Squared Error: 0.0284
Ridge Regression R^2 Score: 0.0099


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable between runs of this cell.
tf.keras.utils.set_random_seed(42)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the model: a small fully connected network with dropout after the
# first two hidden layers and a single linear output unit for regression.
# The input shape is declared with an explicit Input layer; passing
# `input_shape` to the first Dense layer makes Keras emit a warning.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Early stopping to prevent overfitting: stop after 10 epochs without a
# val_loss improvement and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model; 20% of the training split is held out for validation
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate on test data
y_pred_nn = model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Neural Network Mean Squared Error: {mse_nn:.4f}")
print(f"Neural Network R^2 Score: {r2_nn:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - loss: 0.1014 - val_loss: 0.0392
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0500 - val_loss: 0.0314
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0393 - val_loss: 0.0312
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0333 - val_loss: 0.0293
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0318 - val_loss: 0.0282
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0299 - val_loss: 0.0274
Epoch 7/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0309 - val_loss: 0.0266
Epoch 8/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0283 - val_loss: 0.0266
Epoch 9/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score

# Split first, so the PCA projection is learned from training rows only.
# Fitting PCA on the full matrix before splitting lets test rows influence
# the components used to evaluate the model.
X_train_for_pca, X_test_for_pca, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Apply PCA to reduce dimensions
pca = PCA(n_components=0.95)  # Retain 95% variance (measured on the training split)
X_train_pca = pca.fit_transform(X_train_for_pca)
X_test_pca = pca.transform(X_test_for_pca)
# Projection of every row with the train-fitted PCA, kept under its original name.
X_pca = pca.transform(X_scaled)

# Train Ridge regression on PCA-transformed data
ridge_model_pca = Ridge(alpha=1.0)
ridge_model_pca.fit(X_train_pca, y_train)

# Evaluate
y_pred_pca = ridge_model_pca.predict(X_test_pca)
mse_pca = mean_squared_error(y_test, y_pred_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print(f"PCA Ridge Regression Mean Squared Error: {mse_pca:.4f}")
print(f"PCA Ridge Regression R^2 Score: {r2_pca:.4f}")


PCA Ridge Regression Mean Squared Error: 0.0284
PCA Ridge Regression R^2 Score: 0.0088


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Expand X_scaled with degree-2 polynomial terms; no constant column is added.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# 80/20 split of the expanded matrix with a fixed seed
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

# Ridge (alpha=1.0) fitted on the expanded training split
ridge_model_poly = Ridge(alpha=1.0)
ridge_model_poly.fit(X_train_poly, y_train)

# Held-out predictions and metrics
y_pred_poly = ridge_model_poly.predict(X_test_poly)
mse_poly, r2_poly = (
    mean_squared_error(y_test, y_pred_poly),
    r2_score(y_test, y_pred_poly),
)

print(f"Polynomial Ridge Regression Mean Squared Error: {mse_poly:.4f}")
print(f"Polynomial Ridge Regression R^2 Score: {r2_poly:.4f}")


Polynomial Ridge Regression Mean Squared Error: 0.0286
Polynomial Ridge Regression R^2 Score: -0.0000
