In [1]:
!pip install python-docx
!pip install scikit-optimize
from docx import Document
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder
class Transcript:
    def __init__(self, transcript_id, base_pairs, length):
        self.transcript_id = transcript_id
        self.base_pairs = base_pairs
        self.length = length
from google.colab import drive
drive.mount('/content/gdrive')
Foldername = '/content/gdrive/My Drive/'
doc = Document(Foldername + 'Base Pairs.docx')
transcripts = []
for table in doc.tables:
    for row in table.rows[1:]:
        cells = [cell.text.strip() for cell in row.cells]
        transcript = Transcript(*cells)
        transcripts.append(transcript)


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.1
Mounted at /content/gdrive


In [2]:
df = pd.DataFrame([(t.transcript_id, t.base_pairs, t.length) for t in transcripts],
                  columns=['Transcript_ID', 'Base_Pairs', 'Length'])
label_encoder = LabelEncoder()
df['Encoded_Length'] = label_encoder.fit_transform(df['Length'])
x_train, x_test, y_train, y_test = train_test_split(df[['Transcript_ID', 'Base_Pairs']], df['Encoded_Length'], test_size=0.25, random_state=3)
gbm = GradientBoostingClassifier(n_estimators=122, learning_rate=0.06024151710706933, max_depth=5, min_samples_leaf=1)
gbm.fit(x_train['Base_Pairs'].str.extract(r'(\d+)'), y_train)
y_pred = gbm.predict(x_test['Base_Pairs'].str.extract(r'(\d+)'))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
results_df = pd.DataFrame({
    'True_Label': label_encoder.inverse_transform(y_test),
    'Predicted_Label': label_encoder.inverse_transform(y_pred),
    'Transcript_ID': x_test['Transcript_ID'],
    'Length': df.loc[y_test.index, 'Length']
})
print("Correct Predictions:")
print(results_df[results_df['True_Label'] == results_df['Predicted_Label']][['Transcript_ID', 'Length']])
print("\nIncorrect Predictions:")
print(results_df[results_df['True_Label'] != results_df['Predicted_Label']][['Transcript_ID', 'Length']])

Accuracy: 1.0
Correct Predictions:
        Transcript_ID  Length
12  ENST00000374231.8   Small
39  ENST00000629095.1  Medium
9   ENST00000580840.1  Medium
47  ENST00000576311.5   Small
31  ENST00000548101.1   Small
28  ENST00000713581.1   Large
13  ENST00000374214.3   Small
48  ENST00000576834.2   Small
45  ENST00000571430.5   Small
6   ENST00000618887.2  Medium
36  ENST00000309922.7  Medium
23  ENST00000402774.8   Large
37  ENST00000313511.8  Medium

Incorrect Predictions:
Empty DataFrame
Columns: [Transcript_ID, Length]
Index: []


In [None]:
param_space = {
    'learning_rate': (0.01, 0.2),
    'n_estimators': (50, 200),
    'max_depth': (3, 5),
    'min_samples_leaf': (1, 10)
}
encoder = OneHotEncoder()
x_train_encoded = encoder.fit_transform(x_train['Base_Pairs'].str.extract(r'(\d+)'))
bayes_search = BayesSearchCV(gbm, param_space, n_iter=50, cv=3, scoring='accuracy', random_state=0)
bayes_search.fit(x_train_encoded, y_train)
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_
print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

Best Parameters: OrderedDict([('learning_rate', 0.06024151710706933), ('max_depth', 5), ('min_samples_leaf', 1), ('n_estimators', 122)])
Best Accuracy: 0.43162393162393164
