In [1]:
!pip install python-docx
from docx import Document
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
class Transcript:
    def __init__(self, transcript_id, base_pairs, length):
        self.transcript_id = transcript_id
        self.base_pairs = base_pairs
        self.length = length

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
Foldername = '/content/gdrive/My Drive/'
doc = Document(Foldername + 'Base Pairs.docx')
transcripts = []
for table in doc.tables:
    for row in table.rows[1:]:
        cells = [cell.text.strip() for cell in row.cells]
        transcript = Transcript(*cells)
        transcripts.append(transcript)

Mounted at /content/gdrive


In [3]:
df = pd.DataFrame([(t.transcript_id, t.base_pairs, t.length) for t in transcripts],
                  columns=['Transcript_ID', 'Base_Pairs', 'Length'])
label_encoder = LabelEncoder()
df['Encoded_Length'] = label_encoder.fit_transform(df['Length'])
X = df[['Transcript_ID', 'Base_Pairs']]
y = df['Encoded_Length']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)
all_columns = set(X_train_encoded.columns) | set(X_test_encoded.columns)
X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)
mlp = MLPClassifier(hidden_layer_sizes = (3,), max_iter = 10000, learning_rate = 'adaptive', random_state = 1)
mlp.fit(X_train_encoded, y_train)
y_pred = mlp.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred) * 100


In [4]:
results_df = pd.DataFrame({
    'Transcript_ID': X_test['Transcript_ID'],
    'True_Label': y_test,
    'Predicted_Label': y_pred
})

correct_predictions_df = results_df[results_df['True_Label'] == results_df['Predicted_Label']]
incorrect_predictions_df = results_df[results_df['True_Label'] != results_df['Predicted_Label']]

print("Correct Predictions:")
print(correct_predictions_df)
print("\nIncorrect Predictions:")
print(incorrect_predictions_df)
print("\nAccuracy on test set:", accuracy, "%")

Correct Predictions:
        Transcript_ID  True_Label  Predicted_Label
12  ENST00000374231.8           2                2
47  ENST00000576311.5           2                2
31  ENST00000548101.1           2                2
13  ENST00000374214.3           2                2
48  ENST00000576834.2           2                2
45  ENST00000571430.5           2                2
37  ENST00000313511.8           1                1

Incorrect Predictions:
        Transcript_ID  True_Label  Predicted_Label
39  ENST00000629095.1           1                2
9   ENST00000580840.1           1                2
28  ENST00000713581.1           0                2
6   ENST00000618887.2           1                2
36  ENST00000309922.7           1                2
23  ENST00000402774.8           0                2

Accuracy on test set: 53.84615384615385 %


In [None]:
param_grid = {
    'hidden_layer_sizes': [(3,), (5,), (3, 5), (2, 2), (5, 7), (3, 5, 7)],
}
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_encoded, y_train)
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Parameters: {'hidden_layer_sizes': (3,)}
Best Accuracy: 0.4321428571428571
