In [None]:
import numpy as np
import pandas as pd

import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

In [None]:
from google.colab import drive

# Directory on the mounted Drive that holds the raw acceptability-corpus TSV files.
path = '/content/drive/My Drive/capstoneproject/acceptability_corpus/cola_public/raw/'

# Mount Google Drive at /content/drive; force_remount=True re-mounts even when
# a previous mount already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the in-domain training split (tab-separated, no header row, so the
# columns are labelled 0..3).
train_file = path + 'in_domain_train.tsv'
train_df = pd.read_csv(train_file, sep="\t", header=None)

# Row/column counts, then the mean of column 1 (the column later used as the label).
print(train_df.shape)
print(train_df[1].mean())

# Peek at five random rows.
train_df.sample(5)

(8551, 4)
0.704362062916618


Unnamed: 0,0,1,2,3
3481,ks08,0,*,few equipment is available.
4340,ks08,0,*,John seems to rain.
107,cj99,1,,"I think that the more you eat, the less you want."
4929,ks08,0,*,Karen asked where for Washington to put the ch...
4133,ks08,1,,No John Smiths attended the meeting.


In [None]:
# Load the in-domain dev split, used as the validation set (tab-separated,
# no header row, so the columns are labelled 0..3).
val_file = path + 'in_domain_dev.tsv'
val_df = pd.read_csv(val_file, sep="\t", header=None)

# Row/column counts, then the mean of column 1 (the column later used as the label).
print(val_df.shape)
print(val_df[1].mean())

# Peek at five random rows.
val_df.sample(5)

(527, 4)
0.6925996204933587


Unnamed: 0,0,1,2,3
420,m_02,1,,The window was broken with a hammer.
101,r-67,1,,I talked to Winston about himself.
302,ks08,1,,How did you guess that he fixed the computer?
488,ad03,1,,There is a programme about Euripides on a Radi...
97,r-67,0,*,I won't have some money.


In [None]:
# Load the out-of-domain dev split, which this notebook treats as its test set
# (tab-separated, no header row, so the columns are labelled 0..3).
test_file = path + 'out_of_domain_dev.tsv'
test_df = pd.read_csv(test_file, sep="\t", header=None)

# Row/column counts, then the mean of column 1 (the column later used as the label).
print(test_df.shape)
print(test_df[1].mean())

# Peek at five random rows.
test_df.sample(5)

(516, 4)
0.686046511627907


Unnamed: 0,0,1,2,3
463,w_80,0,*,It was hated for John to leave.
332,swb04,0,*,We walks.
255,swb04,1,,List associates of each defendant who speaks S...
242,swb04,1,,Lee saw the student with a telescope.
404,swb04,1,,There is a unicorn in the garden.


In [None]:
import keras
from keras import layers

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# All three splits stacked into a single frame (the out-of-domain dev split
# plays the role of the test set).
combined_df = pd.concat([train_df, val_df, test_df])

# The TSV files were read with header=None, so columns are addressed by their
# integer position: column 3 holds the sentence text, column 1 the 0/1 label.
TEXT_COL, LABEL_COL = 3, 1
splits = (train_df, val_df, test_df)

# Sentences
X_train, X_val, X_test = (frame[TEXT_COL] for frame in splits)

# Labels
y_train, y_val, y_test = (frame[LABEL_COL] for frame in splits)

In [None]:
# TF-IDF features: the vocabulary and IDF weights are learned from the training
# sentences only; validation and test sentences are projected onto that same
# vocabulary.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(sentences) for sentences in (X_val, X_test)
)

In [None]:
# Sanity check: (number of training sentences, number of TF-IDF feature columns).
print("X_train_tfidf shape:", X_train_tfidf.shape)

X_train_tfidf shape: (8551, 5372)


In [None]:
# Convert the sparse TF-IDF matrices into plain (dense) NumPy arrays.
sparse_splits = (X_train_tfidf, X_val_tfidf, X_test_tfidf)
X_train_dense, X_val_dense, X_test_dense = (
    matrix.toarray() for matrix in sparse_splits
)

# Number of rows (sentences) in each split.
dense_splits = (X_train_dense, X_val_dense, X_test_dense)
num_samples_train, num_samples_val, num_samples_test = (
    array.shape[0] for array in dense_splits
)

In [None]:
# Presumably an intended sequence length; no other visible cell reads it.
num_timesteps = 1000

# Width of the TF-IDF matrix: one column per vocabulary term.
num_features = X_train_tfidf.shape[1]

print("Number of features (dimensions):", num_features)

Number of features (dimensions): 5372


In [None]:
# Shape of the sparse TF-IDF training matrix as (rows, columns).
X_train_tfidf.shape

(8551, 5372)

In [None]:
# Inspect the dense TF-IDF vector of the first training sentence.
X_train_dense[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Number of distinct terms in the vocabulary the vectorizer learned.
len(vectorizer.vocabulary_)

5372

In [None]:
import tensorflow as tf

In [None]:
# Bidirectional-LSTM binary classifier: embedding -> BiLSTM -> dense -> probability.
#
# NOTE(review): tf.keras.layers.Embedding looks rows up by *integer token id*,
# but the tf.data datasets in this notebook are built from the dense TF-IDF
# arrays (float weights, one column per vocabulary term). Fed that way, every
# "sequence" is as long as the whole vocabulary and the float weights are not
# token ids. TODO: feed padded integer token-id sequences instead.
vocab_size = len(vectorizer.vocabulary_)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        # mask_zero=True reserves index 0 for padding, so the embedding table
        # needs one row more than the vocabulary size.
        input_dim=vocab_size + 1,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # The model is compiled with loss='binary_crossentropy', which (by default)
    # expects probabilities rather than logits, so squash the output to [0, 1].
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
import tensorflow as tf

In [None]:
# Build the training dataset from the first 4000 examples only.
n_train_used = 4000
train_features = X_train_dense[:n_train_used]
train_labels = y_train[:n_train_used]
train_ds = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [None]:
# Group into mini-batches of 32, then prefetch with an automatically tuned buffer.
train_ds = train_ds.batch(32).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Validation dataset: slice (features, label) pairs, batch by 32, and prefetch
# with an automatically tuned buffer.
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val_dense, y_val))
    .batch(32)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy', # binary_crossentropy
              metrics=['accuracy'])

# Train for 5 epochs. validation_steps is left unset so each epoch is validated
# on the whole of val_ds; validation_steps=1 would score a single batch of 32
# examples, giving a very noisy validation metric.
history = model.fit(
    train_ds,
    epochs=5,
    validation_data=val_ds)


Epoch 1/5
 18/125 [===>..........................] - ETA: 25:35 - loss: 1.2725 - accuracy: 0.4688

KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Score the test split with the trained model. The network has a single output
# unit, so `predictions` holds one continuous score per sentence.
predictions = model.predict(X_test_dense)
y_score = np.asarray(predictions).ravel()

# Binary classification: threshold the scores at 0.5 to get hard 0/1 labels.
# (argmax over a single-column output would always return class 0.)
y_pred = (y_score > 0.5).astype(int)

# True labels of the test split as a NumPy array
y_true = np.asarray(y_test)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Compute classification report
class_report = classification_report(y_true, y_pred)
print("Classification Report:")
print(class_report)

# Compute area under ROC curve (AUC-ROC) from the continuous scores, not the
# thresholded labels
auc_roc = roc_auc_score(y_true, y_score)
print("Area Under ROC Curve (AUC-ROC):", auc_roc)

In [None]:
# Re-compile with Adam and binary cross-entropy, additionally tracking
# precision, recall and AUC alongside accuracy.
extra_metrics = [
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.AUC(),
]
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'] + extra_metrics,
)

In [None]:
# Train for 10 epochs. validation_steps is left unset so each epoch is validated
# on the whole of val_ds; validation_steps=1 would score a single batch only.
history = model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predictions on validation set.
# model.predict returns the network's continuous output scores (the model ends
# in a single-unit Dense layer), not hard 0/1 class labels.
val_predictions = model.predict(X_val_dense)



In [None]:
# Calculate accuracy on validation set.
# accuracy_score needs hard class labels, but val_predictions holds continuous
# scores; passing them directly raises "Classification metrics can't handle a
# mix of binary and continuous targets". Threshold at 0.5 first (the same
# cut-off Keras' 'accuracy' metric applies to a single-output binary model).
val_pred_labels = (np.asarray(val_predictions).ravel() > 0.5).astype(int)
val_accuracy = accuracy_score(y_val, val_pred_labels)
print("Validation Accuracy:", val_accuracy)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
# Predictions on test set
# NOTE(review): neither `svm_classifier` nor `X_test1_tfidf` is defined in any
# cell visible here (SVC is imported, but no classifier is instantiated or
# fitted in view). Unless they are created elsewhere, this cell fails with a
# NameError on a fresh kernel. TODO: confirm, or fit the SVM before this cell.
test_predictions = svm_classifier.predict(X_test_tfidf)
test1_predictions = svm_classifier.predict(X_test1_tfidf)