In [None]:
import numpy as np
import pandas as pd

import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime (force_remount=True is passed so an existing mount is redone).
drive.mount('/content/drive', force_remount=True)
# Folder containing the raw corpus TSV files. Keep the trailing slash: the
# loading cells build file names by plain string concatenation (path + name).
path = '/content/drive/My Drive/capstoneproject/acceptability_corpus/cola_public/raw/'

Mounted at /content/drive


In [None]:
# In-domain training split: tab-separated with no header row, so the columns
# are labelled 0..3 by position.
train_df = pd.read_csv(path + 'in_domain_train.tsv', sep='\t', header=None)
print(train_df.shape)
# Mean of column 1 (the column later used as the label).
print(train_df[1].mean())
train_df.sample(5)

(8551, 4)
0.704362062916618


Unnamed: 0,0,1,2,3
1990,r-67,1,,That anybody ever left at all is not known.
3573,ks08,1,,He left.
3953,ks08,1,,The sexual revolution makes some people uncomf...
385,bc01,1,,The question of whether John met Mary worries ...
6785,m_02,1,,"Mrs Bennet having taken the others upstairs, M..."


In [None]:
# In-domain dev split, used as the validation set. Same header-less TSV
# layout as the training file (columns labelled 0..3 by position).
val_df = pd.read_csv(path + 'in_domain_dev.tsv', sep='\t', header=None)
print(val_df.shape)
# Mean of column 1 (the column later used as the label).
print(val_df[1].mean())
val_df.sample(5)

(527, 4)
0.6925996204933587


Unnamed: 0,0,1,2,3
291,ks08,1,,"As a statesman, scarcely could he do anything ..."
7,cj99,1,,"Mary listens to the Grateful Dead, she gets de..."
185,l-93,1,,Linda taped the picture onto the wall.
299,ks08,1,,Who do you think Tom saw?
441,sks13,1,,Henri wants to buy which books about cooking?


In [None]:
# Out-of-domain dev split, used as the test set in this notebook. Same
# header-less TSV layout as the other files (columns labelled 0..3 by position).
test_df = pd.read_csv(path + 'out_of_domain_dev.tsv', sep='\t', header=None)
print(test_df.shape)
# Mean of column 1 (the column later used as the label).
print(test_df[1].mean())
test_df.sample(5)

(516, 4)
0.686046511627907


Unnamed: 0,0,1,2,3
358,swb04,1,,I met the person who left.
333,swb04,0,*,Few dog barked.
225,swb04,1,,Our friends like us.
379,swb04,0,*,Lou hoped the umbrella in the closet.
340,swb04,1,,The person responsible confessed.


In [None]:
import keras
from keras import layers

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Stack the three splits into a single frame (the out-of-domain dev set plays
# the role of the test split).
combined_df = pd.concat([train_df, val_df, test_df])

# The TSV files were read without a header, so columns are addressed by
# position rather than by name: column 3 holds the sentence text.
X_train, X_val, X_test = (split[3] for split in (train_df, val_df, test_df))

# Column 1 holds the label for each sentence.
y_train, y_val, y_test = (split[1] for split in (train_df, val_df, test_df))

In [None]:
# Display the training sentences (column 3 of train_df).
X_train

"One more pseudo generalization and I'm giving up."

In [None]:
# Feature extraction using TF-IDF
# The vectorizer is fitted on the training sentences only; the validation and
# test sentences are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Sanity check: (number of training sentences, number of TF-IDF features).
print("X_train_tfidf shape:", X_train_tfidf.shape)

X_train_tfidf shape: (8551, 5372)


In [None]:
# Convert each TF-IDF matrix to a dense array via .toarray().
X_train_dense, X_val_dense, X_test_dense = (
    matrix.toarray() for matrix in (X_train_tfidf, X_val_tfidf, X_test_tfidf)
)

# Row count of each split. Nothing is reshaped here; only the sizes are recorded.
num_samples_train = len(X_train_dense)
num_samples_val = len(X_val_dense)
num_samples_test = len(X_test_dense)

In [None]:
# Width of the training TF-IDF matrix (its second dimension).
num_features = X_train_tfidf.shape[1]
# NOTE(review): num_timesteps is not referenced by any cell visible here — confirm it is still needed.
num_timesteps = 1000

print("Number of features (dimensions):", num_features)

Number of features (dimensions): 5372


In [None]:
# Shape of the training TF-IDF matrix: (rows, features).
X_train_tfidf.shape

(8551, 5372)

In [None]:
# First row of the dense training TF-IDF array.
X_train_dense[0]

AttributeError: 'numpy.ndarray' object has no attribute 'unique_vals'

In [None]:
# Number of entries in the fitted vectorizer's vocabulary.
len(vectorizer.vocabulary_)

5372

In [None]:
import tensorflow as tf

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> dense head.
# NOTE(review): an Embedding layer looks up integer token ids. Confirm that the
# arrays passed to fit() are id sequences rather than TF-IDF weights.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        # mask_zero=True reserves id 0 for padding, so the lookup table needs
        # one more row than there are vocabulary entries.
        input_dim=len(vectorizer.vocabulary_) + 1,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # Sigmoid makes the output a probability, which is what the
    # 'binary_crossentropy' loss expects (it defaults to from_logits=False).
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
import tensorflow as tf

In [None]:
# Small debugging subset: the first 100 dense training rows paired with their labels.
train_ds = tf.data.Dataset.from_tensor_slices((X_train_dense[:100],y_train[:100]))

In [None]:
# Build the batched training pipeline from the source arrays in one step.
# Rebuilding here (instead of re-batching the existing `train_ds`) keeps this
# cell idempotent: running it twice no longer produces batches of batches.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train_dense[:100], y_train[:100]))
    .batch(32)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Validation pipeline: first 100 dense validation rows with their labels,
# grouped into batches of 32 and prefetched.
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val_dense[:100], y_val[:100]))
    .batch(32)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Configure training. 'binary_crossentropy' defaults to from_logits=False, so
# the model's last layer is expected to output probabilities.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Single-epoch run; validation_steps=1 limits validation to one batch of val_ds.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    validation_steps=1,
    epochs=1,
)




## Sentence Embeddings

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

# Load the pre-trained ELMo model from TensorFlow Hub.
# `signature` and `signature_outputs_as_dict` are constructor options of
# hub.KerasLayer; they are not accepted as keyword arguments when the layer is
# called, so they are set here.
elmo = hub.KerasLayer(
    "https://tfhub.dev/google/elmo/3",
    signature="default",
    signature_outputs_as_dict=True,
)


# Function to convert a sentence to its ELMo embedding
def get_elmo_embedding(sentence):
    """Return the ELMo "default" output for a single sentence.

    Args:
        sentence: The raw sentence as a Python string.

    Returns:
        The tensor stored under the "default" key of the model's output dict
        for a batch containing only `sentence`.
        NOTE(review): presumably a pooled, fixed-size sentence vector —
        confirm against the model card.
    """
    # The model consumes a batch of strings, so wrap the sentence in a
    # one-element string tensor.
    outputs = elmo(tf.constant([sentence]))
    return outputs["default"]



In [None]:
# Rebind `elmo` to the "default" signature of the loaded model; it is called
# directly on a string tensor and returns a dict of outputs.
elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures["default"]

x = ["Hello world!"]
# Take the "elmo" entry of the output dict (shape printed below).
embeddings = elmo(tf.constant(x))["elmo"]

print(embeddings.shape)
print(embeddings.numpy())

(1, 2, 1024)
[[[-9.44582224e-02 -1.31386292e+00 -8.98074687e-01 ... -3.92261386e-01
    2.20464543e-03 -4.63068485e-04]
  [ 3.34592372e-01 -7.10093439e-01  3.08495671e-01 ... -6.58163950e-02
    1.01716205e-01  4.98784930e-02]]]


In [None]:
# Same call with a batch of two sentences of different lengths.
# NOTE(review): the second output dimension presumably follows the longest
# sentence in the batch, with shorter ones padded — confirm.
x = ["Hello world!","hi"]
embeddings = elmo(tf.constant(x))["elmo"]

print(embeddings.shape)
print(embeddings.numpy())

(2, 2, 1024)
[[[-9.44579169e-02 -1.31386268e+00 -8.98074329e-01 ... -3.92261147e-01
    2.20451504e-03 -4.63033095e-04]
  [ 3.34592402e-01 -7.10093141e-01  3.08495581e-01 ... -6.58161566e-02
    1.01716325e-01  4.98787910e-02]]

 [[-4.79994237e-01 -4.48762253e-03 -1.34896442e-01 ... -2.20424011e-01
   -1.97931200e-01  1.28346235e-02]
  [-2.84083858e-02 -4.35321555e-02  4.13016342e-02 ...  2.58316752e-02
   -1.42983329e-02 -1.65042020e-02]]]


In [None]:
def get_elmo_embeds(sentences):
  """Embed a batch of sentences with the module-level `elmo` callable.

  Args:
    sentences: Strings in any form accepted by `tf.constant`.

  Returns:
    The tensor stored under the "elmo" key of the model's output dict.
  """
  return elmo(tf.constant(sentences))["elmo"]

In [None]:
# Embed only the first 10 training sentences.
X_train_embeds = get_elmo_embeds(X_train[:10])

In [None]:
# Shape of the embedding tensor for the 10-sentence batch.
X_train_embeds.shape

TensorShape([10, 13, 1024])