# Import Libraries

In [38]:
# data handling (pandas is a third-party package, not the standard library)
import pandas as pd

# drive access: mount Google Drive so the files under /content/drive can be read/written
from google.colab import drive
drive.mount('/content/drive')

# train/val split
from sklearn.model_selection import train_test_split

# for visualization
# NOTE(review): plt / sns are not used in the cells visible here — confirm they are needed
import matplotlib.pyplot as plt
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load CoLA dataset (n=9594 sentences, split into train/dev)

- **raw**:
  - in domain train (n=8551) / dev set (n=527)
  - out of domain dev set (n=516)

**NOTE: The original CoLA paper reports more than 10K sentences, but that count includes the held-out test set, which is only available through the Kaggle competitions. The 'test' set in this notebook is actually the validation (dev) set described in the paper.**

# Load the datasets

In [39]:
# Read the in-domain training split. Column names are supplied explicitly,
# so the first line of the TSV is treated as data rather than a header.
train = pd.read_csv(
    '/content/drive/MyDrive/in_domain_train.tsv',
    sep='\t',
    header=None,
    names=['source', 'acceptability', 'authors_acceptability', 'sentence'],
)

train.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [40]:
# Read the in-domain dev split. Column names are supplied explicitly,
# so the first line of the TSV is treated as data rather than a header.
test_in = pd.read_csv(
    '/content/drive/MyDrive/in_domain_dev.tsv',
    sep='\t',
    header=None,
    names=['source', 'acceptability', 'authors_acceptability', 'sentence'],
)

test_in.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence
0,gj04,1,,The sailors rode the breeze clear of the rocks.
1,gj04,1,,The weights made the rope stretch over the pul...
2,gj04,1,,The mechanical doll wriggled itself loose.
3,cj99,1,,"If you had eaten more, you would want less."
4,cj99,0,*,"As you eat the most, you want the least."


In [41]:
# Read the out-of-domain dev split. Column names are supplied explicitly,
# so the first line of the TSV is treated as data rather than a header.
test_out = pd.read_csv(
    '/content/drive/MyDrive/out_of_domain_dev.tsv',
    sep='\t',
    header=None,
    names=['source', 'acceptability', 'authors_acceptability', 'sentence'],
)

test_out.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence
0,clc95,1,,Somebody just left - guess who.
1,clc95,1,,"They claimed they had settled on something, bu..."
2,clc95,1,,"If Sam was going, Sally would know where."
3,clc95,1,,"They're going to serve the guests something, b..."
4,clc95,1,,She's reading. I can't imagine what.


## Add a column indicating whether each sentence is in or out of domain

In [42]:
# Mark every training row as in-domain.
train = train.assign(domain='IN')

train.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"Our friends won't buy this analysis, let alone...",IN
1,gj04,1,,One more pseudo generalization and I'm giving up.,IN
2,gj04,1,,One more pseudo generalization or I'm giving up.,IN
3,gj04,1,,"The more we study verbs, the crazier they get.",IN
4,gj04,1,,Day by day the facts are getting murkier.,IN


In [43]:
# Mark every in-domain dev row as in-domain.
test_in = test_in.assign(domain='IN')

test_in.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,The sailors rode the breeze clear of the rocks.,IN
1,gj04,1,,The weights made the rope stretch over the pul...,IN
2,gj04,1,,The mechanical doll wriggled itself loose.,IN
3,cj99,1,,"If you had eaten more, you would want less.",IN
4,cj99,0,*,"As you eat the most, you want the least.",IN


In [44]:
# Mark every out-of-domain dev row as out-of-domain.
test_out = test_out.assign(domain='OUT')

test_out.head(5)

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,clc95,1,,Somebody just left - guess who.,OUT
1,clc95,1,,"They claimed they had settled on something, bu...",OUT
2,clc95,1,,"If Sam was going, Sally would know where.",OUT
3,clc95,1,,"They're going to serve the guests something, b...",OUT
4,clc95,1,,She's reading. I can't imagine what.,OUT


## Merge test_in and test_out to create test

In [45]:
# Stack the in-domain and out-of-domain dev sets into one test frame.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps
# its own 0-based labels and the combined frame carries duplicate index
# labels, which makes label-based lookups (e.g. test.loc[0]) ambiguous.
test = pd.concat([test_in, test_out], ignore_index=True)

test.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,The sailors rode the breeze clear of the rocks.,IN
1,gj04,1,,The weights made the rope stretch over the pul...,IN
2,gj04,1,,The mechanical doll wriggled itself loose.,IN
3,cj99,1,,"If you had eaten more, you would want less.",IN
4,cj99,0,*,"As you eat the most, you want the least.",IN


In [46]:
# Sanity check: the merged frame must contain every row from both dev sets.
assert test.shape[0] == test_in.shape[0] + test_out.shape[0]

# Clean dataset

## Drop unwanted columns

In [47]:
# Remove the source and author-annotation columns from both frames.
# Reassigning (rather than inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time is a no-op
# instead of raising KeyError on the already-dropped columns.
unwanted_columns = ['source', 'authors_acceptability']
train = train.drop(columns=unwanted_columns, errors='ignore')
test = test.drop(columns=unwanted_columns, errors='ignore')

In [48]:
# Preview the first five rows of the cleaned training frame.
train.head(5)

Unnamed: 0,acceptability,sentence,domain
0,1,"Our friends won't buy this analysis, let alone...",IN
1,1,One more pseudo generalization and I'm giving up.,IN
2,1,One more pseudo generalization or I'm giving up.,IN
3,1,"The more we study verbs, the crazier they get.",IN
4,1,Day by day the facts are getting murkier.,IN


In [49]:
# Preview the first five rows of the cleaned test frame.
test.head(5)

Unnamed: 0,acceptability,sentence,domain
0,1,The sailors rode the breeze clear of the rocks.,IN
1,1,The weights made the rope stretch over the pul...,IN
2,1,The mechanical doll wriggled itself loose.,IN
3,1,"If you had eaten more, you would want less.",IN
4,0,"As you eat the most, you want the least.",IN


In [50]:
# Number of rows in the training frame.
train.shape[0]

8551

In [51]:
# Fraction of training rows labelled acceptable (acceptability == 1).
train.query('acceptability == 1').shape[0] / train.shape[0]

0.704362062916618

In [52]:
# Number of rows in the merged test frame.
test.shape[0]

1043

In [54]:
# Fraction of test rows labelled acceptable (acceptability == 1).
test.query('acceptability == 1').shape[0] / test.shape[0]

0.6893576222435283

# Train:validation split

In [55]:
# Hold out 20% of the in-domain training data as a validation set.
# NOTE(review): the split is not stratified on 'acceptability', so the class
# ratio differs slightly between the two parts — confirm this is intended.
training, validation = train_test_split(
    train,
    test_size=0.2,
    random_state=1234,  # fixed seed so the split is reproducible
)

In [56]:
# Preview the first five rows of the training part of the split.
training.head(5)

Unnamed: 0,acceptability,sentence,domain
5007,1,To please John is tough.,IN
7480,1,John did not like Mary.,IN
5454,0,John is not more reliable a fellow than Bill.,IN
2175,1,Joan knew the answer.,IN
3170,1,The baby dressed.,IN


In [57]:
# Number of rows in the training part of the split.
training.shape[0]

6840

In [58]:
# Fraction of training-part rows labelled acceptable (acceptability == 1).
training.query('acceptability == 1').shape[0] / training.shape[0]

0.702046783625731

In [59]:
# Preview the first five rows of the validation part of the split.
validation.head(5)

Unnamed: 0,acceptability,sentence,domain
4749,1,Which man did you talk to?,IN
7987,1,What she thought was that the poison was neutr...,IN
3851,1,The teacher made students happy.,IN
8430,0,I have sent 0 letter to Environmental Heath,IN
7780,0,We believed to be omnipotent.,IN


In [60]:
# Number of rows in the validation part of the split.
validation.shape[0]

1711

In [61]:
# Fraction of validation rows labelled acceptable (acceptability == 1).
validation.query('acceptability == 1').shape[0] / validation.shape[0]

0.7136177673874927

# Save files

In [62]:
# Write the three splits to Drive as CSV, without the index column.
for split_frame, csv_path in (
    (training, '/content/drive/MyDrive/cola_raw_unbalanced_train.csv'),
    (validation, '/content/drive/MyDrive/cola_raw_unbalanced_val.csv'),
    (test, '/content/drive/MyDrive/cola_raw_unbalanced_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)