# CoLA (Corpus of Linguistic Acceptability)

In [47]:
import os
import numpy as np
import pandas as pd


# Paths and Variables

In [48]:
# Short dataset identifier; reused below to build the output directory and the output file names.
dataset_name = 'cola'

In [49]:
# Folder the raw TSV files are read from.
input_dir = './data'

# Folder the processed files are written to; create it up front if it is missing.
output_dir = f'./../../processed/{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)

# Path of the plain (unzipped) CSV for the full dataset.
outp_fname = os.path.join(output_dir, dataset_name + '.csv')

In [50]:
# The in-domain training file is used as the training split.
train_fname = os.path.join(input_dir, 'in_domain_train.tsv')
# The in-domain dev file is used as the test split in this notebook.
test_fname = os.path.join(input_dir, 'in_domain_dev.tsv')

In [51]:
# Names given to the two fields kept from each TSV file (label first, then sentence).
cols = ["acceptability", "text"]

# Read data into a DataFrame

In [52]:
# The TSV has no header row; keep only field 1 (label) and field 3 (sentence)
# and give them the names listed in `cols`.
train_data = (
    pd.read_csv(train_fname, sep='\t', header=None)
    .loc[:, [1, 3]]
    .set_axis(cols, axis=1)
)
print(train_data.shape)
train_data.head()

(8551, 2)


Unnamed: 0,acceptability,text
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [53]:
# Same layout as the training file: no header row, label in field 1, sentence in field 3.
test_data = (
    pd.read_csv(test_fname, sep='\t', header=None)
    .loc[:, [1, 3]]
    .set_axis(cols, axis=1)
)
print(test_data.shape)
test_data.head()

(527, 2)


Unnamed: 0,acceptability,text
0,1,The sailors rode the breeze clear of the rocks.
1,1,The weights made the rope stretch over the pul...
2,1,The mechanical doll wriggled itself loose.
3,1,"If you had eaten more, you would want less."
4,0,"As you eat the most, you want the least."


In [54]:
# Column names used throughout the rest of the notebook:
# row identifier, label to predict, and the sentence itself.
id_col, target_col, text_col = "id", "acceptability", "text"

In [55]:
# Number of training rows per label value.
train_data.loc[:, target_col].value_counts()

acceptability
1    6023
0    2528
Name: count, dtype: int64

# Prepare Data

### Drop NaN rows

In [56]:
# Discard every row that has a missing value in any column, in both splits.
train_data, test_data = (
    train_data.dropna(axis=0, how='any'),
    test_data.dropna(axis=0, how='any'),
)

# Insert ID Column

In [57]:
# Add a leading id column to each split unless one is already present.
# Training ids run 0..n_train-1; test ids continue from n_train so the two
# ranges never overlap.
n_train = len(train_data)

if id_col not in train_data.columns:
    train_data.insert(0, id_col, np.arange(n_train))
    print(train_data.head())

if id_col not in test_data.columns:
    n_test = len(test_data)
    test_data.insert(0, id_col, np.arange(n_train, n_train + n_test))
    print(test_data.head())

   id  acceptability                                               text
0   0              1  Our friends won't buy this analysis, let alone...
1   1              1  One more pseudo generalization and I'm giving up.
2   2              1   One more pseudo generalization or I'm giving up.
3   3              1     The more we study verbs, the crazier they get.
4   4              1          Day by day the facts are getting murkier.
     id  acceptability                                               text
0  8551              1    The sailors rode the breeze clear of the rocks.
1  8552              1  The weights made the rope stretch over the pul...
2  8553              1         The mechanical doll wriggled itself loose.
3  8554              1        If you had eaten more, you would want less.
4  8555              0           As you eat the most, you want the least.


# Shuffle Data

In [58]:
# Shuffle the training rows. frac=1 keeps every row and the fixed random_state
# makes the resulting order reproducible. The index is not reset, so each row
# keeps its original index label.
train_data = train_data.sample(frac=1, random_state=42)
print(train_data.shape)
train_data.head()

(8551, 3)


Unnamed: 0,id,acceptability,text
2389,2389,1,Angela characterized Shelly as a lifesaver.
5048,5048,1,They're not finding it a stress being in the s...
3133,3133,0,Paul exhaled on Mary.
5955,5955,0,I ordered if John drink his beer.
625,625,1,Press the stamp against the pad completely.


# Utility to Save a DataFrame as a Zipped CSV File

In [59]:
def save_df_to_zipped_csv(df, ftype=None, *, out_dir=None, name=None):
    """Write ``df`` as a CSV file inside a zip archive.

    The archive is named ``<name>[_<ftype>].zip`` and contains a single member
    named ``<name>[_<ftype>].csv``. The DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    ftype : str, optional
        Tag appended to the base name after an underscore (for example
        ``"train"``). When ``None`` no suffix is added.
    out_dir : str, optional
        Directory the archive is written to. Defaults to the notebook-level
        ``output_dir``. The directory is created if it does not exist.
    name : str, optional
        Base name of the archive and its CSV member. Defaults to the
        notebook-level ``dataset_name``.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing calls such as ``save_df_to_zipped_csv(df, "train")`` still work.
    base_name = dataset_name if name is None else name
    target_dir = output_dir if out_dir is None else out_dir

    suffix = '' if ftype is None else f'_{ftype}'
    stem = f'{base_name}{suffix}'

    # An empty directory string means "current directory"; nothing to create then.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # archive_name sets the member name inside the zip; without it the member
    # would be named after the .zip file itself.
    compression_opts = dict(method='zip', archive_name=f'{stem}.csv')
    df.to_csv(
        os.path.join(target_dir, f'{stem}.zip'),
        index=False,
        compression=compression_opts,
    )

In [60]:
# Full dataset: the shuffled training rows followed by the (still labelled) test rows.
data = pd.concat((train_data, test_data), axis=0)

# Answer key for the test split (id + label), copied before the label is removed.
test_key = test_data.loc[:, [id_col, target_col]].copy()

# From here on the test split carries no label column.
test_data = test_data.drop(columns=target_col)

# Save Main Data File

In [61]:
# The combined dataset is written only as a zipped CSV; the plain CSV at
# `outp_fname` is intentionally not produced by this cell.
save_df_to_zipped_csv(data)

In [62]:
# Write each split to its own zipped CSV; the tag becomes the file-name suffix.
for split_frame, split_tag in (
    (train_data, "train"),
    (test_data, "test"),
    (test_key, "test_key"),
):
    save_df_to_zipped_csv(split_frame, split_tag)