In [None]:
#| hide
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

from en_grammar_checker.config import Config

In [None]:
#| hide
# Instantiate the project configuration object.
# NOTE(review): `cnfg` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
cnfg = Config()

In [None]:
#| hide
# Relative path of the directory holding the raw CoLA TSV files. The trailing
# slash matters: file names are appended to this string directly.
dataset_path = "../data/cola_public/raw/"

In [None]:
#| hide
# Echo the path as a quick sanity check.
dataset_path

'../data/cola_public/raw/'

In [None]:
#| hide
# Load the in-domain training split for exploration. The TSV is read without a
# header row, so the four column names are supplied explicitly.
df_train = pd.read_csv(
    dataset_path + "in_domain_train.tsv",
    sep="\t",
    names=["sentence_source", "label", "label_notes", "sentence"],
    header=None,
)

In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,sentence_source,label,label_notes,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [None]:
# The frame preview truncates long strings, so show the first sentence in full.
df_train["sentence"].iloc[0]

"Our friends won't buy this analysis, let alone the next one we propose."

In [None]:
# Number of sentences contributed by each source code.
df_train["sentence_source"].value_counts()

sentence_source
ks08      1745
l-93      1294
r-67       916
ad03       852
c_13       778
bc01       772
sks13      573
m_02       378
b_73       238
cj99       210
d_98       162
sgww85     139
rhl07      134
gj04        99
g_81        99
b_82        83
kl93        79
Name: count, dtype: int64

In [None]:
# Column dtypes and non-null counts for the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8551 entries, 0 to 8550
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sentence_source  8551 non-null   object
 1   label            8551 non-null   int64 
 2   label_notes      2527 non-null   object
 3   sentence         8551 non-null   object
dtypes: int64(1), object(3)
memory usage: 267.3+ KB


In [None]:
# Distribution of the `label` column.
df_train["label"].value_counts()

label
1    6023
0    2528
Name: count, dtype: int64

In [None]:
# Distribution of period-delimited piece counts per sentence. Splitting on "."
# yields (number of periods + 1) pieces, so a sentence that ends with a single
# period counts as 2.
df_train["sentence"].str.split(".").str.len().value_counts()

sentence
2    7538
1     990
3      20
4       3
Name: count, dtype: int64

In [None]:
# df_train.sentence.apply(lambda x: len(x.split('.')))

In [None]:
# Summary statistics of the number of single-space-separated tokens per sentence.
df_train["sentence"].str.split(" ").str.len().describe()

count    8551.000000
mean        7.696059
std         3.622946
min         2.000000
25%         5.000000
50%         7.000000
75%         9.000000
max        42.000000
Name: sentence, dtype: float64

## Prepare new data: merge CoLA with the GED dataset

In [None]:
#| hide
# Load the train and validation CSVs of the second (GED) corpus.
# Presumably these files expose `input` and `labels` columns, which the next
# cell renames — verify against the files.
train_data_2 = pd.read_csv("../data/ged_data/train_data.csv")
val_data_2 = pd.read_csv("../data/ged_data/val_data.csv")

In [None]:
#| hide
# Rename the GED columns to the `sentence` / `label` names used by the CoLA frames.
ged_column_map = {"input": "sentence", "labels": "label"}
train_data_2 = train_data_2.rename(columns=ged_column_map)
val_data_2 = val_data_2.rename(columns=ged_column_map)

# Pool both GED splits and shuffle the rows with a fixed seed so the order is
# reproducible; the index is rebuilt afterwards.
ged_pooled = pd.concat([train_data_2, val_data_2])
data_2_df = ged_pooled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#| hide
# Shapes of the two GED splits and of their shuffled union.
train_data_2.shape, val_data_2.shape, data_2_df.shape

((19998, 2), (10000, 2), (29998, 2))

In [None]:
# Peek at the first row of the pooled GED data.
data_2_df.head(1)

Unnamed: 0,sentence,label
0,It was really delicious ! !,1


In [None]:
#| hide
# Load the CoLA in-domain training split. The TSV is read without a header row,
# so the column names are supplied explicitly. The path is built from
# `dataset_path` so the raw-data location is defined in exactly one place.
train_data_1 = pd.read_csv(
    f"{dataset_path}in_domain_train.tsv",
    delimiter="\t",
    header=None,
    names=["sentence_source", "label", "label_notes", "sentence"],
)

In [None]:
#| hide
# Load the CoLA in-domain dev split. The TSV is read without a header row, so
# the column names are supplied explicitly. The path is built from
# `dataset_path` so the raw-data location is defined in exactly one place.
val_data_1 = pd.read_csv(
    f"{dataset_path}in_domain_dev.tsv",
    delimiter="\t",
    header=None,
    names=["sentence_source", "label", "label_notes", "sentence"],
)

In [None]:
#| hide
# Load the CoLA out-of-domain dev split. The TSV is read without a header row,
# so the column names are supplied explicitly. The path is built from
# `dataset_path` so the raw-data location is defined in exactly one place.
test_data_1 = pd.read_csv(
    f"{dataset_path}out_of_domain_dev.tsv",
    delimiter="\t",
    header=None,
    names=["sentence_source", "label", "label_notes", "sentence"],
)

In [None]:
#| hide
# Keep only the text and target columns from each CoLA split.
kept_columns = ["sentence", "label"]
train_data_1 = train_data_1[kept_columns]
val_data_1 = val_data_1[kept_columns]
test_data_1 = test_data_1[kept_columns]

# Pool the in-domain train and dev splits and shuffle with a fixed seed.
# `test_data_1` is deliberately left out of this pool.
cola_pooled = pd.concat([train_data_1, val_data_1])
data_1_df = cola_pooled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#| hide
# Shape of the pooled CoLA data.
data_1_df.shape

(9078, 2)

In [None]:
#| hide
# First two rows of the pooled CoLA data.
data_1_df.head(2)

Unnamed: 0,sentence,label
0,The only offer of which I plan to take advanta...,1
1,I would like to might do it,0


In [None]:
#| hide
# First two rows of the pooled GED data, for comparison with the CoLA preview.
data_2_df.head(2)

Unnamed: 0,sentence,label
0,It was really delicious ! !,1
1,I think my wrritten English is Chinglish .,0


In [None]:
#| hide
# Merge the CoLA and GED pools into one frame, reshuffle with a fixed seed, and
# rebuild the index.
merged_pool = pd.concat([data_1_df, data_2_df])
data_df = merged_pool.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#| hide
# Shape of the merged CoLA + GED data.
data_df.shape

(39076, 2)

In [None]:
#| hide
# Hold out 10% of the merged data for testing, using a fixed seed.
train_df, test_df = train_test_split(data_df, test_size=0.1, random_state=42)
final_train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Seeded like every other shuffle in this notebook so that the row order of the
# saved test set is reproducible across runs (the set of rows is unaffected).
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# The split must partition the merged data exactly.
assert len(final_train_df) + len(test_df) == len(data_df)

In [None]:
#| hide
# Append the out-of-domain CoLA dev split to the held-out rows, then shuffle
# with a fixed seed and rebuild the index.
test_pool = pd.concat([test_df, test_data_1])
final_test_df = test_pool.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#| hide
# Inspect the final training frame (pandas shows a truncated view).
final_train_df

Unnamed: 0,sentence,label
0,"At the time , Japan suffered from chronic ener...",1
1,But I am not sad .,1
2,What is the different between relevant to and ...,0
3,Medea wondered if that the potion was ready,0
4,Then I went a pharmacy to buy medicine for a c...,0
...,...,...
35163,But I ca not write English well and speak .,0
35164,We had happily listened to all the latest news .,1
35165,"overall , i am convinced that governments shou...",0
35166,Retrace this year,0


In [None]:
#| hide
# Inspect the final test frame (pandas shows a truncated view).
final_test_df

Unnamed: 0,sentence,label
0,"If I have things that I want to do , I have to...",1
1,I am hungry .,1
2,The mouse nibbled the cheese.,1
3,I think that is what is important .,1
4,Someone in the residents is a flower designer .,1
...,...,...
4419,And the dances were in the orbit of the planets .,1
4420,Here is Sydney and it is possible that this ki...,1
4421,massive earthquake,0
4422,I understand it just a little .,0


In [None]:
#| hide
# Persist the training split without the index column.
# `to_csv` does not create missing parent directories, so make sure the output
# directory exists first (e.g. on a fresh checkout).
train_csv_path = Path("../data/merged_data/train_data.csv")
train_csv_path.parent.mkdir(parents=True, exist_ok=True)
final_train_df.to_csv(train_csv_path, index=False)

In [None]:
#| hide
# Persist the test split without the index column.
# `to_csv` does not create missing parent directories, so make sure the output
# directory exists first (e.g. on a fresh checkout).
test_csv_path = Path("../data/merged_data/test_data.csv")
test_csv_path.parent.mkdir(parents=True, exist_ok=True)
final_test_df.to_csv(test_csv_path, index=False)