# Dataset Statistics Table 2 

In [8]:
import pandas as pd
import os
def load_conll_dataset_to_data_frame(file_name, split_name='no_split', delimiter='\t', encoding='utf-8'):
    """Read a two-column ``token<delimiter>label`` file into a DataFrame.

    Every non-blank line becomes one row. A blank (whitespace-only) line
    closes the current sentence by incrementing the sentence counter, so
    consecutive blank lines leave gaps in ``sent_id`` but never merge two
    sentences under the same id.

    Parameters
    ----------
    file_name : str
        Path of the file to read.
    split_name : str, default 'no_split'
        Value stored in the ``split`` column of every row.
    delimiter : str, default '\\t'
        Separator between the token and the label on a line.
    encoding : str, default 'utf-8'
        Text encoding used to open the file; set explicitly so the result
        does not depend on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns ``sent_id`` (counter starting at 1), ``token``, ``label``
        and ``split``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields; the
        message names the file and the 1-based line number.
    """
    records = []
    sent_id = 1
    # Iterate over the file handle instead of readlines() so the raw text is
    # not held in memory alongside the parsed records.
    with open(file_name, "r", encoding=encoding) as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                sent_id += 1
                continue
            fields = line.split(delimiter)
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 2 fields separated by {!r}, got {}: {!r}".format(
                        file_name, line_no, delimiter, len(fields), line))
            token, label = fields
            records.append((sent_id, token, label, split_name))

    return pd.DataFrame.from_records(records, columns=['sent_id', 'token', 'label', 'split'])

In [23]:
# Root folder that holds one sub-directory per dataset.
path_to_data = "/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/data/"
datasets = [
    'ATIS',
    'MIT_Restaurant',
    'MIT_Movie',
    'CONLL_2003_NER',
    'OntoNotes_NW',
]

# One entry per dataset, initialised to None.
data = dict.fromkeys(datasets)
# Sanity check: show the directory resolved for the first dataset.
os.path.join(path_to_data, datasets[0])

'/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/data/ATIS'

In [24]:
for dataset in datasets:
    # OntoNotes_NW is read with a space delimiter; every other dataset with a tab.
    delimiter = ' ' if dataset == 'OntoNotes_NW' else '\t'
    # Load the three splits in train / dev / test order; the training file
    # uses the "train.txt.ori" name, the other two plain "<split>.txt".
    df_train, df_dev, df_test = (
        load_conll_dataset_to_data_frame(
            os.path.join(path_to_data, dataset, split_file),
            split,
            delimiter=delimiter,
        )
        for split, split_file in (("train", "train.txt.ori"), ("dev", "dev.txt"), ("test", "test.txt"))
    )

    data[dataset] = pd.concat([df_train, df_dev, df_test])

## Dataset properties
#### Number of sentences in train/dev/test

In [27]:
for dataset in datasets:
    df = data[dataset]
    # Count distinct sentence ids separately for each split.
    nb_train_sent, nb_dev_sent, nb_test_sent = (
        df.loc[df['split'] == split, 'sent_id'].nunique()
        for split in ('train', 'dev', 'test')
    )
    print("Dataset {:15} :  {:5} / {:5} / {:5} ".format(dataset, nb_train_sent, nb_dev_sent, nb_test_sent))

Dataset ATIS            :   4478 /   500 /   893 
Dataset MIT_Restaurant  :   6128 /  1532 /  1521 
Dataset MIT_Movie       :   7820 /  1955 /  2443 
Dataset CONLL_2003_NER  :  14987 /  3466 /  3684 
Dataset OntoNotes_NW    :  34970 /  5896 /  2327 


#### Number of unique slots in the training set

In [28]:
for dataset in datasets:
    df = data[dataset]
    # Keep training rows whose label starts with 'B-', then count the
    # distinct labels among them.
    filtered_df = df.loc[(df['split'] == 'train') & df['label'].str.startswith('B-')]
    nb_unique_slot = filtered_df['label'].nunique()
    print("Dataset {:15} : {:2} unique slots in the training set".format(dataset, nb_unique_slot))

Dataset ATIS            : 79 unique slots in the training set
Dataset MIT_Restaurant  :  8 unique slots in the training set
Dataset MIT_Movie       : 12 unique slots in the training set
Dataset CONLL_2003_NER  :  4 unique slots in the training set
Dataset OntoNotes_NW    : 18 unique slots in the training set


#### Number of unique tokens in the training set

In [39]:
for dataset in datasets:
    df = data[dataset]
    df_train = df.loc[df['split'] == 'train']
    # Tokens are lower-cased before counting, so the count is case-insensitive.
    print("Dataset {:15} : {:5} unique tokens in the training set".format(
        dataset, df_train['token'].str.lower().nunique(dropna=False)))


Dataset ATIS            :   869 unique tokens in the training set
Dataset MIT_Restaurant  :  3385 unique tokens in the training set
Dataset MIT_Movie       :  5953 unique tokens in the training set
Dataset CONLL_2003_NER  : 21010 unique tokens in the training set
Dataset OntoNotes_NW    : 34662 unique tokens in the training set


## Table 4 Overall Results

<img src="scai/Table_2_SCAI.png">

##### Results directories for each experiment (30 epochs):
### STL
- STL ATIS  : results/SingleTask_ATIS_Full/performance.out
- STL MIT-R : results/SingleTask_MIT_Restaurant_Full/performance.out
- STL MIT-M : results/SingleTask_MIT_Movie_Full/performance.out

### MTL, same supervision level
- MTL ATIS, Most Similar, NO NER : SCRATCH_2_4
- MTL MIT-R, Most Similar, NO NER : results_emnlp/MIT_Restaurant_Most_Similar/performance.out
- MTL MIT-M, Most Similar, NO NER : results_emnlp/MIT_Movie_Most_Similar/performance.out
- MTL ATIS, ALL, NO NER :  results_emnlp/ATIS_ALL_NO_NER/performance.out 
- MTL MIT Restaurant, ALL, NO NER : results_emnlp/MIT_Restaurant_ALL_NO_NER/performance.out
- MTL MIT Movie, ALL, NO NER : results_emnlp/MIT_Movie_ALL_NO_NER/performance.out
- MTL ATIS, Most similar, NER :  results_emnlp/ATIS_MOST_SIMILAR_NER_SAME_LEVEL/performance.out
- MTL MIT-R, Most Similar, NER : results_emnlp/MIT_Restaurant_MOST_SIMILAR_NER_SAME_LEVEL/performance.out 
- MTL MIT-M, Most Similar, NER : results_emnlp/MIT_Movie_MOST_SIMILAR_NER_SAME_LEVEL/performance.out 
- MTL ATIS, ALL, NER :  results_emnlp/SCRATCH_2_4/performance.out
- MTL MIT-R, ALL, NER : results_emnlp/MIT_Restaurant_ALL/performance.out (confusing)
- MTL MIT-Movie, ALL, NER : results_emnlp/MIT_Movie_ALL/performance.out
- MTL ATIS, NER only : results_emnlp/ATIS_NER_ONLY_SAME_LEVEL/performance.out 95.71
- MTL MIT-R, NER only : results_emnlp/MIT_Restaurant_NER_ONLY_SAME_LEVEL/performance.out 
- MTL MIT-M, NER only : results_emnlp/MIT_Movie_NER_ONLY_SAME_LEVEL/performance.out 


### MTL, different supervision level
- MTL ATIS, Most Similar, NER :  results_emnlp/ATIS_MOST_SIMILAR_NER_LOWER_LEVEL/performance.out
- MTL MIT-Restaurant, Most Similar, NER : results_emnlp/MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL/performance.out
- MTL MIT Movie, Most Similar, NER : results_emnlp/MIT_Movie_MOST_SIMILAR_NER_LOWER_LEVEL/performance.out
- MTL ATIS, ALL, NER : results_emnlp/ATIS_ALL_NER_LOWER_LEVEL/performance.out 
- MTL MIT-R, ALL, NER : results_emnlp/MIT_Restaurant_ALL_NER_LOWER_LEVEL/performance.out
- MTL MIT-M, ALL, NER : results_emnlp/MIT_Movie_ALL_NER_LOWER_LEVEL/performance.out
- MTL ATIS, NER only : results_emnlp/ATIS_NER_ONLY_DIFFERENT_LEVEL/performance.out
- MTL MIT-R, NER only : results_emnlp/MIT_Restaurant_NER_ONLY_DIFFERENT_LEVEL/performance.out
- MTL MIT-M, NER only : results_emnlp/MIT_Movie_NER_ONLY_DIFFERENT_LEVEL/performance.out



## Command for STL baseline
Example for ATIS :
```python Train_STL_Sequence_Tagging.py -ro reproduce -d STL_ATIS -i input/ATIS -p params/STL_Default_Param -e 25``` (**CHECKED**)

## Command for MTL same supervision level
- Most similar, no NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy most_similar -ro reproduce  -d ATIS_MOST_SIMILAR_NO_NER -e 25 -p params/MTL_Default_Param```  


- All, no NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce  -d ATIS_ALL_NO_NER -e 25 -p params/MTL_Default_Param```  


- Most similar, NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy most_similar -ro reproduce  -d ATIS_MOST_SIMILAR_NER_SAME_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```  


- All, NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce -d ATIS_ALL_NER_SAME_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```  


- NER only :  
```python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy none -ro reproduce  -d ATIS_NER_ONLY_SAME_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```

## Command for MTL different supervision level

- Most similar, NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy most_similar -ro reproduce -diff-level 1 -d ATIS_MOST_SIMILAR_NER_DIFFERENT_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```  


- All, NER :  
``` python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce -diff-level 1 -d ATIS_ALL_NER_DIFFERENT_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```  


- NER only :  
```python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy none -ro reproduce -diff-level 1 -d ATIS_NER_ONLY_DIFFERENT_LEVEL -ner 1  -e 25 -p params/MTL_Default_Param```

# Table 5 Label Embedding
<img src="scai/Table_3_SCAI.png">

- MTL + Label Embedding :  
    - ATIS :  
    ```/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results_emnlp/ATIS_ALL_NER_LOWER_LEVEL_LABEL_EMBEDDING ```
    - MIT-R :  
    ```/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results_emnlp/MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL_LABEL_EMBEDDING ```  
    - MIT-M :  
    ```/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results_emnlp/MIT_Movie_ALL_LABEL_EMBEDDING ```

### Command

```python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro results_emnlp -diff-level 1 -d ATIS_ALL_NER_LOWER_LEVEL_LABEL_EMBEDDING -ner 1 -label-embedding label_embedding -e 25 -p params/MTL_Default_Param```

```python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Restaurant  -strategy most_similar -ro results_emnlp -diff-level 1 -d MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL_LABEL_EMBEDDING -ner 1 -label-embedding label_embedding -e 25 -p params/MTL_Default_Param```

```python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Movie  -strategy all -ro results_emnlp  -d MIT_Movie_ALL_LABEL_EMBEDDING -label-embedding label_embedding -e 25 -p params/MTL_Default_Param```

# Low Resource Scenario
<img src="scai/SCAI_Low_Resource.png">

#### Result directories :
SingleTask_ATIS_200/performance.out  
SingleTask_ATIS_400/performance.out  
SingleTask_ATIS_800/performance.out   
SingleTask_MIT_Restaurant_200/performance.out  
SingleTask_MIT_Restaurant_400/performance.out  
SingleTask_MIT_Restaurant_800/performance.out  
SingleTask_MIT_Movie_200/performance.out  
SingleTask_MIT_Movie_400/performance.out  
SingleTask_MIT_Movie_800/performance.out  


## Command
```bash
#python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce -diff-level 1 -d ATIS_ALL_NER_LOWER_LEVEL_200 -ner 1  -e 25 -p params/MTL_Default_Param -n 200
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce -diff-level 1 -d ATIS_ALL_NER_LOWER_LEVEL_400 -ner 1  -e 25 -p params/MTL_Default_Param -n 400
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target ATIS  -strategy all -ro reproduce -diff-level 1 -d ATIS_ALL_NER_LOWER_LEVEL_800 -ner 1  -e 25 -p params/MTL_Default_Param -n 800
#wait

#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Restaurant  -strategy most_similar -ro reproduce -diff-level 1 -d MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL_200 -ner 1  -e 25 -p params/MTL_Default_Param -n 200
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Restaurant  -strategy most_similar -ro reproduce -diff-level 1 -d MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL_400 -ner 1  -e 25 -p params/MTL_Default_Param -n 400
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Restaurant  -strategy most_similar -ro reproduce -diff-level 1 -d MIT_Restaurant_MOST_SIMILAR_NER_LOWER_LEVEL_800 -ner 1  -e 25 -p params/MTL_Default_Param -n 800
#wait

#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Movie  -strategy all -ro reproduce  -d MIT_Movie_ALL_200   -e 25 -p params/MTL_Default_Param -n 200
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Movie  -strategy all -ro reproduce  -d MIT_Movie_ALL_400   -e 25 -p params/MTL_Default_Param -n 400
#wait
#python Train_MTL_Sequence_Tagging_Selective.py -target MIT_Movie  -strategy all -ro reproduce  -d MIT_Movie_ALL_800   -e 25 -p params/MTL_Default_Param -n 800
```



In [44]:

# OntoNotes_NW is read with a space delimiter. The delimiter is defined once
# and actually passed to the loader, and the directory is derived from
# path_to_data instead of repeating the absolute path for every split.
delimiter = " "
ontonotes_dir = os.path.join(path_to_data, "OntoNotes_NW")
df_train = load_conll_dataset_to_data_frame(os.path.join(ontonotes_dir, "train.txt.ori"), "train", delimiter)
df_dev   = load_conll_dataset_to_data_frame(os.path.join(ontonotes_dir, "dev.txt"), "dev", delimiter)
df_test  = load_conll_dataset_to_data_frame(os.path.join(ontonotes_dir, "test.txt"), "test", delimiter)


df = pd.concat([df_train, df_dev, df_test])

In [45]:
# Number of distinct sentence ids in the TRAIN split
df.loc[df['split'] == 'train', 'sent_id'].nunique()

34970

In [46]:
# Number of distinct sentence ids in the DEV split
df.loc[df['split'] == 'dev', 'sent_id'].nunique()

5896

In [47]:
# Number of distinct sentence ids in the TEST split
df.loc[df['split'] == 'test', 'sent_id'].nunique()

2327

In [48]:
# Distinct values per column among training rows whose label starts with 'B-';
# the `label` entry is the number of unique labels in the training split.
filtered_df = df.loc[(df['split'] == 'train') & df['label'].str.startswith('B-')]
filtered_df.nunique()



sent_id    26498
token      10676
label         18
split          1
dtype: int64

In [49]:
# Number of unique tokens in the training split.
# Tokens are lower-cased first, so the count is case-insensitive.
df_train = df.loc[df['split'] == 'train']
df_train['token'].str.lower().nunique(dropna=False)

34662

In [30]:
# Preview only the first rows rather than dumping the whole frame into the cell output.
df_train.head(10)

Unnamed: 0,sent_id,token,label,split
0,1,-DOCSTART-,O,train
1,2,EU,B-ORG,train
2,2,rejects,O,train
3,2,German,B-MISC,train
4,2,call,O,train
5,2,to,O,train
6,2,boycott,O,train
7,2,British,B-MISC,train
8,2,lamb,O,train
9,2,.,O,train
