In [1]:
import pandas as pd
import os
import glob
import re
from sklearn.model_selection import train_test_split

In [2]:
def read_data(filename):
    df = pd.read_csv(filename, sep='\t', header=None, index_col=False)
    print(f"Read in data from {filename} of length {df.shape[0]} ")
    return df

def concatenate_data(datapath):
    """Concatenate data from multiple files in a given path"""
    files = glob.glob(f'{datapath}/*.tsv')
    list_df = []
    for f in files:
        df = read_data(f)
        list_df.append(df)
    # Concatenate several smaller DataFrames into one larger one
    data = pd.concat(list_df, axis=0, ignore_index=True)
    # Drop spurious last column
    data.columns = ['id', 'labels', 'text', 'extra']
    data = data.drop('extra', axis=1)
    print(f"Total length of data read from '{datapath}': {data.shape[0]}")
    return data

In [3]:
train = concatenate_data('./data/raw/training/')

Read in data from ./data/raw/training/twitter-2016devtest-A.tsv of length 2000 
Read in data from ./data/raw/training/twitter-2014sarcasm-A.tsv of length 49 
Read in data from ./data/raw/training/twitter-2013dev-A.tsv of length 1654 
Read in data from ./data/raw/training/twitter-2016dev-A.tsv of length 1966 
Read in data from ./data/raw/training/twitter-2016test-A.tsv of length 20632 
Read in data from ./data/raw/training/twitter-2016train-A.tsv of length 5868 
Read in data from ./data/raw/training/twitter-2014test-A.tsv of length 1853 
Read in data from ./data/raw/training/twitter-2013train-A.tsv of length 9684 
Read in data from ./data/raw/training/twitter-2015train-A.tsv of length 489 
Read in data from ./data/raw/training/twitter-2015test-A.tsv of length 2390 
Read in data from ./data/raw/training/twitter-2013test-A.tsv of length 3547 
Total length of data read from './data/raw/training/': 50132


In [4]:
train.head(3)

Unnamed: 0,id,labels,text
0,637641175948763136,neutral,@SeeMonterey LOST - Sony cell phone with holid...
1,637651487762554881,neutral,"@PersonaSoda well yeah, that's third parties. ..."
2,637666734300905472,negative,Sony rewards app is like a lot of 19 y.o femal...


In [5]:
test = read_data('./data/raw/test/SemEval2017-task4-test.subtask-A.english.txt')
test.columns = ['id', 'labels', 'text']
test.head(3)

Read in data from ./data/raw/test/SemEval2017-task4-test.subtask-A.english.txt of length 11906 


Unnamed: 0,id,labels,text
0,801989080477154944,neutral,#ArianaGrande Ari By Ariana Grande 80% Full ht...
1,801989272341453952,positive,Ariana Grande KIIS FM Yours Truly CD listening...
2,801990978424962944,positive,Ariana Grande White House Easter Egg Roll in W...


In [6]:
test.tail()

Unnamed: 0,id,labels,text
11901,805699615781625856,positive,@dansen17 update: Zac Efron kissing a puppy ht...
11902,805701709356003328,neutral,#zac efron sex pic skins michelle sex https://...
11903,805701818357579776,neutral,First Look at Neighbors 2 with Zac Efron Shirt...
11904,805703557081075712,neutral,zac efron poses nude #lovely libra porn https:...
11905,805704324105940992,neutral,#Fashion #Style The Paperboy (NEW Blu-ray Disc...


### Assign string label to integer

In [7]:
label_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

def str2int(label: str):
    return label_map[label]

In [8]:
train['labels'] = train['labels'].apply(str2int)
train.head(3)

Unnamed: 0,id,labels,text
0,637641175948763136,1,@SeeMonterey LOST - Sony cell phone with holid...
1,637651487762554881,1,"@PersonaSoda well yeah, that's third parties. ..."
2,637666734300905472,0,Sony rewards app is like a lot of 19 y.o femal...


In [9]:
test['labels'] = test['labels'].apply(str2int)
test.head(3)

Unnamed: 0,id,labels,text
0,801989080477154944,1,#ArianaGrande Ari By Ariana Grande 80% Full ht...
1,801989272341453952,2,Ariana Grande KIIS FM Yours Truly CD listening...
2,801990978424962944,2,Ariana Grande White House Easter Egg Roll in W...


### Stratify data and split into training/dev sets

In [10]:
train, dev = train_test_split(
    train, stratify=train['labels'], test_size=0.1, random_state=353456
)

In [11]:
train.groupby('labels')['text'].count()

labels
0     7028
1    20271
2    17819
Name: text, dtype: int64

In [12]:
dev.groupby('labels')['text'].count()

labels
0     781
1    2253
2    1980
Name: text, dtype: int64

### Write processed data

In [13]:
output_path = './data/clean'
os.makedirs(output_path, exist_ok=True)

In [14]:
train[['id', 'text', 'labels']].to_csv(f'{output_path}/train.tsv', sep='\t', index=None)
dev[['id', 'text', 'labels']].to_csv(f'{output_path}/dev.tsv', sep='\t', index=None)
test[['id', 'text', 'labels']].to_csv(f'{output_path}/test.tsv', sep='\t', index=None)