In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `preprocessing` module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Local imports
from preprocessing import clean_tweets, one_hot_encode, make_debug_df

### Davidson et al. data

In [None]:
# Directory holding the Davidson et al. files (trailing slash is required,
# because file names are appended to it by plain string formatting).
path = 'data/davidson/'
# Full path of the labelled-tweets CSV inside that directory.
fname = '{}labeled_data.csv'.format(path)

In [None]:
# Load the data
# The CSV column named 'Unnamed: 0' becomes the DataFrame index
# (presumably a row index saved without a header name — confirm against the file).
df = pd.read_csv(fname, encoding='utf-8', index_col='Unnamed: 0')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Run the project's tweet cleaner over the frame.
# NOTE(review): presumably this is what adds the 'mentions', 'hashtags' and
# 'original_tweet' columns selected below — confirm in preprocessing.clean_tweets.
df = clean_tweets(df)

# Fill the three class columns with the one-hot encoding of `class`.
class_cols = ['hate_speech', 'offensive_language', 'neither']
df[class_cols] = one_hot_encode(df['class'])

# Keep only the columns needed downstream, in a fixed order
# (every other column is dropped by this selection).
keep_cols = ['tweet'] + class_cols + ['mentions', 'hashtags', 'original_tweet']
df = df[keep_cols]

In [None]:
# Make a test/dev/train split
# Each row goes to train with probability `train_perc`; the remaining rows
# are cut in half (in row order) into dev and test, so the split sizes are
# approximately, not exactly, 80/10/10.
train_perc = 0.80

# Fixed seed on a dedicated generator so that re-running the notebook
# reproduces exactly the same split without touching NumPy's global RNG state.
split_seed = 42
rng = np.random.RandomState(split_seed)
msk = rng.rand(len(df)) < train_perc

not_train = df[~msk]
half = len(not_train) // 2

# reset_index(drop=True) returns new frames, so nothing is modified in place
# on a slice of `df`, and all three splits (dev included) get a clean
# 0..n-1 index.
train = df[msk].reset_index(drop=True)
dev = not_train[:half].reset_index(drop=True)
test = not_train[half:].reset_index(drop=True)

In [None]:
# Build the debug frame from the full prepared DataFrame via the project helper.
# NOTE(review): presumably a small subset for quick pipeline checks —
# confirm in preprocessing.make_debug_df.
debug = make_debug_df(df)

In [None]:
# Persist every frame next to the source data. Each entry pairs the output
# file-name template with the frame to write; the index is never written.
outputs = [
    ('{}debug.csv', debug),
    ('{}train.csv', train),
    ('{}dev.csv', dev),
    ('{}test.csv', test),
]
for template, frame in outputs:
    frame.to_csv(template.format(path), index=False)

### Zeerak data

In [None]:
# Directory holding the Zeerak NAACL files. NOTE: this rebinds `path` (and
# `fname`) from the Davidson section, so cells below write into this directory.
path = 'data/zeerak_naacl/'
fname = '{}zeerak_naacl_tweets.csv'.format(path)
# Load the data
df2 = pd.read_csv(fname, encoding='utf-8')

In [None]:
# First mask out some missing data: keep only the rows that have a label.
# notna() is the vectorised missing-value test (missing CSV cells are read
# as NaN), replacing a per-row Python type check.
msk = df2['label'].notna()
# .copy() makes df2 an independent frame, so the in-place rename and column
# assignments in later cells do not operate on a slice of the loaded data.
df2 = df2[msk].copy()

In [None]:
# Rename 'text' to 'tweet' so the frame has the column name used for the
# Davidson data, then run the project's tweet cleaner.
# NOTE(review): index=str also converts every index label to a string —
# confirm that this is intended and not a leftover from a docs example.
df2 = clean_tweets(df2.rename(index=str, columns={'text': 'tweet'}))

# Label cleanup to match the other df format: translate each label string
# into its position within `labels` (an unknown label raises ValueError).
labels = ['racism', 'sexism', 'none']
one_hot_label = [labels.index(label) for label in df2['label']]

# Create the three target columns with a placeholder value first, then
# overwrite them with the one-hot encoding.
# NOTE(review): the placeholder step is presumably there because assigning
# to a list of not-yet-existing columns fails on older pandas — confirm.
for col in labels:
    df2[col] = -1
df2[['racism', 'sexism', 'none']] = one_hot_encode(one_hot_label)

# Keep only the columns needed downstream, in a fixed order.
keep_cols = [
    'tweet', 'label', 'mentions', 'hashtags', 'original_tweet',
    'racism', 'sexism', 'none', 'tweet_id', 'user_screen_name',
]
df2 = df2[keep_cols]

In [None]:
# Make a test/dev/train split
# Each row goes to train with probability `train_perc`; the remaining rows
# are cut in half (in row order) into dev and test, so the split sizes are
# approximately, not exactly, 80/10/10.
train_perc = 0.80

# Fixed seed on a dedicated generator so that re-running the notebook
# reproduces exactly the same split without touching NumPy's global RNG state.
split_seed = 42
rng = np.random.RandomState(split_seed)
msk = rng.rand(len(df2)) < train_perc

not_train = df2[~msk]
half = len(not_train) // 2

# reset_index(drop=True) returns new frames, so nothing is modified in place
# on a slice of `df2`, and all three splits (dev included) get a clean
# 0..n-1 index.
train = df2[msk].reset_index(drop=True)
dev = not_train[:half].reset_index(drop=True)
test = not_train[half:].reset_index(drop=True)

In [None]:
# Build the debug frame for this dataset, passing its label column names
# explicitly through `cols`.
# NOTE(review): presumably `cols` tells the helper which columns hold the
# classes — confirm in preprocessing.make_debug_df.
debug = make_debug_df(df2, cols=labels)

In [None]:
# Persist every frame next to the source data. Each entry pairs the output
# file-name template with the frame to write; the index is never written.
outputs = [
    ('{}debug.csv', debug),
    ('{}train.csv', train),
    ('{}dev.csv', dev),
    ('{}test.csv', test),
]
for template, frame in outputs:
    frame.to_csv(template.format(path), index=False)