In [2]:
import pandas as pd 
import os
from tqdm import tqdm
tqdm.pandas()
from datetime import datetime
import re, html, copy
import seaborn as sns

In [37]:
# Data locations. Raw strings keep the Windows backslashes literal; in a normal
# string, sequences such as "\D" or "\d" are invalid escape sequences.
FINETUNING_DIR = r'C:\DATA\ComBERT\data_finetuning'
COMPANY_DIR = r'C:\DATA\ComBERT\data'

# Inputs: the labelled tweets and the ticker -> company-name table.
tweet_filepath = os.path.join(FINETUNING_DIR, 'tweets_imbalanced_286399.csv')
company_filepath = os.path.join(COMPANY_DIR, 'company_info_sec_cik_mapper_12057_20220802.csv')

# Outputs: the test split, and a template for the train split whose '{}' is
# filled with the number of rows when the file is written.
save_test_filepath = os.path.join(FINETUNING_DIR, 'test_10000.csv')
save_train_filepath_format = os.path.join(FINETUNING_DIR, 'train_{}.csv')

Dataset for classification task

In [6]:
# Load the labelled tweets. The first CSV column is renamed to 'raw_text';
# every other column keeps the name it has in the file.
df = pd.read_csv(tweet_filepath)
remaining_columns = list(df.columns[1:])
df.columns = ['raw_text'] + remaining_columns
df.head()

Unnamed: 0,raw_text,is_retweet,label,created_at,yyyymm
0,RT @curtmelonopoly: $AUPH If price stays in th...,True,human,2017-05-18 22:00:41,201705
1,$WFM $SFM $UNFI$NGVC Natural GrocersI like it ...,False,human,2017-05-18 22:01:00,201705
2,Check out $ATHN on ChartMill at https://t.co/N...,False,human,2017-05-18 22:01:25,201705
3,Check out $AR on ChartMill at https://t.co/zlX...,False,human,2017-05-18 22:01:25,201705
4,Check out $KMX on ChartMill at https://t.co/DX...,False,human,2017-05-18 22:01:25,201705


## Preprocessing
* RT @Account_Name: -> [RT]
* URL -> [URL] 
* @Account_Name -> ' '
* Ticker -> Company name (ex. $TSLA -> Tesla Inc)

In [27]:
# Characters that get blanked out of a tweet. The literal is written as a
# comma-separated list, so ',' and ' ' are themselves members of the set. The
# backslash in front of “ is escaped explicitly: the backslash is (and always was)
# part of the set, but an unescaped "\“" is an invalid escape sequence.
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \\“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%"

# Built once instead of on every call: maps each character above to one space.
_removal_table = str.maketrans(removal_list, ' ' * len(removal_list))
_whitespace_run = re.compile(r"\s+")

def preprocess_clean(sent):
    """Blank out the characters in ``removal_list``, squeeze whitespace, lowercase.

    Args:
        sent: Text to clean (a ``str``).

    Returns:
        The text with every listed character replaced by a space, each run of
        whitespace collapsed to a single space, and all letters lowercased.
        Leading/trailing spaces are not stripped here.
    """
    sent = sent.translate(_removal_table)
    sent = _whitespace_run.sub(" ", sent)
    return sent.lower()

# Cashtag -> company-name lookup (keys look like '$' + lowercase ticker).
# It starts empty and is filled on first use, because company_df is only loaded
# further down in this cell; building it here would raise NameError on a fresh kernel.
convert_ticker_to_name = {}
_cashtag_pattern = re.compile(r'\$([a-zA-Z.-]+)')

def convert_ticker_with_cashtag_to_name(text, lookup=None):
    """Replace every known cashtag in ``text`` with its company name.

    The text is lowercased first, then each ``$ticker`` match is looked up;
    cashtags without an entry are left as they are.

    Args:
        text: Text that may contain cashtags.
        lookup: Optional mapping from lowercase cashtag (including the '$') to
            replacement text. When omitted, the module-level
            ``convert_ticker_to_name`` table is used and, if still empty, is
            filled from ``company_df`` first.

    Returns:
        The lowercased text with known cashtags substituted.
    """
    if lookup is None:
        if not convert_ticker_to_name:
            # Keys are lowercased because the text is lowercased before matching.
            convert_ticker_to_name.update(
                zip('$' + company_df['Ticker'].str.lower(), company_df['Name'])
            )
        lookup = convert_ticker_to_name
    # .get (not .setdefault): unknown cashtags must not be inserted into the lookup.
    return _cashtag_pattern.sub(lambda m: lookup.get(m.group(0), m.group(0)), text.lower())

def preprocess_trim(sent):
    """Collapse each run of whitespace to one space and strip both ends.

    Args:
        sent: Text to normalise (a ``str``).

    Returns:
        The text with single spaces between tokens and no leading or trailing
        whitespace.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", sent).strip()

# Retweet prefix "RT @handle:". Handles can contain digits and underscores as well
# as letters, so those are accepted too; otherwise such retweets are never tagged.
rt_pattern = re.compile(r'RT @[a-zA-Z0-9_]*:')
# URL with an optional scheme. Note that [$-_@.&+] contains the character RANGE
# '$' to '_', which is what lets '/', ':', '?', '=' and similar match inside a URL.
url_pattern = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
# Account mention. 0-9 (not 1-9) and '_' so the whole handle is consumed.
account_pattern = re.compile(r'@[a-zA-Z0-9_]*')
# One or more consecutive space characters.
multiple_spaces_pattern = re.compile(' +', re.UNICODE)
def preprocess(text):
    """Turn one raw tweet into the normalised text used downstream.

    Steps, in order: retweet prefix -> '[RT]', URL -> '[URL]', account mention
    -> ' ', runs of spaces -> ' ', then preprocess_clean,
    convert_ticker_with_cashtag_to_name and preprocess_trim.
    """
    # Placeholders are inserted in upper case; preprocess_clean lowercases them later.
    substitutions = (
        (rt_pattern, '[RT]'),
        (url_pattern, '[URL]'),
        (account_pattern, ' '),
        (multiple_spaces_pattern, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    cleaned = preprocess_clean(text)
    with_company_names = convert_ticker_with_cashtag_to_name(cleaned)
    return preprocess_trim(with_company_names)

# Company table: drop rows missing a ticker or a name, then lowercase both
# columns so they can be matched against lowercased tweet text.
company_df = (
    pd.read_csv(company_filepath)
    .dropna(subset=['Ticker', 'Name'])
    .assign(
        Ticker=lambda frame: frame['Ticker'].map(str.lower),
        Name=lambda frame: frame['Name'].map(str.lower),
    )
)
company_df

Unnamed: 0,CIK,Ticker,Name,Exchange
0,1750,air,aar corp,NYSE
1,1800,abt,abbott laboratories,NYSE
2,1961,wddd,worlds inc,OTC
3,2098,acu,acme united corp,NYSE
4,2178,ae,"adams resources & energy, inc.",NYSE
...,...,...,...,...
12052,1931691,mobv,mobiv acquisition corp,
12053,1933644,mdlvy,"medlive technology co., ltd./adr",
12054,1934064,osa,laaa merger corp.,
12055,1934945,tbmcu,trailblazer merger corp i,


In [28]:
# Run the preprocessing pipeline over every raw tweet; progress_apply shows a tqdm bar.
df['text'] = df['raw_text'].progress_apply(preprocess)
df.head()

100%|████████████████████████████████████████████████████████████████████████| 286399/286399 [00:36<00:00, 7828.63it/s]


Unnamed: 0,raw_text,is_retweet,label,created_at,yyyymm,text
0,RT @curtmelonopoly: $AUPH If price stays in th...,True,human,2017-05-18 22:00:41,201705,[rt] aurinia pharmaceuticals inc. if price sta...
1,$WFM $SFM $UNFI$NGVC Natural GrocersI like it ...,False,human,2017-05-18 22:01:00,201705,"$wfm sprouts farmers market, inc. united natur..."
2,Check out $ATHN on ChartMill at https://t.co/N...,False,human,2017-05-18 22:01:25,201705,check out $athn on chartmill at [url] #digital...
3,Check out $AR on ChartMill at https://t.co/zlX...,False,human,2017-05-18 22:01:25,201705,check out antero resources corp on chartmill a...
4,Check out $KMX on ChartMill at https://t.co/DX...,False,human,2017-05-18 22:01:25,201705,check out carmax inc on chartmill at [url] #di...


Preprocessing Example

In [29]:
# Print the raw tweet and its preprocessed form for the last two rows.
# Only the first template ends with a blank line, separating the two examples.
for position, template in ((-1, '{}\n=> {}\n'), (-2, '{}\n=> {}')):
    row = df.iloc[position]
    print(template.format(row['raw_text'], row['text']))

$AAPL Can do this for a while w/o need for reinventing the wheel. They got the system of interconnected products/services @IsaacCheatham88
=> apple inc. can do this for a while w o need for reinventing the wheel they got the system of interconnected products services

RT @MindMakesMatter: $NVDA daily updatedDamn thing hasn't even blown yet!Recheck those last 2 quarters. if you wanna https://t.co/rIKVq…
=> [rt] nvidia corp daily updateddamn thing hasn t even blown yet recheck those last 2 quarters if you wanna [url]


## Constructing balanced train/test set

In [41]:
# Row counts per label, inside and outside month 201709.
# Printed in the order: bot, human (201709), then bot, human (all other months).
in_201709 = df['yyyymm'] == 201709
not_in_201709 = df['yyyymm'] != 201709
count_specs = (
    ('(201709) Before splitting, maximum number of rows for one class:', 'bot', in_201709),
    ('(201709) Before splitting, maximum number of rows for one class:', 'human', in_201709),
    ('(!201709) Before splitting, maximum number of rows for one class:', 'bot', not_in_201709),
    ('(!201709) Before splitting, maximum number of rows for one class:', 'human', not_in_201709),
)
for message, label, month_mask in count_specs:
    print(message, len(df[(df['label'] == label) & month_mask]))

(201709) Before splitting, maximum number of rows for one class: 6992
(201709) Before splitting, maximum number of rows for one class: 18107
(!201709) Before splitting, maximum number of rows for one class: 41150
(!201709) Before splitting, maximum number of rows for one class: 220150


### Test dataset
* Tweets created in September 2017
* human:bot = 1:1

In [8]:
# Fixed seed so the sampled test set is the same on every run; without it the
# saved file changes each time this cell is executed.
TEST_SEED = 42
TEST_ROWS_PER_LABEL = 5000

# Draw the same number of rows for every label from month 201709.
is_test_month = df['yyyymm'] == 201709
test_df = pd.concat([
    df[is_test_month & (df['label'] == label)].sample(TEST_ROWS_PER_LABEL, random_state=TEST_SEED)
    for label in df['label'].unique()
])
# Shuffle so the labels are interleaved, then renumber the rows.
test_df = test_df.sample(frac=1, random_state=TEST_SEED).reset_index(drop=True)

test_df.to_csv(save_test_filepath, index=False)
print('Created {}'.format(save_test_filepath))

Created /media/dmlab/My Passport/DATA/fin_tweet_spam/data/test_10000.csv


### Train dataset
* Tweets created in May, June, July, and August 2017

In [42]:
# Every row outside month 201709 goes into the training set.
train_full_df = df.loc[df['yyyymm'] != 201709]

# The '{}' in the file-name template is filled with the number of training rows.
n_train_rows = len(train_full_df)
filepath = save_train_filepath_format.format(n_train_rows)
train_full_df.to_csv(filepath, index=False)
print('Created {}'.format(filepath))

Created C:\DATA\ComBERT\data_finetuning\train_261300.csv
