In [1]:
import pandas as pd
import numpy as np

In [2]:
# Percentage of rows that go to the training split; it also names the output folder and files.
train_split = 50
# The same share expressed as a fraction.
split_train = train_split / 100

# Both placeholders of every path template receive the split percentage.
split_tag = (train_split, train_split)
train_file_name = "./dataset_{}/train_v2_{}.csv".format(*split_tag)
test_file_name = "./dataset_{}/test_v2_{}.csv".format(*split_tag)
vocab_file_name = "./dataset_{}/vocab_v2_{}.csv".format(*split_tag)
print(split_train, train_file_name, test_file_name)

0.5 ./dataset_50/train_v2_50.csv ./dataset_50/test_v2_50.csv


## Creating labels and max_retweets

In [3]:
# Raw dataset: pipe-delimited, with the first column used as the row index.
raw_csv_path = "../../data/Organic_extended_finalv4.csv"
df = pd.read_csv(raw_csv_path, sep="|", index_col=0)
df.columns

Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
       'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
       'expanded_url', 'title_len', 'content_len'],
      dtype='object')

In [4]:
# Candidate column names are the numbers "1" .. "100"; keep whichever of them the file has.
# NOTE(review): these look like per-snapshot retweet counts -- confirm against the data source.
possible_cols = {str(x) for x in range(1, 101)}
actual_cols = set(df.columns).intersection(possible_cols)
print(actual_cols)

# A set is not a valid column indexer in pandas >= 2.0 (TypeError) and was
# deprecated before that, so select with a list ordered by numeric value.
snapshot_cols = sorted(actual_cols, key=int)
# Row-wise maximum over the numbered columns.
df['max_retweets'] = df[snapshot_cols].max(axis=1)

{'4', '1', '5', '6', '2', '3'}


In [5]:
# Four order-of-magnitude classes from max_retweets:
#   0: below 11, 1: 11 and up, 2: 101 and up, 3: 1001 and up.
# Each pass overwrites the previous label for rows that also clear the higher bound.
df['label_log_10'] = 0
for magnitude_label, lower_bound in enumerate((11, 101, 1001), start=1):
    df.loc[df['max_retweets'] >= lower_bound, 'label_log_10'] = magnitude_label


In [6]:
retweets = df['max_retweets']

# Dataset-wide thresholds, rounded to the nearest integer.
mean_dataset = round(retweets.mean())
median_dataset = round(retweets.median())

# Binary labels: 1 when a row reaches the threshold, 0 otherwise.
df['label_mean'] = 0
df['label_median'] = 0
df.loc[retweets >= mean_dataset, 'label_mean'] = 1
df.loc[retweets >= median_dataset, 'label_median'] = 1

preview_cols = ['screen_name', 'title', 'max_retweets', 'label_mean', 'label_median', 'label_log_10']
df[preview_cols].head(10)

Unnamed: 0,screen_name,title,max_retweets,label_mean,label_median,label_log_10
0,Los Angeles Times,‘All Black Lives Matter’ painted on Hollywood ...,466.0,1,1,2
1,ABC News,Millions in lawsuit settlements are another hi...,163.0,1,1,2
2,CNN,Woman becomes first observant Sikh to graduate...,936.0,1,1,2
3,The New York Times,"As Social Distancing Wanes, Cuomo Warns of Ano...",2381.0,1,1,3
4,Los Angeles Times,They lost loved ones to police violence. Georg...,267.0,1,1,2
5,NPR,Boston Mayor Declares Racism A Public Health C...,938.0,1,1,2
6,Los Angeles Times,They lost loved ones to police violence. Georg...,33.0,0,0,1
7,Los Angeles Times,They lost loved ones to police violence. Georg...,72.0,0,1,1
8,Los Angeles Times,They lost loved ones to police violence. Georg...,48.0,0,1,1
9,Los Angeles Times,They lost loved ones to police violence. Georg...,70.0,0,1,1


In [7]:
retweets = df['max_retweets']
quantile_25 = retweets.quantile(q=0.25)
quantile_50 = retweets.quantile(q=0.5)
quantile_75 = retweets.quantile(q=0.75)
print(quantile_25, quantile_50, quantile_75)

# Quartile class 0-3: start at 0 and raise the label for each quartile cut-off the row reaches.
df['label_quantile'] = 0
for quartile_label, cutoff in enumerate((quantile_25, quantile_50, quantile_75), start=1):
    df.loc[retweets >= cutoff, 'label_quantile'] = quartile_label
df[['screen_name', 'title', 'max_retweets', 'label_quantile']].head(10)

23.0 48.0 108.0


Unnamed: 0,screen_name,title,max_retweets,label_quantile
0,Los Angeles Times,‘All Black Lives Matter’ painted on Hollywood ...,466.0,3
1,ABC News,Millions in lawsuit settlements are another hi...,163.0,3
2,CNN,Woman becomes first observant Sikh to graduate...,936.0,3
3,The New York Times,"As Social Distancing Wanes, Cuomo Warns of Ano...",2381.0,3
4,Los Angeles Times,They lost loved ones to police violence. Georg...,267.0,3
5,NPR,Boston Mayor Declares Racism A Public Health C...,938.0,3
6,Los Angeles Times,They lost loved ones to police violence. Georg...,33.0,1
7,Los Angeles Times,They lost loved ones to police violence. Georg...,72.0,2
8,Los Angeles Times,They lost loved ones to police violence. Georg...,48.0,2
9,Los Angeles Times,They lost loved ones to police violence. Georg...,70.0,2


In [8]:
# Binary classification against the median of the row's own user_id group.
# The label column is created before grouped_median to keep the column order unchanged.
df['label_grouped_median'] = 0
retweets_by_user = df.groupby('user_id')['max_retweets']
df['grouped_median'] = retweets_by_user.transform('median')

# 1 when the row is at or above its group's median, 0 otherwise.
reaches_own_median = df['max_retweets'] >= df['grouped_median']
df.loc[reaches_own_median, 'label_grouped_median'] = 1
df[['user_id', 'max_retweets', 'grouped_median', 'label_grouped_median']].head(15)

Unnamed: 0,user_id,max_retweets,grouped_median,label_grouped_median
0,16664681.0,466.0,18.0,1
1,28785486.0,163.0,73.0,1
2,759251.0,936.0,143.0,1
3,807095.0,2381.0,109.0,1
4,16664681.0,267.0,18.0,1
5,5392522.0,938.0,131.0,1
6,16664681.0,33.0,18.0,1
7,16664681.0,72.0,18.0,1
8,16664681.0,48.0,18.0,1
9,16664681.0,70.0,18.0,1


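## Creating a training and a test split
Randomly sample the fraction `split_train` of the rows (fixed seed) as the training set; the remaining rows form the test set.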

In [9]:
# Draw the training rows with a fixed seed so the split is reproducible.
train = df.sample(frac=split_train, random_state=10)
# Every row whose index label was not drawn goes to the test set.
not_in_train = ~df.index.isin(train.index)
test = df[not_in_train]

In [10]:
# Replace the sampled row labels with a fresh 0..n-1 index (the old labels are discarded).
train = train.reset_index(drop=True)
train.head(n=5)

Unnamed: 0,tweet_id,created_time,count,1,2,3,4,5,6,user_id,...,expanded_url,title_len,content_len,max_retweets,label_log_10,label_mean,label_median,label_quantile,label_grouped_median,grouped_median
0,1275494342371704833,Tue Jun 23 18:22:03 +0000 2020,8,22,23.0,23.0,23.0,23.0,23.0,28785486.0,...,https://abcnews.go.com/Politics/note-progressi...,10,1394,23.0,1,0,0,1,0,73.0
1,1281351187716083714,Thu Jul 09 22:15:04 +0000 2020,11,389,394.0,402.0,402.0,402.0,402.0,3108351.0,...,https://www.wsj.com/articles/a-wnba-team-has-l...,13,80,402.0,2,1,1,3,1,35.0
2,1279756925605904385,Sun Jul 05 12:40:02 +0000 2020,9,257,275.0,275.0,275.0,275.0,275.0,807095.0,...,https://www.nytimes.com/2020/07/05/world/middl...,6,206,275.0,2,1,1,3,1,109.0
3,1282661055697575936,Mon Jul 13 13:00:01 +0000 2020,8,12,12.0,12.0,12.0,12.0,12.0,1652541.0,...,https://uk.reuters.com/article/uk-soccer-engla...,7,509,12.0,1,0,0,0,0,32.0
4,1273105632116310017,Wed Jun 17 04:10:10 +0000 2020,14,394,399.0,399.0,399.0,399.0,400.0,14511951.0,...,https://www.huffpost.com/entry/tribes-covid-re...,11,671,400.0,2,1,1,3,1,35.0


In [11]:
# Replace the remaining row labels with a fresh 0..n-1 index (the old labels are discarded).
test = test.reset_index(drop=True)
test.head(n=5)

Unnamed: 0,tweet_id,created_time,count,1,2,3,4,5,6,user_id,...,expanded_url,title_len,content_len,max_retweets,label_log_10,label_mean,label_median,label_quantile,label_grouped_median,grouped_median
0,1272216897237516289,2020-06-14 17:18:39+00:00,17,163,163.0,163.0,163.0,162.0,162.0,28785486.0,...,https://abcnews.go.com/US/millions-lawsuit-set...,14,1766,163.0,2,1,1,3,1,73.0
1,1272220034065186817,2020-06-14 17:31:07+00:00,17,910,927.0,929.0,933.0,934.0,936.0,759251.0,...,https://www.cnn.com/2020/06/13/us/first-sikh-w...,15,190,936.0,2,1,1,3,1,143.0
2,1272219784743202816,2020-06-14 17:30:08+00:00,17,2352,2377.0,2381.0,2378.0,2376.0,2373.0,807095.0,...,https://www.nytimes.com/2020/06/14/world/coron...,9,4174,2381.0,3,1,1,3,1,109.0
3,1272220746014572545,2020-06-14 17:33:57+00:00,17,241,267.0,267.0,267.0,267.0,267.0,16664681.0,...,https://www.latimes.com/california/story/2020-...,16,2826,267.0,2,1,1,3,1,18.0
4,1272220892379004929,2020-06-14 17:34:32+00:00,17,30,33.0,33.0,33.0,33.0,33.0,16664681.0,...,https://www.latimes.com/california/story/2020-...,16,2826,33.0,1,0,0,1,1,18.0


## Metrics on the train and test splits

In [12]:
# Number of rows in the training split.
print(train.shape[0])
# TODO: per-outlet label breakdown. An earlier draft grouped by a 'label' column,
# which this frame does not have (the label columns are named label_*).

18928


In [13]:
# Number of rows in the test split.
print(test.shape[0])
# TODO: per-outlet label breakdown. An earlier draft grouped by a 'label' column,
# which this frame does not have (the label columns are named label_*).

18929


In [14]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# per-split output folder exists before writing (a fresh checkout would fail otherwise).
for output_path in (train_file_name, test_file_name):
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

# Pipe-delimited output; the row index is written as the first column.
train.to_csv(train_file_name, sep="|")
test.to_csv(test_file_name, sep="|")

## Creating Vocabulary

In [15]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from collections import Counter

In [16]:
# From here on `df` holds only the training split, re-read from the saved file.
df = pd.read_csv(train_file_name, sep="|", index_col=0)

# Keep the NLTK corpus reader under its own alias. Rebinding the name `stopwords`
# to the word list would otherwise replace the reader itself, and running this
# cell a second time would fail with AttributeError ('list' has no 'words').
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')
df.columns, stopwords[:5]

(Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
        'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
        'expanded_url', 'title_len', 'content_len', 'max_retweets',
        'label_log_10', 'label_mean', 'label_median', 'label_quantile',
        'label_grouped_median', 'grouped_median'],
       dtype='object'),
 ['i', 'me', 'my', 'myself', 'we'])

In [17]:
# Compiled once instead of on every call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_WS_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def clean_text(text, stop_words=None):
    """Tokenize a piece of text for vocabulary building.

    Steps, in order:
      1. drop whitespace-separated words that are exact (case-sensitive) stop words;
      2. replace every run of non-ASCII characters with a space;
      3. lower-case, then replace punctuation, digits, and \\r, \\t, \\n with spaces;
      4. split on whitespace and keep tokens of at least two characters
         that are not stop words.

    Args:
        text: the string to clean. Any non-string value (for example the float
            NaN pandas uses for a missing cell) yields an empty token list
            instead of raising.
        stop_words: optional iterable of words to remove. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = stopwords
    # Set membership is constant-time; the tests below run once per word.
    stop_set = set(stop_words)

    # First pass runs before punctuation is stripped, so contractions listed as
    # stop words are removed whole rather than leaving fragments behind.
    text = " ".join(word for word in text.split() if word not in stop_set)
    text = _NON_ASCII_RE.sub(" ", text)
    nopunct_num = _PUNCT_DIGIT_WS_RE.sub(" ", text.lower())
    # Second pass catches stop words exposed by lower-casing, and drops
    # single-character leftovers (e.g. the "s" that remains after an apostrophe).
    return [
        token
        for token in nopunct_num.split()
        if len(token) >= 2 and token not in stop_set
    ]

In [18]:
counter = Counter()
tokenizer = {"UNK": 0}


def update_counter(text):
    """Add the cleaned tokens of ``text`` to the shared ``counter``."""
    counter.update(clean_text(text))


# Visit the titles in row order so the counter's insertion order (and with it
# the vocabulary order below) follows first appearance in the training data.
for title in df['title']:
    update_counter(title)
print(counter.most_common(5))

# One entry per distinct token, in first-seen order.
vocab = list(counter.keys())
print(vocab[:5], len(vocab))

[('coronavirus', 2326), ('trump', 1890), ('covid', 1471), ('new', 1354), ('says', 1205)]
['note', 'progressive', 'challengers', 'rewrite', 'rules'] 14575


In [19]:
# Persist the vocabulary: one token per line, no trailing newline.
vocab_text = "\n".join(vocab)
with open(vocab_file_name, "w+") as vocab_file:
    vocab_file.write(vocab_text)