In [1]:
%cd ~/transuasion

import pandas as pd
import json
import seaborn as sns
import numpy as np
import re
from collections import Counter

from transuasion.utils.url_verb import get_url_verb_dict
from transuasion.utils.clean_tweet import clean_tweet
!wget -nc https://github.com/wordnik/wordlist/raw/main/wordlist-20210729.txt -P ./data
en = set(open('data/wordlist-20210729.txt').read().split())

/home/someshs/transuasion
File ‘./data/wordlist-20210729.txt’ already there; not retrieving.



In [2]:
df_pairs = pd.read_parquet('data/parallel_tweet_it0.parquet')
usernames = json.load(open('data/approved_usernames.json'))

In [3]:
df_pairs = df_pairs[(df_pairs['character_difference']>5) & (df_pairs['date_diff']<=45) & (df_pairs['similarity']>=0.5) & (df_pairs['media_x']=='') & (df_pairs['media_y']=='') & (df_pairs['username'].isin(usernames))]

In [4]:
url_column = df_pairs['url'].drop_duplicates()
url_verb, url_df = get_url_verb_dict(url_column, en)
json.dump(url_verb, open('url_verb.json', 'w'), indent=4)

In [5]:
url_df['netloc'].value_counts()

netloc
bit.ly                         57454
www.youtube.com                15124
buff.ly                         9741
twitter.com                     7842
www.liverpoolecho.co.uk         4870
                               ...  
www.merseytravel.gov.uk            1
www.liverpoolfairs.org.uk          1
www.liverpool.aluminate.net        1
learningonscreen.ac.uk             1
www.seamless.com                   1
Name: count, Length: 16142, dtype: int64

# Pair Simulation

In [6]:
PREFIX = """{} wrote a tweet.
{} Compare the engagement levels of two tweets: Tweet (A) and Tweet (B).
Determine which tweet will get a higher number of likes, answer (A) or (B).
Provide the ratio of likes for (A) and (B)
"""

INSTRUCTION = """Tweet (A) with masked mentions {}:
'''{}'''

Tweet (B) with masked mentions {}:
'''{}'''

"""

ANSWER = """{} will be liked {} times more"""

In [7]:
pairs_0 = df_pairs[(df_pairs['likes_x'] * df_pairs['likes_y']) == 0]
pairs_n0 = df_pairs[(df_pairs['likes_x'] * df_pairs['likes_y']) != 0]

pairs_00 = pairs_0[pairs_0['likes_x'] + pairs_0['likes_y']==0] #both zero
#pairs_0n0 = pairs_0[pairs_0['likes_x'] + pairs_0['likes_y']>0] #single zero

pairs_nen = pairs_n0[pairs_n0['likes_x'] == pairs_n0['likes_y']] #non zero equals
pairs_nnen = pairs_n0[pairs_n0['likes_x'] != pairs_n0['likes_y']] #non zero unequals

pairs_nnen_close = pairs_nnen[(pairs_nnen['likes_x']+pairs_nnen['likes_y']<10) & (pairs_nnen['likes_ratio']<2)] #unequal non zero-close small values
pairs_nnen_remain = pairs_nnen[~((pairs_nnen['likes_x']+pairs_nnen['likes_y']<10) & (pairs_nnen['likes_ratio']<2))] #unequal non zero large/far values

pairs_nen_close = pairs_nen[(pairs_nen['likes_x']+pairs_nen['likes_y']<10)] #equal non zero small values
pairs_nen_remain = pairs_nen[~((pairs_nen['likes_x']+pairs_nen['likes_y']<10))] #equal non zero large values

In [8]:
pairs_nen_remain['tag']="High Non-Zero Equal Pairs"
pairs_nen_close['tag']="Low Non-Zero Equal Pairs"
pairs_nnen_close['tag']="Low Non-Zero Un-Equal Pairs"
pairs_nnen_remain['tag']="High Non-Zero Un-Equal Pairs"
pairs_00['tag']="Zero Pairs"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs_nen_remain['tag']="High Non-Zero Equal Pairs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs_nen_close['tag']="Low Non-Zero Equal Pairs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs_nnen_close['tag']="Low Non-Zero Un-Equal Pairs"
A value is trying to be set on a copy of a slice 

In [9]:
pd.concat([pairs_nen_remain, pairs_nen_close, pairs_nnen_close, pairs_nnen_remain, pairs_00])['tag'].value_counts()

tag
Zero Pairs                      623666
High Non-Zero Un-Equal Pairs    521620
Low Non-Zero Un-Equal Pairs     224153
Low Non-Zero Equal Pairs        173191
High Non-Zero Equal Pairs        17622
Name: count, dtype: int64

In [10]:
pair_sim = pd.concat([
    pairs_nen_remain,
    pairs_nnen_remain,
    pairs_nen_close.sample(frac=0.3),
    pairs_nnen_close.sample(frac=0.3),
    pairs_00.sample(frac=0.1)
]).sample(frac=1)

In [11]:
r1 = (pair_sim['likes_x']+0.11) / (pair_sim['likes_y']+0.11).values
r2 = (pair_sim['likes_y']+0.11) / (pair_sim['likes_x']+0.11).values
pair_sim['likes_ratio'] = np.max(np.array([r1,r2]), axis=0)

In [12]:
pair_sim['masked_tweet_x'] = pair_sim['tweet_x'].apply(clean_tweet)
pair_sim['masked_tweet_y'] = pair_sim['tweet_y'].apply(clean_tweet)

In [13]:
pair_sim.apply(lambda x: ANSWER.format('(A)' if x.likes_x > x.likes_y else '(B)', np.round(x.likes_ratio, 2)),axis=1)

1779976      (B) will be liked 1.0 times more
11304550     (B) will be liked 1.0 times more
17305828    (A) will be liked 2.46 times more
23224027     (B) will be liked 1.0 times more
42102811    (B) will be liked 2.17 times more
                          ...                
34705521    (A) will be liked 1.31 times more
12977293     (B) will be liked 2.8 times more
33983343     (B) will be liked 1.0 times more
1467760      (B) will be liked 1.0 times more
26732647    (B) will be liked 1.47 times more
Length: 720812, dtype: object

In [14]:
rpair_sim = pair_sim[['id_x', 'id_y', 'url', 'likes_ratio']]
rpair_sim['instruction'] = pair_sim.apply(lambda x: PREFIX.format(x.username, url_verb.get(x.url, '')) + '\n' + INSTRUCTION.format(x.masked_tweet_x['usernames'], x.masked_tweet_x['cleaned_tweet'], x.masked_tweet_y['usernames'], x.masked_tweet_y['cleaned_tweet']), axis=1)
rpair_sim['answer'] = pair_sim.apply(lambda x: ANSWER.format('(A)' if x.likes_x > x.likes_y else '(B)', np.round(x.likes_ratio, 2)),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rpair_sim['instruction'] = pair_sim.apply(lambda x: PREFIX.format(x.username, url_verb.get(x.url, '')) + '\n' + INSTRUCTION.format(x.masked_tweet_x['usernames'], x.masked_tweet_x['cleaned_tweet'], x.masked_tweet_y['usernames'], x.masked_tweet_y['cleaned_tweet']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rpair_sim['answer'] = pair_sim.apply(lambda x: ANSWER.format('(A)' if x.likes_x > x.likes_y else '(B)', np.round(x.likes_ratio, 2)),axis=1)


In [15]:
pair_ids = set(rpair_sim['id_x']).union(set(rpair_sim['id_y']))

In [16]:
len(pair_ids)

543983

In [17]:
rpair_sim

Unnamed: 0,id_x,id_y,url,likes_ratio,instruction,answer
1779976,256413915595014144,256415543232434176,https://media-server.com/m/p/4tpzv3st,1.000000,nokianetworks wrote a tweet.\nThe tweet has a ...,(B) will be liked 1.0 times more
11304550,710225882150002688,715965199853944832,https://bit.ly/1KDGK5R,1.000000,HALcruises wrote a tweet.\nThe tweet has a web...,(B) will be liked 1.0 times more
17305828,1214607023427510274,1214611450481782785,https://buff.ly/37Fm3TK,2.459854,AmerIndependent wrote a tweet.\nThe tweet has ...,(A) will be liked 2.46 times more
23224027,661905865574780928,672868077990645761,https://bit.ly/1Hng7xK,1.000000,NHSLeadership wrote a tweet.\nThe tweet has a ...,(B) will be liked 1.0 times more
42102811,1604948529612574727,1612449118860754947,https://hostux.social/@fsf,2.174168,fsf wrote a tweet.\nThe tweet has a webpage li...,(B) will be liked 2.17 times more
...,...,...,...,...,...,...
34705521,887921866941370368,888040671743025152,https://po.st/HQGE7S,1.313301,Unilever wrote a tweet.\nThe tweet has a webpa...,(A) will be liked 1.31 times more
12977293,483524336415162368,484031616952852481,https://ibm.co/1pj9uGa,2.801802,IBMcloud wrote a tweet.\nThe tweet has a webpa...,(B) will be liked 2.8 times more
33983343,360807385142530049,370894316891541504,https://bit.ly/CSR-UPS,1.000000,UPS wrote a tweet.\n Compare the engagement le...,(B) will be liked 1.0 times more
1467760,190833362288644097,191925922067263488,https://bit.ly/mykFsy,1.000000,msPartner wrote a tweet.\n Compare the engagem...,(B) will be liked 1.0 times more


In [18]:
# rpair_sim.to_parquet('data/transuasion_it4_pair_sim.parquet')

In [19]:
# !aws s3 sync ./data s3://crawldatafromgcp/somesh/KPITranslation

(dryrun) upload: data/transuasion_it4_pair_sim.parquet to s3://crawldatafromgcp/somesh/KPITranslation/transuasion_it4_pair_sim.parquet
(dryrun) upload: data/tweets/dataset_dict.json to s3://crawldatafromgcp/somesh/KPITranslation/tweets/dataset_dict.json
(dryrun) upload: data/tweets/train/data-00000-of-00080.arrow to s3://crawldatafromgcp/somesh/KPITranslation/tweets/train/data-00000-of-00080.arrow
(dryrun) upload: data/tweets/train/data-00001-of-00080.arrow to s3://crawldatafromgcp/somesh/KPITranslation/tweets/train/data-00001-of-00080.arrow
(dryrun) upload: data/tweets/train/data-00002-of-00080.arrow to s3://crawldatafromgcp/somesh/KPITranslation/tweets/train/data-00002-of-00080.arrow
(dryrun) upload: data/tweets/train/data-00003-of-00080.arrow to s3://crawldatafromgcp/somesh/KPITranslation/tweets/train/data-00003-of-00080.arrow
(dryrun) upload: data/tweets/train/data-00004-of-00080.arrow to s3://crawldatafromgcp/somesh/KPITranslation/tweets/train/data-00004-of-00080.arrow
(dryrun) up

# Transuasion

# Single Simulation

In [20]:
# get tweets that have been unused before hand and get their behaviour