In [25]:
import pandas as pd
import re
import numpy as np

##  clean up tweets

In [94]:
def clean_tweet(tweet, allow_new_lines = False):
    tweet = tweet.replace('&amp;', '&')
    tweet = tweet.replace('&lt;', '<')
    tweet = tweet.replace('&gt;', '>')
    bad_start = ['http:', 'https:']
    for w in bad_start:
        tweet = re.sub(f" {w}\\S+", "", tweet)      # removes white space before url
        tweet = re.sub(f"{w}\\S+ ", "", tweet)      # in case a tweet starts with a url
        tweet = re.sub(f"\n{w}\\S+ ", "", tweet)    # in case the url is on a new line
        tweet = re.sub(f"\n{w}\\S+", "", tweet)     # in case the url is alone on a new line
        tweet = re.sub(f"{w}\\S+", "", tweet)       # any other case?
    tweet = re.sub(' +', ' ', tweet)                # replace multiple spaces with one space
    if not allow_new_lines:                         # TODO: predictions seem better without new lines
        tweet = ' '.join(tweet.split("\n"))
    return tweet.strip()


def drop_mentions(tweet):
    words = tweet.split(" ")
    return " ".join([w for w in words if not w.startswith("@")]).strip()

def get_length(tweet):
    return len(tweet.split(" "))

def boring_tweet(tweet, min_non_boring_words = 5):
    boring_stuff = ['http', '@', '#']
    not_boring_words = len([None for w in tweet.split() if all(bs not in w.lower() for bs in boring_stuff)])
    return not_boring_words < min_non_boring_words

## ancil data

In [23]:
data = pd.read_csv("final_challenge_data.csv")
print(data.shape)
data["text_cleaned_2"] = data.text.apply(clean_tweet)
data["words"] = data.text_cleaned_2.apply(lambda w: len(w.split(" ")))
data["boring"] = data.text_cleaned_2.apply(boring_tweet)
keep = (data.words>=10) & (~data.boring)
data = data.loc[keep]
print(data.shape)


  exec(code_obj, self.user_global_ns, self.user_ns)


(16897, 42)
(12600, 45)


In [22]:
(data.words>=10).mean(), data.boring.mean()

(0.7632715866721903, 0.13351482511688464)

In [47]:
train_mask = np.random.rand(len(data))<.9
test_mask = ~train_mask

In [48]:
data.loc[train_mask,"text_cleaned_2"].to_csv("temp_train.csv", index = False)
data.loc[test_mask,"text_cleaned_2"].to_csv("temp_test.csv", index = False)



### make features

In [76]:
data_test = pd.read_csv("temp_test.csv")

In [117]:
from transformers import AutoModelForCausalLM, AutoTokenizer


In [2]:
model_name = "gpt2-medium"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 

In [5]:
tokenizer("hello",add_special_tokens=True)

{'input_ids': [31373], 'attention_mask': [1]}

In [86]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 
data_test = pd.read_csv("temp_test.csv")
texts = tokenizer.eos_token +  tokenizer.eos_token.join(data_test.iloc[:,0]) +tokenizer.eos_token
features = tokenizer(texts, stride = 4, max_length=128, return_overflowing_tokens=True, truncation=True, return_tensors="pt", padding=True)

In [73]:
tokenizer.eos_token_id

50256

In [87]:
features["input_ids"].shape

torch.Size([491, 128])

### download misinfo tweets

In [93]:
data_ids = pd.read_csv("VaxMisinfoData.csv")
data_ids

Unnamed: 0,id,is_misinfo
0,1344795424855642112,0
1,1344794858133860353,0
2,1344794822691983360,0
3,1344794752819077123,1
4,1344792070507134977,0
...,...,...
15068,1413087751474397186,0
15069,1413087030578401283,0
15070,1413085793397186565,1
15071,1413085519710363648,0


In [31]:
print(data.shape)
data["text_cleaned_2"] = data.text.apply(clean_tweet)
data["words"] = data.text_cleaned_2.apply(lambda w: len(w.split(" ")))
data["boring"] = data.text_cleaned_2.apply(boring_tweet)
keep = (data.words>=10) & (~data.boring)
data = data.loc[keep]
print(data.shape)



0    9322
1    5751
Name: is_misinfo, dtype: int64

### download tweets

In [37]:
CONSUMER_KEY="AUqvCh7D2XNvEC018izM039pt"
CONSUMER_SECRET="ipdhy74DGePlhIVjaDTtiHz2alGOMMuuyolIptbhaAzVLTO9yi"
ACCESS_TOKEN="71959189-iKh7iNjKHS5KbTnD77FRSdDf5DdGIt6ciKH9dxp41"
ACCESS_TOKEN_SECRET="DW7ZhRZYIqd0D8rbTvE2Y3yQv0jJ55ezivAY1yNebNLMv"

In [38]:
import tweepy

CONSUMER_KEY="AUqvCh7D2XNvEC018izM039pt"
CONSUMER_SECRET="ipdhy74DGePlhIVjaDTtiHz2alGOMMuuyolIptbhaAzVLTO9yi"
ACCESS_TOKEN="71959189-iKh7iNjKHS5KbTnD77FRSdDf5DdGIt6ciKH9dxp41"
ACCESS_TOKEN_SECRET="DW7ZhRZYIqd0D8rbTvE2Y3yQv0jJ55ezivAY1yNebNLMv"

client = tweepy.Client(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET
)

In [82]:
ids =data["id"].astype(str).values

In [81]:
import tqdm.auto as tqdm

In [86]:
batch_size = 100
max_per_interval = 900
data = []
num_rs = 0
for i in tqdm.trange(0,len(ids),batch_size):
    batch_ids = ",".join(ids[i:i+batch_size])
    out = client.get_tweets(batch_ids, tweet_fields="context_annotations", user_auth=True)
    data.extend([dict(o) for o in out.data])
    num_rs += 1
    if num_rs > max_per_interval:
        break
    # break

  0%|          | 0/151 [00:00<?, ?it/s]

In [87]:
len(data)

12670

In [89]:
data_tweets = pd.DataFrame(data)

In [102]:
data_tweets.to_json("tweets_vaccine.jsonl",orient="records",lines=True)
data_tweets.to_csv("tweets_vaccine.csv", index=False)

In [97]:
data_tweets.dtypes

context_annotations    object
id                      int64
text                   object
withheld               object
dtype: object

In [98]:
data_tweets["is_misinfo"] =  data_ids.set_index('id').loc[data_tweets['id'].values,"is_misinfo"].values

In [100]:
data_tweets.is_misinfo.value_counts()

0    8277
1    4393
Name: is_misinfo, dtype: int64

## make dataset for training 

In [106]:
data_vaccine = pd.read_csv("tweets_vaccine.csv")

In [108]:
data_vaccine["text"] = data_vaccine["text"].apply(clean_tweet)

In [111]:
prompts = ["pro vaccine: ", "anti vaccine: "]

In [114]:
data_vaccine["text_with_prompt"] = [prompts[t.is_misinfo] + t.text for i,t in data_vaccine.T.items()]

In [116]:
train_mask = np.random.rand(len(data_vaccine))<.9
test_mask = ~train_mask

data_vaccine.loc[train_mask,["text_with_prompt", "text","is_misinfo"]].to_csv("vaccine_train.csv", index = False)
data_vaccine.loc[test_mask,["text_with_prompt", "text","is_misinfo"]].to_csv("vaccine_test.csv", index = False)



### download more data

In [1]:
# def download_file(file_name):
import tweepy
import tqdm.auto as tqdm
import pandas as pd
import time


In [2]:
time.time()

1649251528.6180978

In [148]:
pd.read_csv("ANTiVax/VaccineTweets/Apr_wk1.csv").shape

(472092, 1)

In [4]:


def download_file(file_name=None):
    ids =pd.read_csv(file_name)["id"].astype(str).values
    
    
    CONSUMER_KEY="AUqvCh7D2XNvEC018izM039pt"
    CONSUMER_SECRET="ipdhy74DGePlhIVjaDTtiHz2alGOMMuuyolIptbhaAzVLTO9yi"
    ACCESS_TOKEN="71959189-iKh7iNjKHS5KbTnD77FRSdDf5DdGIt6ciKH9dxp41"
    ACCESS_TOKEN_SECRET="DW7ZhRZYIqd0D8rbTvE2Y3yQv0jJ55ezivAY1yNebNLMv"

    client = tweepy.Client(
        consumer_key=CONSUMER_KEY,
        consumer_secret=CONSUMER_SECRET,
        access_token=ACCESS_TOKEN,
        access_token_secret=ACCESS_TOKEN_SECRET
    )
    
    
    batch_size = 100
    max_per_interval = 890
    max_time_per_interval_seconds = 16*60
    data = []
    num_requests = 0
    start_time = time.time()
    
    for i in tqdm.trange(0,len(ids),batch_size):
        
        
        
        if num_requests == max_per_interval:
            elapsed_time = time.time() - start_time
            sleep_time = max_time_per_interval_seconds - elapsed_time
            if sleep_time>0:
                print(f"sleeping for {elapsed_time}")
                time.sleep(sleep_time)
            start_time = time.time()
            num_requests = 0
        
        
        batch_ids = ",".join(ids[i:i+batch_size])
        out = client.get_tweets(batch_ids, tweet_fields="context_annotations", user_auth=True)
        data.extend([dict(o) for o in out.data])
        num_requests += 1
        
    pd.DataFrame(data).to_json(f"{file_name}.jsonl",orient="records",lines=True)

        

download_file("ANTiVax/VaccineTweets/Dec_wk3.csv")

  0%|          | 0/6528 [00:00<?, ?it/s]

sleeping for 417.63304591178894
sleeping for 419.9492437839508
sleeping for 404.5091998577118
sleeping for 387.8233082294464
sleeping for 394.14096426963806
sleeping for 388.0395293235779
sleeping for 395.15634298324585


### antivax dataset



In [31]:
import glob
files = glob.glob("avax-tweets-dataset/streaming-tweetids/*/*.txt")
target = 500e3
sample_per_file = int(target/len(files))
ids = []
for f in tqdm.tqdm(files):
    try:
        ids.extend(pd.read_csv(f,header=None).sample(sample_per_file).iloc[:,0].values.tolist())
    except:
        continue

  0%|          | 0/9428 [00:00<?, ?it/s]

In [65]:
temp = pd.DataFrame(dict(id = ids))#.to_csv("avax-tweets-dataset_sample500k.csv", index = False)
temp = temp.loc[temp["id"].astype(str).str.len()==19]
temp.to_csv("avax-tweets-dataset_sample500k.csv", index = False)

In [66]:
download_file("avax-tweets-dataset_sample500k.csv")

  0%|          | 0/4995 [00:00<?, ?it/s]

sleeping for 265.8859214782715
sleeping for 389.2414300441742
sleeping for 397.12471747398376
sleeping for 413.21429109573364
sleeping for 400.4697802066803


In [85]:
temp = pd.read_json("avax-tweets-dataset_sample500k.csv.jsonl",lines = True)
text = temp.text.str.replace("RT ","").drop_duplicates()
(~text.apply(boring_tweet)).sum()

173858

In [93]:


text = text.apply(drop_mentions)
text.apply(get_length).describe()

count    198400.000000
mean         21.008266
std          13.240846
min           1.000000
25%          12.000000
50%          19.000000
75%          28.000000
max         142.000000
Name: text, dtype: float64

### anti climate data

In [26]:
temp = pd.read_csv("climate_id.txt.03",header=None).iloc[:,0].sample(int(400e3)).values.tolist()
pd.DataFrame(dict(id = temp)).to_csv("climate-dataset_sample400k.csv", index = False)

In [27]:
download_file("climate-dataset_sample400k.csv")

  0%|          | 0/4000 [00:00<?, ?it/s]

sleeping for 427.3742322921753
sleeping for 423.9734694957733
sleeping for 434.57163286209106
sleeping for 501.6383218765259


In [81]:
temp = pd.read_json("climate-dataset_sample400k.csv.jsonl",lines = True)
text = temp.text.str.replace("RT ","").drop_duplicates()
(~text.apply(boring_tweet)).sum()

130587

In [71]:
#### only keep anti climate stuff
keywords = "#climatechangehoax,#climatedeniers,#climatechangeisfalse,#globalwarminghoax,#climatechangenotreal,hoax".split(",")
keywords

['#climatechangehoax',
 '#climatedeniers',
 '#climatechangeisfalse',
 '#globalwarminghoax',
 '#climatechangenotreal',
 'hoax']

In [83]:
for w in keywords:
    print(w,text.str.contains(w,case = False,regex=False).sum())

#climatechangehoax 47
#climatedeniers 34
#climatechangeisfalse 0
#globalwarminghoax 18
#climatechangenotreal 0
hoax 1056


In [77]:
text.sample(10)

153841    @SkyNewsBreak Wish governments were as quick t...
208795    @MhdiLmn: Sur notre planète, 100 entreprises p...
152857    @artbylynettag: Tony Abbott seriously needs to...
30580     @Tpopularfront: Last time I looked, Jeremy Cor...
92265     @GuardianHeather: NEW - Jeremy Corbyn has *dec...
108439    From @rollingstone - — California Gov. Jerry B...
12140     @CECHR_UoD: Hurricanes\nA perfect storm of cha...
61732     The US is a hotbed of climate science denial w...
220690    @PennsylvaniaDEP: .@GovernorTomWolf announces ...
32763     Folks living in the West SGV, East LA, and NEL...
Name: text, dtype: object

In [1]:
### save just the tweet ids

In [2]:
import pandas as pd

In [8]:
pd.read_csv("datasets/graphika.csv").loc[:,["id"]].to_csv("datasets/covid_ids.csv", index = False)

  exec(code_obj, self.user_global_ns, self.user_ns)
