<a href="https://colab.research.google.com/github/slz4025/twitter_latent_scams/blob/master/filterer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

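This notebook filters a file of tweets. It drops every tweet that is written by a verified author, is a retweet, or has a language tag other than English (tweets whose language is undefined are kept). The remaining tweets are written to `filtered_<file_name>.json`, and their IDs and whitespace-normalized text are written to `filtered_<file_name>.tsv`.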

In [None]:
# Mount the user's Google Drive at /content/drive so the data files are reachable.
# NOTE(review): the data paths configured later are relative ("drive/My Drive/..."),
# so they presumably rely on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base name of the dataset to process (Colab form field). Later cells derive the
# archive name and the JSON/TSV file names from it.
file_name = "covid_20200602" #@param {type: "string"}

In [None]:
# Input and output directories (Colab form fields). Both end with "/" because
# file names are appended to them by plain string concatenation.
data_dir = "drive/My Drive/latent_scams/data/"#@param {type: "string"}
processed_dir = "drive/My Drive/latent_scams/processed_data/"#@param {type: "string"}

# File names derived from the dataset base name `file_name`.
data_file = f"{file_name}.json"             # raw tweets (input)
output_file = f"filtered_{file_name}.json"  # tweets that pass the filters
tweet_file = f"filtered_{file_name}.tsv"    # tweet id + text table

In [None]:
# Extract <data_dir><file_name>.zip into a directory named <data_dir><file_name>,
# then move <file_name>.json (expected to be inside the archive) up into data_dir.
# $data_dir, $file_name and $data_file are Python variables from the cells above;
# IPython expands them inside `!` shell commands.
!unzip "$data_dir$file_name".zip -d "$data_dir$file_name"
!mv "$data_dir$file_name/$data_file" "$data_dir$data_file"

In [None]:
import json
import string

import pandas as pd

In [None]:
def check_verified(tweet):
    """Return the ``verified`` flag of the tweet's author (``tweet["user"]``)."""
    author = tweet["user"]
    return author["verified"]

def check_retweet(tweet):
    """Return True when the tweet has a ``retweeted_status`` key, i.e. is a retweet."""
    is_retweet = "retweeted_status" in tweet
    return is_retweet

def check_english(tweet):
    """Return True when the tweet is tagged as English or has no language tag.

    A tweet whose ``lang`` is ``None`` is treated as "undefined" and accepted.
    A tweet with no ``lang`` key at all is treated the same way instead of
    raising ``KeyError``.
    """
    # NOTE(review): Twitter may also report an undetected language as the string
    # "und"; such tweets are rejected here, as before -- confirm that is intended.
    lang = tweet.get("lang")
    return lang is None or lang == "en"

# First pass: read the raw file (one JSON-encoded tweet per line) and copy every
# tweet that passes all three filters, unchanged, to the filtered output file.
# Both files are opened as UTF-8 explicitly so that non-ASCII tweet text is
# handled the same way regardless of the platform's default encoding.
with open(processed_dir + output_file, "w", encoding="utf-8") as out, \
        open(data_dir + data_file, encoding="utf-8") as contents:
    count = 0  # number of non-blank lines handled so far
    for line in contents:
        line = line.strip()
        if not line:
            continue  # blank lines are skipped and not counted
        t = json.loads(line)
        # Keep only tweets from unverified authors that are not retweets and
        # are English (or have no language tag).
        keep = not check_verified(t) and not check_retweet(t) and check_english(t)
        if keep:
            out.write(line + "\n")
        # Progress report every 1000 tweets.
        if count % 1000 == 0:
            print("Processed ", count)
        count += 1

In [None]:
# Translation table that maps every character in string.whitespace (space, tab,
# newline, carriage return, vertical tab, form feed) to a single space. It relies
# on `import string` from the notebook's import cell.
no_whitespace = str.maketrans({s: ' ' for s in string.whitespace})


def get_body(tweet):
    """Return the tweet's text with each whitespace character replaced by a space.

    When the tweet has an ``extended_tweet`` entry, its ``full_text`` is used;
    otherwise the top-level ``text`` is used. Whitespace is replaced one-for-one
    (runs are not collapsed), so the text stays on one line in the TSV export.
    """
    if "extended_tweet" in tweet:
        body = tweet["extended_tweet"]["full_text"]
    else:
        body = tweet["text"]
    return body.translate(no_whitespace)

# Second pass: read the filtered tweets back and collect (tweet id, cleaned text)
# records. The file is read as UTF-8 explicitly, and blank lines are skipped the
# same way as in the filtering pass so json.loads never sees an empty string.
tweets = []
with open(processed_dir + output_file, encoding="utf-8") as contents:
    for line in contents:
        line = line.strip()
        if not line:
            continue
        t = json.loads(line)
        tweets.append({"tweet_id": t["id_str"], "body": get_body(t)})

# Build the table once from the list of records, then write it tab-separated.
# to_csv keeps its default index=True, so the row index is written as the first
# (unnamed) column, exactly as before.
tweet_pd = pd.DataFrame(tweets, columns=["tweet_id", "body"])
tweet_pd.to_csv(sep="\t", path_or_buf=processed_dir + tweet_file)