Exporting BigQuery query results to Google Drive.

### BigQuery Service

In [1]:
from google.colab import auth

# Authenticate the current Colab user so the Google Cloud calls made later in
# this notebook (the BigQuery client) can run with that user's credentials.
auth.authenticate_user()

In [2]:
# Optional debugging aid, intentionally left disabled: uncomment these lines to
# inspect the default credentials that are picked up after authentication.
#from google.auth import default
#creds, _ = default()
#print(creds)

In [3]:
from google.cloud import bigquery
from pandas import DataFrame, read_gbq

# Google Cloud project used by default for the BigQuery client and queries.
# The commented-out ID is an alternative project; swap the two lines to use it.
#PROJECT_ID = "tweet-collector-py"
PROJECT_ID = "tweet-research-shared"

class BigQueryService:
    """Convenience wrapper around a BigQuery client bound to a single project.

    Attributes:
        project_id: ID of the Google Cloud project used for queries.
        client: ``bigquery.Client`` created for that project.
    """

    def __init__(self, project_id=PROJECT_ID):
        """Create the BigQuery client for ``project_id`` (defaults to PROJECT_ID)."""
        self.project_id = project_id
        self.client = bigquery.Client(project=self.project_id)

    def execute_query(self, sql, verbose=True):
        """Submit ``sql`` as a query job and return the job's result.

        Args:
            sql: Query text, passed to the client unchanged.
            verbose: When truthy, print the query text before submitting it.

        Returns:
            The value of ``job.result()`` for the submitted query job.
        """
        if verbose:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """High-level wrapper that runs ``sql`` and returns a DataFrame.

        The query is executed by ``pandas.read_gbq`` using ``self.project_id``;
        ``self.client`` is not involved in this call.

        Args:
            sql: Query text, passed to ``read_gbq`` unchanged.
            verbose: When truthy, print the query text before running it.

        Returns:
            The DataFrame produced by ``read_gbq``.
        """
        if verbose:
            print(sql)
        # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq
        # NOTE: pandas 2.2 deprecates read_gbq in favor of pandas_gbq.read_gbq,
        # so newer pandas versions emit a FutureWarning here.
        return read_gbq(sql, project_id=self.project_id, progress_bar_type="tqdm_notebook")


In [4]:
# Instantiate the service with the default PROJECT_ID; the constructor also
# creates the BigQuery client. Printing the object just confirms it was built.
bq = BigQueryService()
print(bq)

<__main__.BigQueryService object at 0x7c481e331ea0>


In [5]:
# List the datasets returned by the client and print each one's reference.
# Materialized into a list so `datasets` can be reused after the loop.
print("DATASETS:")
datasets = list(bq.client.list_datasets())
for ds in datasets:
    #print("...", ds.project, ds.dataset_id)
    print("...", ds.reference)

DATASETS:
... tweet-research-shared.analysis_2021
... tweet-research-shared.disinfo_2021
... tweet-research-shared.election_2020
... tweet-research-shared.election_2020_transition_2021_combined
... tweet-research-shared.f1_racing_2023
... tweet-research-shared.impeachment_2020
... tweet-research-shared.impeachment_2021
... tweet-research-shared.transition_2021
... tweet-research-shared.truth_2023
... tweet-research-shared.user_aak154
... tweet-research-shared.user_ejc128
... tweet-research-shared.user_jrd154
... tweet-research-shared.user_llj40
... tweet-research-shared.user_mjr300
... tweet-research-shared.user_yc986


# Usage

Fetch the table of "original" timeline tweets (excludes retweets, replies, and quote tweets):

In [6]:
print("------------")
print("QUERY:")
# The query has no WHERE or LIMIT clause, so every row of the table is
# downloaded into the DataFrame.
sql = """
    SELECT user_id, status_id, status_text, geo, created_at, lookup_at
    FROM `tweet-research-shared.disinfo_2021.q_user_timeline_tweets_original`
"""

# verbose=True echoes the SQL text before the download starts.
df = bq.query_to_df(sql, verbose=True)
df

------------
QUERY:

    SELECT user_id, status_id, status_text, geo, created_at, lookup_at
    FROM `tweet-research-shared.disinfo_2021.q_user_timeline_tweets_original`



Downloading:   0%|          |

Unnamed: 0,user_id,status_id,status_text,geo,created_at,lookup_at
0,5878912,1290760850760818691,Radioactive Great white sharks swim among us a...,,2020-08-04 21:25:42+00:00,2021-03-25 04:52:55+00:00
1,5878912,1290995554739134466,#SaintedAnon on the #BeirutExplosion https:/...,,2020-08-05 12:58:20+00:00,2021-03-25 04:52:55+00:00
2,5878912,1291003614400413696,Is the #MandelaEffect proof we are now living ...,,2020-08-05 13:30:22+00:00,2021-03-25 04:52:55+00:00
3,5878912,1291071358638514176,To mark the Hiroshima-Nagasaki 75th anniversar...,,2020-08-05 17:59:33+00:00,2021-03-25 04:52:55+00:00
4,5878912,1291145303031087105,@VecchioCal https://t.co/3SupdrXEp0,,2020-08-05 22:53:23+00:00,2021-03-25 04:52:55+00:00
...,...,...,...,...,...,...
858445,1240979986716164096,1350907926135906312,Future proves past https://t.co/YDfbTrb0eE,,2021-01-17 20:48:42+00:00,2021-03-25 09:01:03+00:00
858446,1240979986716164096,1350986931866398726,From last year spring 2020. https://t.co/ODNj...,,2021-01-18 02:02:39+00:00,2021-03-25 09:01:03+00:00
858447,1240979986716164096,1351605098972131328,No more cultural appropriations people . Cmon...,,2021-01-19 18:59:01+00:00,2021-03-25 09:01:03+00:00
858448,1240979986716164096,1353034424099733506,It’s up to the parents to raise em right now. ...,,2021-01-23 17:38:39+00:00,2021-03-25 09:01:03+00:00


In [7]:
# Confirm the column names match the SELECT list of the query.
df.columns.tolist()

['user_id', 'status_id', 'status_text', 'geo', 'created_at', 'lookup_at']

In [10]:
# Summary of row count, per-column non-null counts, dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858450 entries, 0 to 858449
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype              
---  ------       --------------   -----              
 0   user_id      858450 non-null  Int64              
 1   status_id    858450 non-null  Int64              
 2   status_text  858450 non-null  object             
 3   geo          546332 non-null  object             
 4   created_at   858450 non-null  datetime64[ns, UTC]
 5   lookup_at    858450 non-null  datetime64[ns, UTC]
dtypes: Int64(2), datetime64[ns, UTC](2), object(2)
memory usage: 40.9+ MB


In [8]:
# Column dtypes on their own (the same information also appears in df.info()).
df.dtypes

user_id                      Int64
status_id                    Int64
status_text                 object
geo                         object
created_at     datetime64[ns, UTC]
lookup_at      datetime64[ns, UTC]
dtype: object

In [12]:
# Show the first row's tweet text in full; the table display truncates long strings.
df.iloc[0]["status_text"]

'Radioactive Great white sharks swim among us at San Onofre https://t.co/Kmdi25K4is'

In [15]:
# Number of tweets per user, ordered from the most to the least prolific.
users_pivot = (
    df.groupby("user_id")["status_id"]
    .count()
    .sort_values(ascending=False)
)
users_pivot

user_id
22778877               3250
1124154425457516544    3201
368563306              3160
62613436               3153
1017455405453234176    3052
                       ... 
884372312992227329        1
1189184489517248513       1
1076881762264010752       1
1189907538579513344       1
999734366673690624        1
Name: status_id, Length: 4493, dtype: int64

### Exporting to Drive

In [16]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the Drive paths used in the
# following cells are reachable from this runtime.
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
# Shared research folder on the mounted Drive.
# You might need to create a Google Drive SHORTCUT that has this same path,
# or update the path to match your own Google Drive organization.
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'
# Evaluates to True when the folder is reachable; fix the path above if it is False.
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/DS Research Shared 2024


True

In [19]:
# Data folder for this project inside the shared folder; the export cell
# writes into it, so this check should evaluate to True before continuing.
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Disinformation 2021 Embeddings", "data")
os.path.isdir(DATA_DIRPATH)

True

In [None]:
# Destination file inside the data folder, named after the source BigQuery table.
csv_filepath = os.path.join(DATA_DIRPATH, "q_user_timeline_tweets_original.csv.gz")

# Write the whole DataFrame as a gzip-compressed CSV; index=False leaves out
# the row-index column.
df.to_csv(csv_filepath, index=False, compression="gzip")