
Election 2020 Dataset #28

Merged
merged 9 commits into main on Feb 22, 2024

Conversation

Owner

@s2t2 s2t2 commented Jan 19, 2024

Fetch OpenAI embeddings for users and tweets from the Election 2020 + Transition combined dataset. Classify the users based on their embeddings.

@s2t2

This comment was marked as resolved.

# fetch a new batch of sample users
TWEET_MAX = 50
TWEET_DELIMETER = "' || '"
SAMPLE_SIZE = int(input("Please input a sample size: ") or 100)  # default to 100; cast so the value is numeric either way

This comment was marked as resolved.
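For context on how these constants are likely consumed, here is a minimal sketch of the kind of sampling query they could parameterize, with the per-user tweet cap and the delimiter interpolated into BigQuery's STRING_AGG. The table and column names are assumptions based on other snippets in this PR, not the module's exact SQL.

```python
# Sketch only: table and column names below are assumptions, not the module's actual SQL.
TWEET_MAX = 50              # max tweets aggregated per user
TWEET_DELIMETER = "' || '"  # literal delimiter placed between a user's tweet texts
SAMPLE_SIZE = 100

sample_sql = f'''
    SELECT
        user_id,
        COUNT(DISTINCT status_id) AS tweet_count,
        MAX(DATE(created_at)) AS latest_tweet_on,
        STRING_AGG(status_text, "{TWEET_DELIMETER}"
                   ORDER BY created_at DESC LIMIT {TWEET_MAX}) AS tweet_texts
    FROM `tweet-research-shared.election_2020_transition_2021_combined.tweets` -- hypothetical table name
    GROUP BY user_id
    ORDER BY RAND()
    LIMIT {SAMPLE_SIZE}
'''
```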

LIMIT {SAMPLE_SIZE}
'''

sample = bq.query_to_df(sample_sql, verbose=False)

This comment was marked as resolved.

@s2t2

This comment was marked as resolved.

user_table = f"{project_id}.{dataset_id}.users_sample_max50"
sample.to_gbq(user_table, project_id=project_id, if_exists='append')


This comment was marked as resolved.

sample['latest_tweet_on'] = pd.to_datetime(sample['latest_tweet_on'])

#append users to user sample table
project_id = "tweet-research-shared"

This comment was marked as resolved.

print("ENBEDDINGS:")
texts = sample['tweet_texts'].tolist()
embeddings = ai.get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000)
sample_embeddings = sample.assign(openai_embeddings=embeddings)

This comment was marked as outdated.
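For readers who haven't seen the helper, a rough sketch of what ai.get_embeddings_in_dynamic_batches could look like under the pinned openai==0.28 interface: texts are packed into batches by character count so no single request grows too large. The model name and packing details are assumptions; the repo's implementation may differ.

```python
import openai  # pinned to openai==0.28 in requirements.txt

EMBEDDING_MODEL = "text-embedding-ada-002"  # assumed model name

def get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000):
    """Embed texts in batches whose combined character count stays under batch_char_limit."""
    embeddings = []
    batch, batch_chars = [], 0
    for text in texts:
        if batch and batch_chars + len(text) > batch_char_limit:
            embeddings.extend(_embed_batch(batch))
            batch, batch_chars = [], 0
        batch.append(text)
        batch_chars += len(text)
    if batch:
        embeddings.extend(_embed_batch(batch))
    return embeddings

def _embed_batch(batch):
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    # each item in response["data"] carries an "index", so results can be matched back to inputs
    return [item["embedding"] for item in sorted(response["data"], key=lambda d: d["index"])]
```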


#save the embeddings back to bigquery
embeddings_table = f"{project_id}.{dataset_id}.openai_embeddings_users_sample_max50"
sample_embeddings.to_gbq(embeddings_table, project_id=project_id, if_exists='append')

This comment was marked as resolved.

@s2t2

This comment was marked as resolved.

@@ -0,0 +1,34 @@
## Setup

### Virtual Environment

This comment was marked as resolved.


sql = '''
SELECT user_id
FROM `tweet-research-shared.election_2020_transition_2021_combined.users_sample_max50_10000`

This comment was marked as resolved.

This comment was marked as resolved.

import pandas as pd

credentials = service_account.Credentials.from_service_account_file(
'/Users/Joyce/Desktop/Project/openai-embeddings-2023/google-credentials.json')

This comment was marked as resolved.
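Since the hard-coded personal path stands out here, one possible alternative is to read the credentials path from an environment variable and pass the resulting credentials object to pandas-gbq. The variable name GOOGLE_CREDENTIALS_FILEPATH is an example, not something this repo necessarily defines.

```python
import os
from google.oauth2 import service_account
import pandas_gbq

# Example only: the env var name is an assumption, not part of this repo.
CREDENTIALS_FILEPATH = os.getenv("GOOGLE_CREDENTIALS_FILEPATH", "google-credentials.json")
credentials = service_account.Credentials.from_service_account_file(CREDENTIALS_FILEPATH)

df = pandas_gbq.read_gbq(
    "SELECT 1 AS test",
    project_id="tweet-research-shared",
    credentials=credentials,
)
```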

from google.oauth2 import service_account
import pandas as pd

credentials = service_account.Credentials.from_service_account_file(

This comment was marked as resolved.


class BigQueryService():
def __init__(self):
self.client = bigquery.Client.from_service_account_json('/Users/Joyce/Desktop/Project/openai-embeddings-2023/google-credentials.json')

This comment was marked as resolved.
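To connect this class to the bq.query_to_df(...) calls earlier in the conversation, a minimal sketch of how it could look with the credentials path taken from the environment (as in the sketch above) and a query_to_df helper added; the env var name and the method body are assumptions, and the class in the PR may differ.

```python
import os
from google.cloud import bigquery

class BigQueryService():
    def __init__(self):
        # Assumption: credentials path comes from an env var instead of a hard-coded personal path.
        credentials_filepath = os.getenv("GOOGLE_CREDENTIALS_FILEPATH", "google-credentials.json")
        self.client = bigquery.Client.from_service_account_json(credentials_filepath)

    def query_to_df(self, sql, verbose=True):
        """Run a SQL query and return the results as a pandas DataFrame."""
        if verbose:
            print(sql)
        return self.client.query(sql).to_dataframe()
```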

app/__init__.py Outdated
@@ -5,6 +5,7 @@

DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data")
RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")
ELECT_FIRPATH = os.path.join(os.path.dirname(__file__), "..", "data", "election")

This comment was marked as resolved.

import os
import json

DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data")

This comment was marked as resolved.


Make and save predictions with the models downloaded from the cloud (for now, the logistic regression models):
```sh
python -m election_2020.model_prediction
```

This comment was marked as resolved.

# print(df.shape) #make sure we have read the file
#else:
# print(f"File not found: {filepath}")
sql = '''SELECT *

This comment was marked as resolved.

model_preds = model.predict(X)
column_name = f"pred_{y_col}"
df[column_name] = model_preds

This comment was marked as resolved.

# print('fail to download the model')

model_dirname = "logistic_regression"
for y_col in ["is_bot", "opinion_community", "is_toxic", "is_factual", "fourway_label"]:

This comment was marked as resolved.
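Putting the prediction snippets above together, a rough sketch of how the per-label loop might proceed once the models have been downloaded; the file layout under RESULTS_DIRPATH, the model filename, and the use of joblib are assumptions rather than the repo's actual implementation.

```python
import os
from joblib import load  # scikit-learn models are commonly persisted with joblib
import pandas as pd

RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")
model_dirname = "logistic_regression"

def add_predictions(df: pd.DataFrame) -> pd.DataFrame:
    """Apply each pre-trained classifier to the embeddings and append a pred_{y_col} column."""
    X = pd.DataFrame(df["openai_embeddings"].tolist())  # one column per embedding dimension
    for y_col in ["is_bot", "opinion_community", "is_toxic", "is_factual", "fourway_label"]:
        # Hypothetical path: results/logistic_regression/<y_col>/model.joblib
        model_filepath = os.path.join(RESULTS_DIRPATH, model_dirname, y_col, "model.joblib")
        model = load(model_filepath)
        df[f"pred_{y_col}"] = model.predict(X)
    return df
```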


print('SAVING:')
#append predictions to predictions table
project_id = "tweet-research-shared"

This comment was marked as resolved.

from app import DATA_DIRPATH, ELECT_FIRPATH

bq = BigQueryService()

This comment was marked as outdated.

requirements.txt Outdated
@@ -15,19 +15,24 @@ kaleido # for exporting plotly images to png / html
# embeddings:
openai==0.28 # ok so there is now a 1.0 interface but we originally obtained using earlier API, so pinning that here

#google cloud:
google.cloud
pandas-gbq

This comment was marked as resolved.

requirements.txt Outdated

# machine learning:
scikit-learn #==1.3.0
#category_encoders
#tensorflow
xgboost==1.7.6 #==1.2.1 #==1.5.0 # 1.7.6
#xgboost==1.7.6 #==1.2.1 #==1.5.0 # 1.7.6

This comment was marked as resolved.


codeclimate bot commented Feb 20, 2024

Code Climate has analyzed commit e0ee53e and detected 0 issues on this pull request.


Owner Author

@s2t2 s2t2 left a comment


Looks good overall.


#print(df.shape)

fourway_dict = {

This comment was marked as resolved.

This comment was marked as outdated.

@@ -0,0 +1,180 @@

Use the Election 2020 + Transition combined dataset, obtain embeddings at both the user level and the tweet level, and apply the pre-trained classification models to the dataset.

This comment was marked as resolved.

@s2t2 s2t2 merged commit 6b9a68a into main Feb 22, 2024