-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Election 2020 Dataset #28
Conversation
This comment was marked as resolved.
This comment was marked as resolved.
election_2020/embeddings.py
Outdated
#fetch new batch of sample users | ||
TWEET_MAX = 50 | ||
TWEET_DELIMETER = "' || '" | ||
SAMPLE_SIZE = input("Please input a sample size:") or 100 |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/embeddings.py
Outdated
LIMIT {SAMPLE_SIZE} | ||
''' | ||
|
||
sample = bq.query_to_df(sample_sql, verbose=False) |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
This comment was marked as resolved.
This comment was marked as resolved.
election_2020/embeddings.py
Outdated
user_table = f"{project_id}.{dataset_id}.users_sample_max50" | ||
sample.to_gbq(user_table,project_id=project_id, if_exists='append') | ||
|
||
|
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/embeddings.py
Outdated
sample['latest_tweet_on'] = pd.to_datetime(sample['latest_tweet_on']) | ||
|
||
#append users to user sample table | ||
project_id = "tweet-research-shared" |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/embeddings.py
Outdated
print("ENBEDDINGS:") | ||
texts = sample['tweet_texts'].tolist() | ||
embeddings = ai.get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000) | ||
sample_embeddings = sample.assign(openai_embeddings=embeddings) |
This comment was marked as outdated.
This comment was marked as outdated.
Sorry, something went wrong.
election_2020/embeddings.py
Outdated
|
||
#save the embeddings back to bigquery | ||
embeddings_table = f"{project_id}.{dataset_id}.openai_embeddings_users_sample_max50" | ||
sample_embeddings.to_gbq(embeddings_table, project_id=project_id, if_exists='append') |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
This comment was marked as resolved.
This comment was marked as resolved.
election_2020/intro.md
Outdated
@@ -0,0 +1,34 @@ | |||
## Setup | |||
|
|||
### Virtual Environment |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
app/BigQuery_service.py
Outdated
|
||
sql = ''' | ||
SELECT user_id | ||
FROM `tweet-research-shared.election_2020_transition_2021_combined.users_sample_max50_10000` |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
app/BigQuery_service.py
Outdated
import pandas as pd | ||
|
||
credentials = service_account.Credentials.from_service_account_file( | ||
'/Users/Joyce/Desktop/Project/openai-embeddings-2023/google-credentials.json') |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
app/BigQuery_service.py
Outdated
from google.oauth2 import service_account | ||
import pandas as pd | ||
|
||
credentials = service_account.Credentials.from_service_account_file( |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
app/BigQuery_service.py
Outdated
|
||
class BigQueryService(): | ||
def __init__(self): | ||
self.client = bigquery.Client.from_service_account_json('/Users/Joyce/Desktop/Project/openai-embeddings-2023/google-credentials.json') |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
app/__init__.py
Outdated
@@ -5,6 +5,7 @@ | |||
|
|||
DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data") | |||
RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results") | |||
ELECT_FIRPATH = os.path.join(os.path.dirname(__file__), "..", "data","election") |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/__init__.py
Outdated
import os | ||
import json | ||
|
||
DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data") |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/intro.md
Outdated
|
||
Make and save predictions with the models from the cloud; for now, use logistic regression: | ||
```sh | ||
python -m election_2020.model_prediction |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/model_prediction.py
Outdated
# print(df.shape) #make sure we have read the file | ||
#else: | ||
# print(f"File not found: {filepath}") | ||
sql = '''SELECT * |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/model_prediction.py
Outdated
model_preds = model.predict(X) | ||
column_name = f"pred_{y_col}" | ||
df[column_name] = model_preds | ||
|
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/model_prediction.py
Outdated
# print('fail to download the model') | ||
|
||
model_dirname = "logistic_regression" | ||
for y_col in ["is_bot", "opinion_community", "is_toxic", "is_factual", "fourway_label"]: |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/model_prediction.py
Outdated
|
||
print('SAVING:') | ||
#append predictions to predictions table | ||
project_id = "tweet-research-shared" |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
election_2020/model_prediction.py
Outdated
from app import DATA_DIRPATH, ELECT_FIRPATH | ||
|
||
bq = BigQueryService() | ||
|
This comment was marked as outdated.
This comment was marked as outdated.
Sorry, something went wrong.
requirements.txt
Outdated
@@ -15,19 +15,24 @@ kaleido # for exporting plotly images to png / html | |||
# embeddings: | |||
openai==0.28 # ok so there is now a 1.0 interface but we originally obtained using earlier API, so pinning that here | |||
|
|||
#google cloud: | |||
google.cloud | |||
pandas-gbq |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
requirements.txt
Outdated
|
||
# machine learning: | ||
scikit-learn #==1.3.0 | ||
#category_encoders | ||
#tensorflow | ||
xgboost==1.7.6 #==1.2.1 #==1.5.0 # 1.7.6 | ||
#xgboost==1.7.6 #==1.2.1 #==1.5.0 # 1.7.6 |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
Code Climate has analyzed commit e0ee53e and detected 0 issues on this pull request. View more on Code Climate. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good overall.
|
||
#print(df.shape) | ||
|
||
fourway_dict = { |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
This comment was marked as outdated.
This comment was marked as outdated.
Sorry, something went wrong.
@@ -0,0 +1,180 @@ | |||
|
|||
Use the election_transaction_2020 dataset: get embeddings at both the user level and the tweet level, then apply the pre-trained classification models to the dataset. |
This comment was marked as resolved.
This comment was marked as resolved.
Sorry, something went wrong.
Fetch openAI embeddings for users and tweets from the Election 2020 + Transition combined dataset. Classify the users based on their embeddings.