In [None]:
# getting stock price intraday data from Alpha Vantage into 12 csv files
import requests
import time

# The extended intraday endpoint is queried once per slice (year1month1 ... year1month12);
# every slice is stored in its own CSV file.
base_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=TSLA&interval=1min&slice=year1month{}&apikey=XXXXXXX'

for month in range(1, 13):
    url = base_url.format(month)
    # A timeout keeps a stalled connection from hanging the whole notebook.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of silently saving the error body as a .csv file.
    response.raise_for_status()
    with open(f'tsla_1min_{month}.csv', 'wb') as f:
        f.write(response.content)
    # Pause between requests — presumably to stay under the API rate limit; TODO confirm.
    time.sleep(13)

In [None]:
# merge many CSV files into one
import os
import glob
import pandas as pd

input_dir = 'Project/alphav TSLA 1min/april 2022 -2023'

# glob returns paths in a file-system dependent order; sorting makes the row order
# of the merged file reproducible between runs and machines.
input_files = sorted(glob.glob(os.path.join(input_dir, '*.csv')))
if not input_files:
    # pd.concat on an empty list fails with an unhelpful message; name the real problem.
    raise FileNotFoundError(f'no CSV files found in {input_dir!r}')

# ignore_index gives the merged frame a clean 0..n-1 index instead of repeated per-file indices.
df = pd.concat((pd.read_csv(path) for path in input_files), ignore_index=True)

output_file = '2022-2023_1min.csv'
df.to_csv(output_file, index=False)

In [None]:
# getting URLs of media files for AWS Rekognition
import csv

apify_15_23 = pd.read_csv('apify_15_23_all.csv')

# Columns that may hold a media URL for a row (media/0 ... media/3).
media_columns = [f'media/{i}/media_url' for i in range(4)]

# Write one "id,url" line per media file. csv.writer quotes values that contain
# commas or quotes, which plain string formatting would turn into a broken file.
# newline='' + lineterminator='\n' keeps the same line endings as before.
with open('ids_urls.csv', 'w', newline='') as f_ids:
    writer = csv.writer(f_ids, lineterminator='\n')
    writer.writerow(['id', 'url'])
    for _, row in apify_15_23.iterrows():
        tweet_id = row['id']
        for column in media_columns:
            media_url = row[column]
            # Missing media slots are not strings (NaN after read_csv), so skip them.
            if isinstance(media_url, str):
                writer.writerow([tweet_id, media_url])

In [None]:
# AWS rekognition request -> to csv
import boto3
from urllib3.exceptions import InsecureRequestWarning

# Images are downloaded with verify=False below, so the resulting urllib3
# InsecureRequestWarning is silenced here.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

df_links = pd.read_csv('ids_urls.csv')

columns = ['ID', 'Link', 'Detect Labels', 'Detect Faces', 'Recognize Celebrities', 'Detect Text']
dtypes = {'ID': int, 'Link': str, 'Detect Labels': str, 'Detect Faces': str, 'Recognize Celebrities': str, 'Detect Text': str}

# NOTE(review): only an access key id is passed here (value redacted) — confirm how the
# secret key is supplied, and prefer loading credentials from the environment over hardcoding.
session = boto3.session.Session(aws_access_key_id='XXXXXXXXXXXXXXXXX')

rekognition_client = session.client(service_name='rekognition', region_name='eu-west-1')

# Collect one record per image and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
records = []
for _, row in df_links.iterrows():
    link = row['url']
    print(link)

    http_response = requests.get(link, verify=False)
    # Stop on a failed download instead of sending an error page to Rekognition.
    http_response.raise_for_status()
    imgbytes = http_response.content
    image = {'Bytes': imgbytes}

    print('DETECT_LABELS')
    labels = rekognition_client.detect_labels(Image=image, MaxLabels=10, MinConfidence=90)['Labels']

    print('DETECT_FACES')
    faceDetails = rekognition_client.detect_faces(Image=image)['FaceDetails']

    print('RECOGNIZE_CELEBRITIES')
    celebrityFaces = rekognition_client.recognize_celebrities(Image=image)['CelebrityFaces']

    print('DETECT_TEXT')
    textDetections = rekognition_client.detect_text(Image=image)['TextDetections']

    records.append({'ID': row['id'], 'Link': link, 'Detect Labels': labels, 'Detect Faces': faceDetails,
                    'Recognize Celebrities': celebrityFaces, 'Detect Text': textDetections})

    print('')

df = pd.DataFrame(records, columns=columns)
if df.empty:
    # Keep the declared column dtypes when there was nothing to process.
    df = df.astype(dtypes)

df.to_csv('rekognition_elon.csv',index=False)

In [None]:
# AWS rekognition results processing
# Reload the saved Rekognition results from disk; the nested response columns
# ('Detect Labels', 'Recognize Celebrities') are parsed back from text further down.
df = pd.read_csv('rekognition_elon.csv')

# same celeb twice on picture (set not gonna work, 'cause we want to extract their face emotions too)
def remove_duplicates(records):
    """Return the records with repeated 'Name' values removed.

    Only the first record seen for each name is kept, and the relative order
    of the kept records is the same as in the input.

    Args:
        records: iterable of dicts, each with a 'Name' key.

    Returns:
        A new list holding the first record for every distinct name.
    """
    first_by_name = {}
    for entry in records:
        # setdefault stores the entry only if the name has not been seen yet;
        # dicts keep insertion order, so first occurrences stay in input order.
        first_by_name.setdefault(entry['Name'], entry)
    return list(first_by_name.values())

import ast

# The Rekognition columns are stored in the CSV as Python-literal text. ast.literal_eval
# parses literals only, whereas eval would execute any code found in the file.
df['Labels'] = df['Detect Labels'].apply(lambda x: [d['Name'] for d in ast.literal_eval(x)])
df['Unique Celeb'] = df['Recognize Celebrities'].apply(ast.literal_eval).apply(remove_duplicates)
df['Celeb_uniq'] = df['Unique Celeb'].apply(lambda x: [d['Name'] for d in x])
# The first entry of each face's 'Emotions' list is taken as that face's emotion.
df['Emotion'] = df['Unique Celeb'].apply(lambda x: [d['Face']['Emotions'][0]['Type'] for d in x])
df['Smile'] = df['Unique Celeb'].apply(lambda x: [d['Face']['Smile']['Value'] for d in x])
df['Gender'] = df['Unique Celeb'].apply(lambda x: [d['KnownGender']['Type'] for d in x])


# One output row per (image, celebrity). The rows are gathered in a list and turned into a
# DataFrame once: DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
# The comprehension also keeps its loop variables out of the notebook namespace.
celeb_columns = ['Celeb_uniq', 'Emotion', 'Smile', 'Gender', 'Link', 'ID']
celeb_records = [
    {
        'Celeb_uniq': celeb_name,
        'Emotion': celeb_emotion,
        'Smile': celeb_smile,
        'Gender': celeb_gender,
        'Link': row['Link'],
        'ID': row['ID'],
    }
    for _, row in df.iterrows()
    for celeb_name, celeb_emotion, celeb_smile, celeb_gender
    in zip(row['Celeb_uniq'], row['Emotion'], row['Smile'], row['Gender'])
]
# Passing the columns explicitly keeps the header even when no celebrity was recognized.
normalized_df = pd.DataFrame(celeb_records, columns=celeb_columns)
normalized_df.to_csv('celebs.csv',index=False)

# The first label of each image (None when the image has no labels) is kept as its main label.
main_label = pd.DataFrame({
    'Label': df['Labels'].apply(lambda x: x[0] if x else None),
    'Link': df['Link'],
    'ID': df['ID']
    })
main_label.to_csv('main label.csv',index=False)

In [None]:
# emotions / sentiment etc. in tweets
import tweetnlp

# NOTE(review): `df` must hold the tweets with a 'full_text' column at this point —
# TODO confirm which cell loads it.

def _top_label(classify):
    """Wrap a prediction method so that it returns only the 'label' entry of its result."""
    return lambda text: classify(text)['label']


# tweetnlp classifiers: one model per task, each adding one label column.
model = tweetnlp.load_model('topic_classification', multi_label=False)
df['topic'] = df['full_text'].apply(_top_label(model.topic))

irony = tweetnlp.load_model('irony')
df['irony'] = df['full_text'].apply(_top_label(irony.irony))

emotion = tweetnlp.load_model('emotion')
df['emotion'] = df['full_text'].apply(_top_label(emotion.emotion))

sent = tweetnlp.load_model('sentiment')
df['sentiment'] = df['full_text'].apply(_top_label(sent.sentiment))

off = tweetnlp.load_model('offensive')
df['offensive'] = df['full_text'].apply(_top_label(off.offensive))


# A second emotion model, used for comparison with the tweetnlp one:
# Jochen Hartmann, "Emotion English DistilRoBERTa-base". https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/, 2022.
# (7 emotions: anger 🤬, disgust 🤢, fear 😨, joy 😀, neutral 😐, sadness 😭, surprise 😲)
from transformers import pipeline
# NOTE: from here on `emotion` is the transformers pipeline, no longer the tweetnlp model.
emotion = pipeline(model='j-hartmann/emotion-english-distilroberta-base')

# Run the pipeline once per tweet, then split the first prediction into label and score columns.
hartmann_raw = df['full_text'].apply(emotion)
df['emotion_hartmann'] = hartmann_raw.apply(lambda preds: preds[0]['label'])
df['score_hartmann'] = hartmann_raw.apply(lambda preds: preds[0]['score'])

df.to_csv('tweets_sentiment.csv',index=False)

In [None]:
# tweets on Elon Musk Twitter Acquisition (July 2022)
# https://www.kaggle.com/datasets/aneeshtickoo/tweets-on-elon-musk-twitter-acquisition

# has a lot of spam, so removing tweets with only links, + delete duplicates as those are also spam/news headers
# 72k -> 44k tweets
import re

deal = pd.read_csv('Project/twitter_deal.csv')
deal = deal.dropna(axis=1, how='all')  # drop columns that are entirely empty

# regex=True is required: since pandas 2.0, str.replace treats the pattern as a literal
# string by default, which would leave hashtags and links in place.
deal['tweet'] = deal['tweet'].str.replace(r'\B#\w+\b', '', regex=True)  # remove hashtags
deal['tweet'] = deal['tweet'].str.replace(r'http\S+|www\S+', '', flags=re.MULTILINE, regex=True)  # remove links
deal['tweet'] = deal['tweet'].str.strip()

# Drop tweets that are missing or empty after cleaning. A missing tweet would otherwise
# pass the != '' test and break the boolean mask and the models below.
has_text = deal['tweet'].notna() & (deal['tweet'] != '')
deal_clean = deal[has_text].copy()
deal_clean.reset_index(drop=True, inplace=True)

spam_pattern = r'extra \d+% off|save \$ with'
is_spam = deal_clean['tweet'].str.contains(spam_pattern, regex=True, na=False)
deal_clean = deal_clean[~is_spam].copy()

# Identical texts are treated as spam / repeated headlines: keep the first occurrence only.
deal_clean.drop_duplicates(subset=['tweet'], inplace=True)
deal_clean.reset_index(drop=True, inplace=True)

deal_clean.to_csv('deal_clean.csv',index=False)

# sentiments
# `emotion` is rebound to the transformers pipeline in the tweet-sentiment cell, so
# `emotion.emotion(...)` fails when the notebook is run top to bottom. The tweetnlp
# emotion model is therefore loaded under its own name here.
tweet_emotion = tweetnlp.load_model('emotion')

deal_clean['sentiment'] = deal_clean['tweet'].apply(lambda x: sent.sentiment(x)['label'])
deal_clean['emotion'] = deal_clean['tweet'].apply(lambda x: tweet_emotion.emotion(x)['label'])
deal_clean['irony'] = deal_clean['tweet'].apply(lambda x: irony.irony(x)['label'])
deal_clean['offensive'] = deal_clean['tweet'].apply(lambda x: off.offensive(x)['label'])

# hartmann model had an error with 1 row, so we catch it
def process_tweet(tweet):
    """Run the module-level `emotion` callable on one tweet.

    Args:
        tweet: the text passed to `emotion`.

    Returns:
        Whatever `emotion(tweet)` returns, or None when the call raises
        RuntimeError. Other exception types are not caught.
    """
    result = None
    try:
        result = emotion(tweet)
    except RuntimeError:
        # Deliberate best effort: a failing tweet yields None instead of aborting the run.
        pass
    return result
    
# Run the Hartmann model once per tweet; a tweet whose call failed gives None
# and therefore gets None in both output columns.
hartmann_raw = deal_clean['tweet'].apply(process_tweet)
deal_clean['emotion_hartmann'] = hartmann_raw.apply(lambda preds: preds[0]['label'] if preds else None)
deal_clean['score_hartmann'] = hartmann_raw.apply(lambda preds: preds[0]['score'] if preds else None)

deal_clean.to_csv('deal_sent.csv', index=False)