## Load Data

In [1]:
import io
import os
import csv
import json
import zipfile
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

import warnings
# Silences every warning category from this point on, so deprecation notices and
# pandas warnings raised by later cells will not be displayed.
warnings.filterwarnings('ignore')

In [2]:
import sys

# Google Drive folder that holds the project. It is appended (not inserted) on
# purpose: the cells below read it back through sys.path[-1] to locate the
# Data/ directory, so it has to remain the last entry.
PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/FML Project'
sys.path.append(PROJECT_DIR)

## Load Tweets

In [3]:
# The tweet table sits in Data/ under the project folder (last sys.path entry).
tweet_csv = f"{sys.path[-1]}/Data/df_tweet.csv"
# Parse target_date right after reading so later cells can do date arithmetic on it.
df_tweet = pd.read_csv(tweet_csv).assign(
    target_date=lambda frame: pd.to_datetime(frame['target_date'])
)
df_tweet.head()

Unnamed: 0,stock,target_date,tweet
0,VZ,2014-10-14,"$ gdp news : "" actives on open AT_USER $ aapl ..."
1,VZ,2015-06-20,rt AT_USER psw / seeking alpha june trade revi...
2,VZ,2015-02-10,weekly s & p100 stocks performance $ fcx $ dis...
3,VZ,2015-05-07,tip 4 how to identify a hot sector ? URL stock...
4,VZ,2014-08-20,"overpriced ? if you have a superior product , ..."


## Load Images and Labels

In [4]:
# The chart index (one row per candlestick image, with its label) sits next to
# the tweet table in Data/ under the project folder (last sys.path entry).
image_csv = f"{sys.path[-1]}/Data/df_image.csv"
# Parse target_date right after reading so later cells can do date arithmetic on it.
df_image = pd.read_csv(image_csv).assign(
    target_date=lambda frame: pd.to_datetime(frame['target_date'])
)
df_image.head()

Unnamed: 0,stock,target_date,image_name,label
0,PCG,2015-03-13,PCG@2015-03-13#0.png,0
1,NGG,2015-09-01,NGG@2015-09-01#0.png,0
2,MDT,2014-09-25,MDT@2014-09-25#0.png,0
3,REX,2015-06-16,REX@2015-06-16#1.png,1
4,WFC,2014-06-24,WFC@2014-06-24#0.png,0


## Merge

In [5]:
# Pair every chart with the tweets of the same stock whose target_date lies in
# the five days before the chart's target_date (target_date-5 .. target_date-1,
# both ends inclusive). Matching tweets are concatenated with '<sep>'; charts
# with no matching tweet get an empty string.
#
# The tweets are bucketed by stock once up front, so each chart only scans the
# rows of its own ticker instead of re-filtering the whole tweet table on every
# iteration. groupby keeps the original row order inside each bucket, so the
# joined text is the same as with a direct boolean filter.
tweets_by_stock = {
    stock: stock_tweets
    for stock, stock_tweets in df_tweet.groupby('stock', sort=False)
}
window_start, window_end = timedelta(days=5), timedelta(days=1)

tweets = []
# Iterate over the two key columns directly; materialising a full row with
# .iloc[i] for every chart is far slower and nothing else from the row is used.
chart_keys = zip(df_image['stock'], df_image['target_date'])
for stock, target_date in tqdm(chart_keys, total=len(df_image)):
    df_sub = tweets_by_stock.get(stock)
    if df_sub is None:
        # No tweet at all for this ticker.
        tweets.append('')
        continue
    in_window = df_sub['target_date'].between(target_date - window_start,
                                              target_date - window_end)
    # Joining an empty selection yields '', the same marker used above.
    tweets.append('<sep>'.join(df_sub.loc[in_window, 'tweet'].values))

100%|██████████| 26358/26358 [01:46<00:00, 248.19it/s]


In [6]:
# Add the collected tweet text as a new column; the list has one entry per
# chart row, in row order.
df_image = df_image.assign(tweet=tweets)
df_image.head()

Unnamed: 0,stock,target_date,image_name,label,tweet
0,PCG,2015-03-13,PCG@2015-03-13#0.png,0,
1,NGG,2015-09-01,NGG@2015-09-01#0.png,0,
2,MDT,2014-09-25,MDT@2014-09-25#0.png,0,
3,REX,2015-06-16,REX@2015-06-16#1.png,1,$ rex - credit suisse securities ( eur ) ltd f...
4,WFC,2014-06-24,WFC@2014-06-24#0.png,0,rt AT_USER $ wfc broke an all-time high ( pic ...


In [7]:
# Keep only charts that have tweet text, order the rows by ticker and date,
# move the label to the last column, and write the result to Data/df_merge.csv.
has_tweets = df_image['tweet'].ne('')
merged_columns = ['stock', 'target_date', 'image_name', 'tweet', 'label']
df_merge = (
    df_image.loc[has_tweets]
    .sort_values(by=['stock', 'target_date'])
    .reindex(columns=merged_columns)
)
df_merge.to_csv(f"{sys.path[-1]}/Data/df_merge.csv", index=False)
df_merge.head()

Unnamed: 0,stock,target_date,image_name,tweet,label
2241,AAPL,2014-01-09,AAPL@2014-01-09#0.png,$ aapl i love my ipad b / c no virus and i am ...,0
3207,AAPL,2014-01-10,AAPL@2014-01-10#0.png,$ aapl please help me understand the math.bill...,0
4533,AAPL,2014-01-14,AAPL@2014-01-14#1.png,$ aapl what's behind the swift rise in apple s...,1
12903,AAPL,2014-01-15,AAPL@2014-01-15#1.png,$ aapl what's behind the swift rise in apple s...,1
5944,AAPL,2014-01-16,AAPL@2014-01-16#0.png,$ aapl expect solid results and guidance from ...,0


## Load Information

In [8]:
# Read the merged chart/tweet table back from disk. The file was written
# without its index, so the rows are renumbered from 0; target_date is not
# re-parsed here.
merged_csv = f"{sys.path[-1]}/Data/df_merge.csv"
df = pd.read_csv(merged_csv)
df.head()

Unnamed: 0,stock,target_date,image_name,tweet,label
0,AAPL,2014-01-09,AAPL@2014-01-09#0.png,$ aapl i love my ipad b / c no virus and i am ...,0
1,AAPL,2014-01-10,AAPL@2014-01-10#0.png,$ aapl please help me understand the math.bill...,0
2,AAPL,2014-01-14,AAPL@2014-01-14#1.png,$ aapl what's behind the swift rise in apple s...,1
3,AAPL,2014-01-15,AAPL@2014-01-15#1.png,$ aapl what's behind the swift rise in apple s...,1
4,AAPL,2014-01-16,AAPL@2014-01-16#0.png,$ aapl expect solid results and guidance from ...,0


In [9]:
# Number of chart rows that have tweet text attached.
df.shape[0]

18543

## Images as Arrays

In [None]:
def load_candlestick(image_path, show_plot=True):
    """Load a candlestick chart, trim its border and scale it to [0, 1].

    The image is converted to RGB and a margin of 2/15 of the image *width*
    is cut from all four sides. The vertical bounds are derived from the width
    as well, so the crop is only centred for square charts.

    Parameters
    ----------
    image_path : str or binary file object
        Anything accepted by ``PIL.Image.open`` (a path, or an open file such
        as a member of a zip archive).
    show_plot : bool, default True
        When true, display the cropped image with matplotlib, without grid
        or axes.

    Returns
    -------
    numpy.ndarray
        Float array of shape (height, width, 3) with values in [0, 1].
    """
    # convert() reads the pixel data and returns an independent image, so the
    # underlying file can be released as soon as the conversion is done.
    with Image.open(image_path) as source:
        img = source.convert('RGB')

    width = img.size[0]
    crop_size = width * 2/15
    # Box is (left, upper, right, lower); the same margin is used on every side.
    img = img.crop((crop_size, crop_size, width-crop_size, width-crop_size))
    # 8-bit channels -> floats in [0, 1].
    img = np.asarray(img)/255
    if show_plot:
        plt.imshow(img)
        plt.grid(False)
        plt.axis('off')
        plt.show()
    return img

In [None]:
# Archive with all candlestick charts, stored in Data/ under the project folder.
image_path = sys.path[-1] + '/Data/candlestick charts.zip'

# Parallel lists: image_names[k] is the file name of the chart decoded into features[k].
image_names, features = [], []

with zipfile.ZipFile(image_path, 'r') as my_zip:
    for file_path in tqdm(my_zip.namelist()):
        # Only PNG entries are charts; entries whose path starts with '__' are
        # skipped (presumably archive metadata folders such as '__MACOSX').
        if not file_path.endswith('png') or file_path.startswith('__'):
            continue
        # Use the last path component as the image name. Indexing the second
        # component would fail for a PNG stored at the archive root and would
        # pick a directory name for entries nested more than one level deep.
        image_names.append(file_path.rsplit('/', 1)[-1])
        with my_zip.open(file_path) as image_file:
            features.append(load_candlestick(image_file, show_plot=False))

100%|██████████| 52717/52717 [00:15<00:00, 3319.37it/s]


## Save Features

In [None]:
# One row per decoded chart: its file name and its pixel array (stored as an
# object column, one ndarray per cell).
df_features = pd.DataFrame({
    'image_name': image_names,
    'feature': features,
})

In [None]:
# Attach the pixel arrays to the chart/tweet rows by file name. This is an
# inner join, so rows without a decoded image would be dropped; the row count
# is displayed to check against len(df).
df_final = df.merge(df_features, on='image_name', how='inner')
df_final.shape[0]

18543

In [None]:
# Save the tabular part only: every column except the last one ('feature',
# which the merge appended and which holds the pixel arrays).
df_final.iloc[:, :-1].to_csv(f"{sys.path[-1]}/Data/df_final.csv", index=False)

In [None]:
# Stack the per-row pixel arrays into a single array whose first axis follows
# the row order of df_final.
per_row_arrays = list(df_final['feature'])
image_features = np.array(per_row_arrays)
image_features.shape

(18543, 100, 100, 3)

In [None]:
# Persist the stacked image array next to df_final.csv; both were produced from
# df_final, so their rows are in the same order.
image_data_path = f"{sys.path[-1]}/Data/image_data.npy"
np.save(image_data_path, image_features)