# Datamining Notebook
This notebook consists of functions that are designed to pull specific technical and sentiment information regarding a foreign exchange currency pair of your choice.

### The Information
All of the information gathered in this notebook is stored locally to a CSV file where the data is then used to create a model to predict binary options using a classification approach.

The notebook is divided into 5 sections listed in the [Table of Contents](#Table-of-Contents:).
- The first section is where the initial dataframe is collected, it includes all of the information provided by the Alpha Vantage API, which is used to collect the currency pair exchange rate from the past day, by the minute.
- The function used is relatively complex, since it corrects issues that I found in the API to ensure that the data I am collecting corresponds to the time that is shown. This is vital for correlating it with the other functions that collect time-sensitive information such as news sources, tweets, and currency strength at that time.
- The other sections are relatively self-explanatory, like web scraping and Twitter sentiment analysis.
- The section labeled "DATAMINING FUNCTION" is where all of the functions come together. This is where all of the information is collected.

In [2]:
# general packages
import time
import datetime
import pandas as pd
import numpy as np
from math import modf
import glob
import sys
from calendar import month_abbr

## documentation for Alpha Vantage : https://www.alphavantage.co/documentation/
from alpha_vantage.foreignexchange import ForeignExchange

# trendline analysis modules
from trendln import calc_support_resistance

# web scraping, twitter handling, and word processing modules
import bs4
import requests
import cloudscraper
from collections import Counter as count
import operator
import tweepy as tw
from textblob import TextBlob
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re

# alphavantage api key
API_KEY = 'your key here'

## Table of Contents:
- [DATAFRAME COLLECTION](#DATAFRAME-COLLECTION)
- [WEB SCRAPING](#WEB-SCRAPING)
- [TWITTER SENTIMENT ANALYSIS](#TWITTER-SENTIMENT-ANALYSIS)
- [DATAMINING FUNCTION](#DATAMINING-FUNCTION)
- [EXECUTE DATAMINING](#Execute-Datamining)

# DATAFRAME COLLECTION

In [2]:
# Indicators will be prefixed with 'ind__' - i.e. 'ind__rsi'

def intraday_data(symbol, data=None, interval='1min', key=API_KEY, first=True, restart=False):
    """Create or extend the per-minute OHLC table for ``symbol`` via Alpha Vantage.

    The function runs in one of three modes:

    * ``first=True``: build the initial table and initialise the module-level
      ``time_dict`` (keys ``'last'``, ``'second'``, ``'next_index'``,
      ``'initial_index'``).
    * ``restart=True`` (with ``first=False``): re-fetch the table once after a
      previous call asked for a restart and merge the rows of ``data`` into it.
    * otherwise: wait until roughly one minute after ``time_dict['last']``,
      fetch again and append one new row to ``data``.

    Parameters
    ----------
    symbol : str
        Six-letter pair; the first three letters are the base currency and the
        rest is the quote currency.
    data : pandas.DataFrame, optional
        Table produced by an earlier call (required unless ``first`` is true).
    interval : str
        Interval passed to ``get_currency_exchange_intraday``.
    key : str
        Alpha Vantage API key.
    first, restart : bool
        Mode selectors, see above.

    Returns
    -------
    tuple
        ``((data, bid, ask), False)`` on success, where ``data`` carries the
        OHLC columns plus ``symbol``, ``Close_Low_avg``, ``Close_High_avg``,
        ``move_up``, ``wick_high`` and ``wick_low``.
        ``(data, True)`` when the caller has to restart; in that case ``data``
        is the input table trimmed to start at ``time_dict['initial_index']``.

    Raises
    ------
    RuntimeError
        When the first or the restart fetch does not line up with the live
        quote time. (The original used a bare ``raise`` with no active
        exception, which also surfaced as ``RuntimeError`` but without a
        useful message.)
    """
    global time_dict
    global new
    fe = ForeignExchange(key=key, output_format='pandas')

    def call_api():
        """Fetch the intraday table and the live quote (rate, bid, ask, refresh time)."""
        print('Calling API for rates.')
        base = symbol[:3]
        quote = symbol[3:]
        # grab current 5 decimal exchange rate
        d, _ = fe.get_currency_exchange_rate(base, quote)
        table, _ = fe.get_currency_exchange_intraday(base, quote, interval=interval, outputsize='full')
        table.columns = ['Open', 'High', 'Low', 'Close']
        table = table.sort_index(ascending=True)
        for field in d.keys():
            label = field.lower()
            if 'exchange rate' in label:
                cur = np.float64(d[field])
            elif 'bid price' in label:
                bid = np.float64(d[field])
            elif 'ask price' in label:
                ask = np.float64(d[field])
            elif 'refreshed' in label:
                update = datetime.datetime.strptime(d[field][0], '%Y-%m-%d %H:%M:%S')
        print(datetime.datetime.now())
        print(table['Close'].index[-1], table['Close'].iloc[-1])
        print(d['6. Last Refreshed'][0], d['5. Exchange Rate'][0])
        return table, cur, bid, ask, update

    if first:
        # gather time date when beginning
        initial = datetime.datetime.now()
        print('Creating initial df:', initial)
        time_dict = {'last': initial, 'second': initial.second}
        # start api search
        data, cur, bid, ask, update = call_api()
        # check if the exch rate time is a minute greater than the last data entry in the table
        if update.minute-1 == data.index[-1].minute and update.second <= 10:
            # set the most recent close value to the 5 decimal value
            data.loc[data.index[-1], 'Close'] = cur
            time_dict['next_index'] = update
            # keep the pandas Timestamp of the last row so later slices use a matching index type
            time_dict['initial_index'] = data.index[-1]
            print('initial index:', data.index[-1])
        elif update.minute == initial.minute:
            print('Intraday data is off one step; setting initial entry manually.')
            data = data[:-1]
            # set the most recent close value to the 5 decimal value
            data.loc[data.index[-1], 'Close'] = cur
            time_dict['next_index'] = update
            # keep the pandas Timestamp of the last row so later slices use a matching index type
            time_dict['initial_index'] = data.index[-1]
            print('initial index:', data.index[-1])
        else:
            print('First time entry not valid, restart program.')
            print(update.strftime('%Y-%m-%d %H:%M'), 'current ER -- vs -- last row', data.index[-1])
            raise RuntimeError('First time entry not valid, restart program.')
    elif restart:  # the previous call had to break due to lag in the API
        # A single attempt: the original looped forever here because the
        # success path never left the ``while True`` loop.
        new, cur, bid, ask, update = call_api()
        # check if the exch rate time is a minute greater than the last data entry in the table
        if update.minute-1 == new.index[-1].minute and update.second <= 10:
            time_dict['last'] = time_dict['last'] + datetime.timedelta(0, 60)
            # copy all previously saved rows into the fresh table -- it only reaches as far as we left off
            new.loc[data.index] = data
            data = new.copy()
            # set the most recent close value to the 5 decimal value
            data.loc[data.index[-1], 'Close'] = cur
            time_dict['next_index'] = update
        else:
            print('Need to restart program in full.')
            raise RuntimeError('Need to restart program in full.')

    else:  # the common case: append the next minute
        print('Next data entry being created. Currently in while loop waiting 1 minute.')
        while True:
            now = datetime.datetime.now()
            # fire once a full minute has passed, we are at (or past) the original
            # second-of-minute, and we are not yet 70 s behind
            time_diff = now - time_dict['last']
            if modf(time_diff.seconds/60)[1] == 1 and now.second >= time_dict['second'] and \
                    time_diff.seconds < 70:
                time_dict['last'] = time_dict['last'] + datetime.timedelta(0, 60)
                try:
                    new, cur, bid, ask, update = call_api()
                except Exception:
                    # Exception (not a bare except) so Ctrl-C is not retried
                    try:
                        print('Calling API one more time.')
                        time.sleep(1)
                        new, cur, bid, ask, update = call_api()
                    except Exception:
                        print('No API call to Alphavantage. Stopping program.')
                        raise
                restart = False
                break
            elif time_diff.seconds >= 70:
                # too late for this minute: hand the trimmed table back and ask
                # the caller to restart
                print('Restarting because of large time difference between entries:', time_diff, 's')
                initial_id = time_dict['initial_index']
                data = data.loc[initial_id:]
                restart = True
                return data, restart

            time.sleep(0.5)  # last statement in while loop

        last_new_minute = new.index[-1].minute
        expected_minute = time_dict['next_index'].minute
        # the "+59" alternatives cover the roll-over from minute 59 to minute 0
        if (update.minute-1 == last_new_minute or update.minute+59 == last_new_minute) and update.second <= 10:
            new_row = new[-1:].copy()
            time_dict['next_index'] = update
        # quote time is one minute after the previously expected one, and no more than 10 s later
        elif (update.minute-1 == expected_minute or update.minute+59 == expected_minute) \
                and update.second-time_dict['next_index'].second <= 10:
            # trim the last row because it carries a false index
            new = new[:-1]
            new_row = new[-1:].copy()
            oldtime = data.index[-1]
            # re-label the row as one minute after the last stored row
            new_index = oldtime + datetime.timedelta(0, 60)
            new_row['date'] = new_index
            new_row.set_index('date', inplace=True)
            time_dict['next_index'] = update
        else:
            print('Restarting because of lag in API.')
            initial_id = time_dict['initial_index']
            data = data.loc[initial_id:]
            restart = True
            return data, restart
        print('next time index:', update.strftime('%Y-%m-%d %H:%M'))
        # explicit .loc writes instead of chained ``new_row.Open[0] = ...``,
        # which is silently lost under pandas copy-on-write
        row_label = new_row.index[0]
        new_row.loc[row_label, 'Open'] = data['Close'].iloc[-1]
        new_row.loc[row_label, 'Close'] = cur
        ind_initial = time_dict['initial_index']
        data = data.loc[ind_initial:][['Open', 'High', 'Low', 'Close']]
        new.loc[data.index] = data
        data = pd.concat([new[:-1], new_row])

    # copy so the derived columns are written to an independent frame
    data = data[['Open', 'High', 'Low', 'Close']].copy()
    # add the symbol to the dataframe
    data['symbol'] = symbol
    # these columns will be used to pull support and resistance trendlines from
    data['Close_Low_avg'] = (data.Close + data.Low) / 2
    data['Close_High_avg'] = (data.Close + data.High) / 2
    # candle-shape features for the model
    data['move_up'] = (data['Close'] - data['Open']).apply(lambda x: 1 if x > 0 else 0)
    data['wick_high'] = data.apply(lambda x: x.High - max(x.Close, x.Open), axis=1)
    data['wick_low'] = data.apply(lambda x: min(x.Close, x.Open) - x.Low, axis=1)
    restart = False
    return (data, bid, ask), restart


def stochastic(data, period=5, smoothK=2, smoothD=1):
    """Return the smoothed stochastic oscillator lines computed from ``data.Close``.

    The raw oscillator is ``100 * (close - rolling_min) / (rolling_max - rolling_min)``
    where both extremes are taken over the closing price (not High/Low), as
    in the original implementation.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``Close`` column.
    period : int
        Rolling window for the min/max of the close.
    smoothK : int
        Rolling-mean window applied to the raw oscillator to get %K.
    smoothD : int
        Rolling-mean window applied to %K to get %D.

    Returns
    -------
    (pandas.Series, pandas.Series)
        ``ind__stochK`` and ``ind__stochD``, both rounded to 3 decimals. Rows
        without a full window are NaN, as are rows where max equals min.
    """
    close = data.Close

    # compute each rolling extreme once instead of re-evaluating the window
    window = close.rolling(period)
    lowest = window.min()
    highest = window.max()

    stoch = 100 * (close - lowest) / (highest - lowest)
    stoch_K = stoch.rolling(smoothK).mean().round(3)
    stoch_D = stoch_K.rolling(smoothD).mean().round(3)
    stoch_K.name = 'ind__stochK'
    stoch_D.name = 'ind__stochD'

    return stoch_K, stoch_D
    
def RSI(data, period=5):
    """Return the relative strength index of ``data.Close`` as ``ind__rsi``.

    Gains and losses are split from the one-step differences of the close.
    Each side is seeded with the plain mean of its first ``period`` values and
    then smoothed with an exponential mean (``com=period-1``, ``adjust=False``).
    The result starts ``period`` rows after the first close and is rounded to
    3 decimals.
    """
    change = data.Close.diff().dropna()

    gains = change * 0
    losses = gains.copy()
    rising = change > 0
    falling = change < 0
    gains[rising] = change[rising]
    losses[falling] = -change[falling]

    def seed_and_trim(moves):
        # the value at position period-1 becomes the average of the first
        # `period` moves; everything before it is discarded
        moves[moves.index[period-1]] = np.mean(moves[:period])
        return moves.drop(moves.index[:(period-1)])

    def smooth(moves):
        return moves.ewm(com=period-1, min_periods=0, adjust=False, ignore_na=False).mean()

    avg_gain = smooth(seed_and_trim(gains))
    avg_loss = smooth(seed_and_trim(losses))
    rs = avg_gain / avg_loss

    rsi = round(100 - 100 / (1 + rs), 3)
    rsi.name = 'ind__rsi'
    return rsi


def bollinger_bands(data, period=20, std=2, source='Close', dif_period=1):
    """Compute Bollinger bands, their width and a volatility flag.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``source`` column.
    period : int
        Rolling window for the mean and the standard deviation.
    std : float
        Number of standard deviations between the middle band and each outer band.
    source : str
        Column the bands are computed from.
    dif_period : int
        Rolling-mean window applied to the band width.

    Returns
    -------
    pandas.DataFrame
        Columns ``ind__mid_bb``, ``ind__up_bb``, ``ind__low_bb`` (rounded to
        6 decimals), ``ind__bb_dif`` (smoothed upper-minus-lower width) and
        ``ind__volatile`` (1 where the width is at or above its own 0.75
        quantile, else 0; rows with a NaN width get 0).
    """
    series = data[source]
    rolling = series.rolling(period)

    middle = rolling.mean().round(6)
    band_offset = rolling.std() * std  # computed once, used for both bands
    upper = (middle + band_offset).round(6)
    lower = (middle - band_offset).round(6)

    boll = pd.concat([middle, upper, lower], axis=1)
    boll.columns = ['ind__mid_bb', 'ind__up_bb', 'ind__low_bb']
    width = (boll['ind__up_bb'] - boll['ind__low_bb']).rolling(dif_period).mean()
    boll['ind__bb_dif'] = width

    # The threshold is the same for every row, so evaluate the quantile once
    # instead of once per row (which made the original quadratic).
    threshold = width.quantile(0.75)
    boll['ind__volatile'] = (width >= threshold).astype(int)
    return boll


def bb_analysis(data, source='Close', window=3, sensitivity=0.85, surf_count=2):
    """Add Bollinger-band position indicators to ``data`` in place and return it.

    Columns written:
      * ``ind__outside_up``  - 1 where ``source`` is above the upper band, else 0
      * ``ind__outside_low`` - 1 where ``source`` is below the lower band, else 0
      * ``ind__surf_pct``    - rolling mean (over ``window`` rows) of the price's
        relative position between the lower band (0) and the upper band (1)

    ``sensitivity`` and ``surf_count`` are accepted but not used. The upper
    band series is also published as the module-level name ``up``.
    """
    global up
    global cur

    def negative_flag(value):
        return 1 if value < 0 else 0

    # band series; the middle band is looked up but not used further
    up = data['ind__up_bb']
    lower_band = data['ind__low_bb']
    middle_band = data['ind__mid_bb']
    price = data[source]

    # flags for the price sitting outside either band
    above_gap = up - price
    below_gap = price - lower_band
    data['ind__outside_up'] = above_gap.apply(negative_flag)
    data['ind__outside_low'] = below_gap.apply(negative_flag)

    # where the price sits inside the band, averaged over the window
    band_position = below_gap / (up - lower_band)
    data['ind__surf_pct'] = band_position.rolling(window).mean()

    return data


def get_bestfit(data, period=3):
    """Fit a straight line through the last ``period`` values of ``data``.

    The x coordinates are simply 0 .. period-1. Intended use per the original
    notes: apply it at the entry point only, with a short window on the middle
    Bollinger band (slope) and on the close (residuals as a quick trend gauge).

    Returns
    -------
    (vals, resid)
        ``vals`` holds ``[slope, intercept]``; ``resid`` is the residual array
        reported by ``np.polyfit(..., full=True)``.
    """
    tail = data[-period:]
    x_coords = range(period)
    y_coords = list(tail)
    fit = np.polyfit(x_coords, y_coords, deg=1, full=True)
    vals, resid = fit[0], fit[1]
    return vals, resid


def ind_acceleration(data_indicator, look_back=1):
    """Return the change of an indicator versus its value ``look_back`` rows earlier."""
    change = data_indicator.diff(periods=look_back)
    return change
    

def get_rolling(data_indicator, window=2):
    """Return the rolling mean of an indicator over ``window`` rows."""
    windowed = data_indicator.rolling(window=window)
    return windowed.mean()


def mins_maxs(series, limit=15):
    """Collect candidate local minima and maxima of ``series`` for trendline fitting.

    ``series`` is cut into ``limit`` consecutive positional slices. For each
    slice the lowest and highest values are recorded as
    ``(index_label, value, position)`` tuples, skipping any point whose label
    belongs to the last 20 entries of ``series``. A second pass thins out
    candidates based on the distance between their positions.

    Returns ``(mns, mxs)``: the filtered minima and maxima tuple lists.

    NOTE(review): the tracking variables (nmin/minval/minidx and the max
    equivalents) are not reset between slices, so a slice without a qualifying
    point re-appends the tuple found in an earlier slice. Confirm this is
    intended.
    NOTE(review): if exactly one candidate is collected, the ``i==0`` branch
    evaluates ``mins[i+1]`` / ``maxs[i+1]`` and raises IndexError.
    """
    mins,maxs = [],[]
    lim = limit
    # walk over the `lim` slices of the series
    for i in np.arange(0,lim,1):
        sec = series.iloc[int((i/lim)*len(series)):int(((i+1)/lim)*len(series))]
        # start from the opposite extreme of the whole series so that any
        # strictly smaller (resp. larger) value in the slice replaces it
        lastmin = max(series)
        lastmax = min(series)
        for idx,n in enumerate(sec.index):
            if sec.loc[n] < lastmin and n not in series.iloc[-20:].index:
                    # position is estimated as offset-in-slice + slice length * slice number
                    minval, nmin, minidx = sec.loc[n], n, idx+len(sec)*i
                    lastmin = minval
            if sec.loc[n] > lastmax and n not in series.iloc[-20:].index:
                maxval, nmax, maxidx = sec.loc[n], n, idx+len(sec)*i
                lastmax = maxval
        # the names are unbound until the first qualifying point is found;
        # the bare except silently skips the append in that case
        try: mins.append((nmin,minval,minidx))
        except: pass
        try: maxs.append((nmax,maxval,maxidx))
        except: pass
    # next we filter out candidates that sit within 10 positions of a neighbour
    # NOTE(review): `m` is the candidate's position (tuple item 2), so the
    # `m==min([...])` / `m==max([...])` tests compare positions, not prices --
    # verify that this is the intended tie-break.
    mns=[]
    for i,mn in enumerate(mins):
        m=mn[2]
        if i==0 or i==len(mins)-1:
            if i==0 and mins[i+1][2]-m>10: mns.append(mn)
            elif i==0 and m==min([m,mins[i+1][2]]): mns.append(mn)
            if i==len(mins)-1 and m-mins[i-1][2]>10: mns.append(mn)
            elif i==len(mins)-1 and m==min([m,mins[i-1][2]]): mns.append(mn)
        elif mins[i+1][2]-m>10 and m-mins[i-1][2]>10: mns.append(mn)
        elif mins[i+1][2]-m>10 and m-mins[i-1][2]<=10 and m==min([m,mins[i-1][2]]): mns.append(mn)
        elif mins[i+1][2]-m<=10 and m-mins[i-1][2]>10 and m==min([m,mins[i+1][2]]): mns.append(mn)
    # same thinning for the maxima
    mxs=[]
    for i,mx in enumerate(maxs):
        m=mx[2]
        if i==0 or i==len(maxs)-1:
            if i==0 and maxs[i+1][2]-m>10: mxs.append(mx)
            elif i==0 and m==max([m,maxs[i+1][2]]): mxs.append(mx)
            if i==len(maxs)-1 and m-maxs[i-1][2]>10: mxs.append(mx)
            elif i==len(maxs)-1 and m==max([m,maxs[i-1][2]]): mxs.append(mx)
        elif maxs[i+1][2]-m>10 and m-maxs[i-1][2]>10: mxs.append(mx)
        elif maxs[i+1][2]-m>10 and m-maxs[i-1][2]<=10 and m==max([m,maxs[i-1][2]]): mxs.append(mx)
        elif maxs[i+1][2]-m<=10 and m-maxs[i-1][2]>10 and m==max([m,maxs[i+1][2]]): mxs.append(mx)
    return mns, mxs


def get_riemannsum(series, m, b):
    """Return the mean signed gap between the line ``y = m*x + b`` and ``series``.

    ``x`` is the position of each value in ``series``. A positive result means
    the line lies above the data on average, a negative one below.
    """
    total = 0
    for position, value in enumerate(series):
        line_value = position*m+b
        total += line_value-value
    return total / len(series)


# add in a filter to grab the values in results where the 'xs' must contain the current exch index (len(series))
def get_trendlines(series, mins_maxs, mm_str='max'):
    """Fit candidate trendlines through triples of extrema and rank them.

    Parameters
    ----------
    series : pandas.Series
        Price series the extrema were taken from.
    mins_maxs : list of tuple
        ``(index_label, value, position)`` tuples as produced by ``mins_maxs``.
    mm_str : str
        ``'max'`` for resistance lines (kept when they lie above the data on
        average) or ``'min'`` for support lines (kept when they lie below).

    Returns
    -------
    (results, cur_trend)
        ``results`` is a list of up to 10 tuples
        ``(coeffs, residuals, xs, ys, xinds, dif, riemann, raw)`` sorted by
        ``dif``, where ``coeffs`` is ``[slope, intercept]``, ``raw`` is the
        current price minus the line's value at ``x = len(series)``, ``dif``
        is ``abs(raw)`` and ``riemann`` is the value of ``get_riemannsum``.
        ``cur_trend`` is the last value of the 30-row rolling mean of the
        one-step differences.
        For any other ``mm_str`` a message is printed and ``None`` is returned.

    With fewer than three extrema the triple indexing raises IndexError;
    ``get_trendline_data`` wraps its calls in try/except.
    """
    results=[]
    cur = series[-1]  # presumably the latest price (positional access) -- TODO confirm for integer indexes
    cur_trend = series.diff(1).rolling(30).mean()[-1]
    # enumerate triples of extrema with strictly increasing list positions:
    # i < ii+i+1 < iii+ii+i+2
    for i,n in enumerate(mins_maxs):
        for ii in range(len(mins_maxs)):
            for iii in range(len(mins_maxs)):
                xinds = [n[0], mins_maxs[ii+i+1][0], mins_maxs[iii+ii+i+2][0]]
                xs = [n[2], mins_maxs[ii+i+1][2], mins_maxs[iii+ii+i+2][2]]
                ys = [n[1], mins_maxs[ii+i+1][1], mins_maxs[iii+ii+i+2][1]]
                vals = np.polyfit(xs,ys,deg=1,full=True)
                # extrapolate the line to the position just past the series
                pred = len(series)*vals[0][0]+vals[0][1]
                raw = cur-pred
                dif = abs(raw)
                riemann = get_riemannsum(series, vals[0][0], vals[0][1])
                results.append((vals[0],vals[1],xs,ys,xinds,dif,riemann,raw))
                # third point reached the last extremum
                if iii+ii+i+2 == len(mins_maxs)-1:
                    break
            # second point reached the second-to-last extremum
            if ii+i+1 == len(mins_maxs)-2:
                break
        if i == int(len(mins_maxs)*(2/3)) or i==len(mins_maxs)-3: # set a stop at 2/3 of min_maxs so that we capture long trends
            break
    # keep the results whose residual is below the median residual
    rem = pd.Series([res[1] for res in results]).quantile(.50)
    if mm_str=='max':
        results = [res for res in results if (len(res[1])!=0 and res[1]<rem and res[6]>0)]
        # keep only trendlines whose riemann sum is at or above the 0.65 quantile
        quant = pd.Series([r[6] for r in results]).quantile(0.65)
        results = [res for res in results if res[6]>=quant]
    elif mm_str=='min':
        results = [res for res in results if (len(res[1])!=0 and res[1]<rem and res[6]<0)]
        # keep only trendlines whose riemann sum is at or below the 0.35 quantile
        quant = pd.Series([r[6] for r in results]).quantile(0.35)
        results = [res for res in results if res[6]<=quant]
    else: return print('Need mm_str input to be "min" or "max".')
    # now perform a sort and filter method to dwindle down the best trendline
    # the process is sort by residuals, grab 30, sort by price difference, grab 20, sort by residuals again, grab 10
    # finally sort by price difference
    results.sort(key=lambda val:val[1])
    results = results[:30]
    results.sort(key=lambda val:val[5])
    results = results[:20]
    results.sort(key=lambda val:val[1])
    results = results[:10]
    results.sort(key=lambda val:val[5])
    return results, cur_trend


def get_trendline_data(data, source='Close'):
    """Find the best support and resistance trendline on three look-back windows.

    The windows are the full frame, its last two thirds and its last third.
    The number of slices handed to ``mins_maxs`` shrinks with the window
    (18, 12, 6).

    Parameters
    ----------
    data : pandas.DataFrame
        Price frame.
    source : str
        Column to analyse (``'Close'``, or e.g. the Close/High or Close/Low
        average columns).

    Returns
    -------
    list of tuple
        One ``(best_support, best_resistance, cur_trend)`` tuple per window.
        A missing trendline is represented by an empty list; ``cur_trend``
        is 0 when the resistance search failed.
    """
    windows = [
        data[source],
        data.iloc[int((1/3)*len(data)):][source],
        data.iloc[int((2/3)*len(data)):][source],
    ]
    results = []
    for i, ser in enumerate(windows):
        mins, maxs = mins_maxs(ser, limit=(18*(3-i)/3))  # decreasing by 1/3
        # ``except Exception`` keeps the best-effort behaviour but no longer
        # swallows KeyboardInterrupt / SystemExit like the bare except did
        try:
            res_max, cur_trend = get_trendlines(ser, maxs, 'max')
        except Exception:
            print('No resistance trendlines currently.')
            res_max, cur_trend = [], 0
        # no need to store current trend twice
        try:
            res_min, _ = get_trendlines(ser, mins, 'min')
        except Exception:
            print('No support trendlines currently.')
            res_min = []
        best_min = res_min[0] if len(res_min) else []
        best_max = res_max[0] if len(res_max) else []
        results.append((best_min, best_max, cur_trend))
    return results


def create_df(SYMBOL, data=None, first=True):
    """Fetch the price table for ``SYMBOL`` and attach the technical indicators.

    Calls ``intraday_data``; if that asks for a restart, waits until the first
    five seconds of a minute and calls it again in restart mode. Stochastic,
    RSI and Bollinger-band columns plus their derived features are then
    concatenated onto the table.

    Returns ``(df, bid, ask)`` with rows containing NaN dropped from ``df``.
    Any exception raised while fetching is reported on stdout and re-raised.
    Progress messages use the module-level start time ``st``.
    """
    global time_dict

    def elapsed():
        return round(time.time()-st,2)

    print('STEP1: Creating df object ...                                                  Time elapsed:',elapsed(),'s')
    try:
        fetched, needs_restart = intraday_data(symbol=SYMBOL, data=data, first=first)
        if needs_restart:
            print('Had to restart, waiting til the top of next minute.')
            # poll once a second until we are within the first 5 s of a minute
            while datetime.datetime.now().second > 5:
                time.sleep(1)
            fetched, needs_restart = intraday_data(symbol=SYMBOL,data=fetched, first=False, restart=needs_restart)
        df, bid, ask = fetched

    except BaseException as err:
        print('Error getting df...')
        print(f'ValueError due to:\n{type(err)}\n{err}')
        raise

    stoK, stoD = stochastic(df)
    rsi = RSI(df)
    bb = bollinger_bands(df)
    print(df.shape, stoK.shape, rsi.shape, bb.shape)
    print('STEP2: Adding stochastic, rsi and bollinger band indicators ...                Time elapsed:',elapsed(),'s')
    df = pd.concat([df,stoK,rsi,bb], axis=1)
    # momentum (one-step change) and 2-row smoothing of both oscillators
    for column, prefix in (('ind__rsi', 'ind__rsi'), ('ind__stochK', 'ind__stoK')):
        df[f'{prefix}_accel'] = ind_acceleration(df[column])
        df[f'{prefix}_rolling'] = get_rolling(df[column])
    print('STEP3: Perfomring bollinger band analysis and adding SMA slope indicator ...   Time elapsed:',elapsed(),'s')
    df = bb_analysis(df.copy())

    return df.dropna(), bid, ask
    

def get_current_data(SYMBOL, data=None, first=True):
    """Build the single-row feature record for the most recent minute.

    Calls ``create_df`` and takes the last row of the resulting frame, then
    adds spread/bid/ask features, trendline proximity flags for three
    look-back windows, currency-strength features scraped by
    ``get_currency_strength`` and two line-fit features.

    Reads and updates the module-level ``past_str``, ``BS_count``,
    ``QS_count`` and ``recent_chg`` (they must already exist when this is
    called) and reads the module-level start time ``st`` for progress output.

    Returns
    -------
    (current_data, df)
        ``current_data`` is the one-row frame with all features;
        ``df`` is the full indicator frame returned by ``create_df``.
    """
    global past_str, BS_count, QS_count, recent_chg
    
    df, bid, ask = create_df(SYMBOL=SYMBOL, data=data, first=first)
    print('STEP4: Creating current_data object for training data entry ...                Time elapsed:',round(time.time()-st,2),'s')   
    
    print('df info:',df[-1:].Close, df[-1:].index, df.shape)
    # slice from the last index label onward, i.e. the most recent row
    current_data = df.loc[df[-1:].index[0]:].copy()
    # add spread, bid-cur, ask-cur
    current_data['spread'] = ask - bid
    current_data['bid_diff'] = current_data['Close'][0] - bid
    current_data['ask_diff'] = ask - current_data['Close'][0]
    
    print('STEP5: Performing trendline analysis and adding trendline indicators ...       Time elapsed:',round(time.time()-st,2),'s')
    # handling trendline data
    # determined by whether or not the price is going towards the trendline, and it is closer to the approaching trendline than
    # the leaving trendline. if it has surpassed the closest trendline, is it heading back towards it, if so this could indicate
    # leaving a trendline and bouncing back
    def trendline_analysis(trends, line='sup',frame='full'):
        """Return 1 if the price is heading towards the requested trendline, else 0.

        ``trends`` is the list returned by ``get_trendline_data``; ``frame``
        selects the window ('full', '2_3' or '1_3') and ``line`` the side
        ('sup' or 'res'). In each trendline tuple item 5 is the absolute
        distance of the current price to the line and item 7 the signed one
        (price minus line). Returns None if ``frame`` matches no window.
        """
        for frame_type in [('full',0),('2_3',1),('1_3',2)]:
            if frame==frame_type[0]:
                tl_frame = trends[frame_type[1]]
                tl_sup, tl_res, cur_dir = tl_frame
                if len(tl_sup)>0 and len(tl_res)>0:
                    if line=='sup' and tl_sup[5]<tl_res[5]: # first we check if the cur price is closer to support or resistance
                        # price above the line and falling, or below the line and rising:
                        # in both cases the price is moving towards the trendline
                        if tl_sup[7]>0 and cur_dir<0: return 1
                        elif tl_sup[7]<0 and cur_dir>0: return 1
                        else: return 0
                    elif line=='res' and tl_res[5]<tl_sup[5]:
                        if tl_res[7]<0 and cur_dir>0: return 1
                        elif tl_res[7]>0 and cur_dir<0: return 1
                        else: return 0
                    else: return 0
                # the requested line does not exist for this window
                elif len(tl_sup)==0 and line=='sup': return 0
                elif len(tl_res)==0 and line=='res': return 0
                # only the requested line exists: skip the closer-line comparison
                elif len(tl_sup)>0 or len(tl_res)>0:
                    if line=='sup' and ((tl_sup[7]>0 and cur_dir<0) or (tl_sup[7]<0 and cur_dir>0)): return 1
                    elif line=='res' and ((tl_res[7]<0 and cur_dir>0) or (tl_res[7]>0 and cur_dir<0)): return 1
                    else: return 0
                else: return 0
                
    trend_frame = get_trendline_data(df.copy(), source='Close')
    current_data['ind__full_near_res'] = trendline_analysis(trend_frame, 'res', 'full')
    current_data['ind__full_near_sup'] = trendline_analysis(trend_frame, 'sup', 'full')
    current_data['ind__2_3_near_res'] = trendline_analysis(trend_frame, 'res', '2_3')
    current_data['ind__2_3_near_sup'] = trendline_analysis(trend_frame, 'sup', '2_3')
    current_data['ind__1_3_near_res'] = trendline_analysis(trend_frame, 'res', '1_3')
    current_data['ind__1_3_near_sup'] = trendline_analysis(trend_frame, 'sup', '1_3')
    
    print('STEP6: Scraping livecharts.co website for currency strength indicators ...     Time elapsed:',round(time.time()-st,2),'s')
    # create the currency strength indicators
    try:
        base_str, quote_str, str_rat, cur_dict = get_currency_strength(symbol=SYMBOL)
    except:
        # best effort: any failure (network, parsing, division by zero) falls back to zeros
        # NOTE(review): the bare except also swallows KeyboardInterrupt
        e = sys.exc_info()
        print(f'livecharts.co.uk/currency-strength.php unavailable for the moment:\ndue to:\n{e[0]}\n{e[1]}')
        base_str, quote_str, str_rat = (0,0,0)
    
    # when training the machine, we will perform feature engineering on the categorical data
    current_data['ind__base_strength'] = base_str
    current_data['ind__quote_strength'] = quote_str
    current_data['ind__strength_ratio'] = str_rat
    current_data['ind__BS_diff'] = base_str - past_str['base']
    current_data['ind__QS_diff'] = quote_str - past_str['quote']
    
    # tracking recent big changes to currencies: a jump of 2 or more sets the flag,
    # which is cleared again after 5 consecutive calls without such a jump
    if abs(base_str - past_str['base']) >= 2:
        recent_chg['base'] = 1
        BS_count = 0
    else:
        BS_count += 1
        if BS_count >= 5:
            recent_chg['base'] = 0
    if abs(quote_str - past_str['quote']) >= 2:
        recent_chg['quote'] = 1
        QS_count = 0
    else:
        QS_count += 1
        if QS_count >= 5:
            recent_chg['quote'] = 0
    current_data['ind__BS_pastchg'] = recent_chg['base']
    current_data['ind__QS_pastchg'] = recent_chg['quote']
    # remember the strengths for the next call
    past_str['base'] = base_str
    past_str['quote'] = quote_str
    
    # slope of the middle Bollinger band over the last 3 rows
    vals, _ = get_bestfit(df.ind__mid_bb, period=3)
    current_data['ind__midbb_slope'] = vals[0]
    # the intention is to grab the sum of squared residuals and use it to determine a quick trend
    # pair it with the past 5 up_tick value to determine if it is going in one direction
    _, resid = get_bestfit(df.Close, period=5)
    current_data['ind__trend_residuals'] = resid[0]
    
    return current_data, df

# WEB SCRAPING
Scraping the Forex Factory website for news updates and how they affect the market around the time of the events.

Scraping livecharts website for currency strengths.

[To top^^^](#Table-of-Contents:)

In [3]:
# livecharts scraping function
def get_currency_strength(symbol, timeout=10):
    """Scrape the livecharts currency-strength meter for both sides of ``symbol``.

    Background: if one currency is very strong and another suddenly turns
    weaker, the deviation usually indicates momentum and a possible trading
    opportunity. Conversely, if both currencies are weak, strong or average,
    the pair is often ranging or moving sideways.

    Parameters
    ----------
    symbol : str
        Six-letter pair; the first three letters are the base currency.
    timeout : float
        Seconds to wait for the website before giving up. Without a timeout
        ``requests.get`` can block indefinitely and stall the per-minute loop.

    Returns
    -------
    (base_strength, quote_strength, strength_ratio, cur_dict)
        The strengths are ``6 - (number of 'image:none' occurrences)`` in the
        currency's container, ``strength_ratio`` is quote / base, and
        ``cur_dict`` maps every scraped currency label to its strength.

    Raises
    ------
    requests.RequestException
        On network errors, timeouts or a non-2xx response.
    KeyError
        If either currency is missing from the page.
    ZeroDivisionError
        If the base strength is 0.
    """
    base = symbol[:3]
    quote = symbol[3:]

    # EUR tick is EURO on website
    if base == 'EUR':
        base = 'EURO'
    if quote == 'EUR':
        quote = 'EURO'

    res = requests.get('http://www.livecharts.co.uk/currency-strength.php', timeout=timeout)
    # fail loudly on an error page instead of with a confusing KeyError later
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    cur_dict = {}
    for item in soup.select('#rate-outercontainer'):
        strength = 6 - str(item).count('image:none')
        currency = item.find(id='map-innercontainer-symbol').contents[0]
        cur_dict[currency] = strength

    base_strength = cur_dict[base]
    quote_strength = cur_dict[quote]
    strength_ratio = quote_strength/base_strength

    return base_strength, quote_strength, strength_ratio, cur_dict
# Done with livecharts scraping
##
### -----------------------------------------------------------------
##
# Forex Factory scraping beyond this point
# we are looking at all calendar events for the week
def find_eventid(soup):
    """Return the event ids of the week's calendar rows, without consecutive repeats.

    The ``data-eventid`` attribute can show up on several adjacent rows that
    belong to the same event, so an id is only recorded when it differs from
    the one seen immediately before it. Rows without the attribute are ignored.
    """
    rows = soup.find(class_="calendar__table").find_all(class_='calendar__row')
    collected = []
    previous = 0
    for row in rows:
        if 'data-eventid' not in row.attrs.keys():
            continue
        current = row.attrs['data-eventid']
        if current != previous:
            collected.append(current)
            previous = current
    return collected


def event_details(event_id):
    """Download the detail panel of one Forex Factory calendar event.

    Parameters
    ----------
    event_id : str
        Value of the row's ``data-eventid`` attribute.

    Returns
    -------
    (stories, spec_dict)
        ``stories`` is the list of related story titles; ``spec_dict`` maps
        each spec label (e.g. 'Usual Effect') to its description. Labels and
        descriptions are paired in document order; if the page yields a
        different number of each, the surplus entries are dropped.
    """
    scraper = cloudscraper.create_scraper(interpreter='nodejs')
    # the base url is necessary since the web page is dynamic - it was found through an element analysis of the website
    base_url = 'https://www.forexfactory.com/flex.php?do=ajax&contentType=Content&flex=calendar_mainCalCopy1&details='
    res = scraper.get(base_url + event_id)
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    stories = [
        item.select('a')[0].attrs['title']
        for item in soup.find_all(class_="flexposts__story-title")
    ]

    # one pass over the table rows collects both labels and descriptions
    specs, spec_content = [], []
    for row in soup.find_all('tr'):
        for label in row.find_all(class_='label calendarspecs__spec'):
            specs.append(label.contents[0].replace('\n', '').replace('\t', ''))
        for description in row.find_all(class_='full calendarspecs__specdescription'):
            spec_content.append(description.contents[0])

    # zip instead of indexing so a short description list cannot raise IndexError
    spec_dict = dict(zip(specs, spec_content))
    return stories, spec_dict


def event_impact(soup):
    """Return the impact class of every calendar event that shows an impact icon.

    For each impact cell containing at least one ``span``, the first CSS class
    of the first span is collected; cells without a span are skipped.
    """
    table = soup.find(class_="calendar__table")
    levels = []
    for cell in table.find_all(class_='calendar__impact'):
        spans = cell.select('span')
        if len(spans) == 0:
            continue
        css_classes = spans[0].attrs['class']
        levels.append(css_classes[0])
    return levels


def currency_impacted(soup):
    """Return the currency code of every calendar row, with newlines stripped out."""
    table = soup.find(class_="calendar__table")
    cells = table.find_all(class_="calendar__cell calendar__currency currency")
    return [cell.contents[0].replace('\n', '') for cell in cells]


def find_date(soup, year='2020'):
    """Return the calendar page's displayed date and clock time as a datetime.

    The month/day text comes from the nested ``date`` element, the clock time
    from the 'Time Options' element; ``year`` is supplied by the caller because
    the page text does not include it.
    """
    date_node = soup.find(class_='date').find(class_='date')
    month_day = date_node.contents[1].contents[0]
    clock_node = soup.find(class_="calendar__table").find(title='Time Options')
    clock = clock_node.contents[0]
    stamp = month_day+' '+year+' '+clock
    return datetime.datetime.strptime(stamp, '%b %d %Y %I:%M%p')


def find_event_time(soup, date):
    """Return one time entry per calendar event.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed calendar page.
    date : datetime.datetime
        Only ``date.year`` is used, to complete the month/day shown on the page.

    Returns
    -------
    list
        For each time cell: a ``datetime.datetime`` when the cell holds a
        clock time, the strings ``'All Day'`` / ``'Tentative'`` when it says
        so, or ``'N/A'`` otherwise. An empty cell inherits the entry of the
        previous non-empty cell.
    """
    table = soup.find(class_="calendar__table")
    date_cells = table.find_all(class_='calendar__cell calendar__date date')
    # start with today's month/day in the same "Mon D" form the page uses
    now = datetime.datetime.now()
    month_day = ' '.join([month_abbr[now.month], str(now.day)])
    # Fallback for an empty cell that appears before any filled one; the
    # original left `last` unbound and crashed with UnboundLocalError there.
    last = (month_day, 'N/A')

    # first pass: pair every time cell with the month/day it belongs to
    times = []
    for i, cell in enumerate(table.find_all(class_='calendar__cell calendar__time time')):
        if len(cell.contents) == 0:
            times.append(last)
            continue
        if len(date_cells[i]) > 1:
            month_day = date_cells[i].contents[1].contents[1].contents[0]
        item = cell.contents
        kind = type(item[0])
        if kind == bs4.element.NavigableString:
            last = (month_day, item[0])
            times.append(last)
        elif kind == bs4.element.Tag:
            # the text sits after a leading tag inside the second child
            last = (month_day, item[1].contents[1])
            times.append(last)

    # second pass: turn clock times into datetime objects
    results = []
    for day_text, time_text in times:
        if time_text == 'All Day' or time_text == 'Tentative':
            result = time_text
        elif ':' in time_text:
            month, day = day_text.split()[:2]
            datetime_str = ' '.join([month, day, str(date.year), time_text])
            result = datetime.datetime.strptime(datetime_str, '%b %d %Y %I:%M%p')
        else:
            result = 'N/A'
        results.append(result)
    return results


def time_until_event(event_times, current_time):
    '''
    Convert event times into minutes relative to ``current_time``.

    event_times  : list whose items are ``datetime.datetime`` objects or
                   placeholder values (e.g. 'All Day', 'Tentative', 'N/A')
    current_time : ``datetime.datetime`` used as the reference point

    Returns a list of the same length where each item is:
      - a float number of minutes until the event, when it is not in the past
      - the tuple ('past', minutes_since_event) when it already happened
      - the original item, unchanged, when it is not a datetime
    '''
    event_time_list = []
    for moment in event_times:
        if not isinstance(moment, datetime.datetime):
            event_time_list.append(moment)
            continue
        delta = moment - current_time
        # total_seconds() keeps the day component; timedelta.seconds alone would
        # report an event 1 day + 5 minutes away as only 5 minutes away
        if delta >= datetime.timedelta(0):
            event_time_list.append(delta.total_seconds() / 60)
        else:
            # tuple marks a past event and shows how recent it was
            event_time_list.append(('past', (current_time - moment).total_seconds() / 60))
    return event_time_list


def detail_collection(use_symbols):
    '''
    Look up the details of every selected calendar row.

    use_symbols : iterable of (row index, currency, event id) triples

    Returns a dict keyed '<currency>_<row index>' whose values are whatever
    ``event_details(event id)`` returns for that row.
    '''
    return {f'{sym}_{i}': event_details(id_) for i, sym, id_ in use_symbols}


def forecast_indicator(soup, detail_dict, use_symbols):
    '''
    Judge, for every selected event, whether the forecast moved in the direction
    that the event's 'Usual Effect' text calls good for the currency.

    soup        : parsed calendar page
    detail_dict : mapping whose values hold the event's detail dict at index 1
    use_symbols : (row index, currency, event id) triples selecting table rows

    Returns one (revision, verdict) tuple per selected event:
      revision - second CSS class of the 'previous' value's tag when that tag
                 carries more than one class, otherwise 'none'
      verdict  - 'right', 'wrong' or 'same'; ('none', 'none') is returned when
                 the rule, the previous value or the forecast is unavailable
    '''

    def usual_effect(detail_dict):
        '''
        1 - 'Forecast' greater than 'Previous' good for currency
        0 - 'Forecast' less than 'Previous' good for currency
        'none' - Nothing to find
        '''
        effect = []
        for entry in detail_dict.values():
            details = entry[1]
            if 'Usual Effect' not in details:
                effect.append('none')
                continue
            desc = details['Usual Effect']
            # the text describes 'Actual' vs 'Forecast'; the same direction is
            # reused here to compare 'Forecast' against 'Previous'
            if 'greater' in desc:
                effect.append(1)
            elif 'less' in desc:
                effect.append(0)
            else:
                effect.append('none')
        return effect

    def find_previous_forecast(soup, use_symbols, prev_fore='previous'):
        '''
        Only two arguments for prev_fore are 'previous' and 'forecast'.
        '''
        index_vals = [i for i, _sym, _id in use_symbols]
        find_str = f'calendar__cell calendar__{prev_fore} {prev_fore}'
        val_list = []
        for item in soup.find(class_="calendar__table").find_all(class_=find_str):
            cont = item.contents
            # default covers empty cells, multi-child cells and unexpected node
            # types, so every cell yields exactly one entry and the row indices
            # used below stay aligned
            value = 'none'
            if len(cont) == 1:
                node = cont[0]
                typ = type(node)
                if typ == bs4.element.Tag:
                    classes = node.attrs['class']
                    if len(classes) > 1:
                        # keep the extra class next to the value
                        value = (classes[1], node.contents[0])
                    else:
                        value = node.contents[0]
                elif typ == bs4.element.NavigableString:
                    value = node
            val_list.append(value)
        return [val_list[i] for i in index_vals]

    def to_number(raw):
        # try the text as-is first, then without its last character (a unit
        # suffix such as '%'); anything else is reported as 'none'
        text = str(raw).strip()
        for candidate in (text, text[:-1]):
            try:
                return float(candidate)
            except ValueError:
                continue
        return 'none'

    def get_floats(val_list):
        # tuples keep their first element and get the same parsing as plain
        # values, so a suffix-less number no longer loses its last digit
        return [
            (val[0], to_number(val[1])) if isinstance(val, tuple) else to_number(val)
            for val in val_list
        ]

    def forecast_predict(prev_vals, fore_vals, rules):
        direction = []
        for rule, pre, fore in zip(rules, prev_vals, fore_vals):
            revision, pre_num = pre if isinstance(pre, tuple) else ('none', pre)
            # a forecast may arrive as a tuple too; only its number is compared
            fore_num = fore[1] if isinstance(fore, tuple) else fore
            if rule == 'none' or pre_num == 'none' or fore_num == 'none':
                direction.append(('none', 'none'))
                continue
            # rule 0: a lower forecast is good; rule 1: a higher forecast is good
            change = (pre_num - fore_num) if rule == 0 else (fore_num - pre_num)
            if change > 0:
                direction.append((revision, 'right'))
            elif change < 0:
                direction.append((revision, 'wrong'))
            else:
                direction.append((revision, 'same'))
        return direction

    rules = usual_effect(detail_dict)
    prev_vals = get_floats(find_previous_forecast(soup, use_symbols, prev_fore='previous'))
    fore_vals = get_floats(find_previous_forecast(soup, use_symbols, prev_fore='forecast'))

    return forecast_predict(prev_vals, fore_vals, rules)
    
    
def find_actual(soup, time_til_event):
    '''
    Describe the 'actual' cell of every row in the calendar table.

    soup           : parsed calendar page
    time_til_event : per-row list, index-aligned with the table rows; a tuple
                     entry marks a row whose event is already in the past

    Returns one string per row:
      - 'none'           empty cell of a past event, or a cell that cannot be read
      - 'still to come'  empty cell of an event that is not in the past
      - '<class name>'   first CSS class of the cell's single tag
      - 'neutral'        the cell holds plain text only
    '''
    cells = soup.find(class_="calendar__table").find_all(class_='calendar__cell calendar__actual actual')
    actual = []
    for i, item in enumerate(cells):
        cont = item.contents
        if len(cont) == 0:
            outcome = 'none' if isinstance(time_til_event[i], tuple) else 'still to come'
        elif len(cont) == 1:
            node = cont[0]
            typ = type(node)
            if typ == bs4.element.Tag:
                outcome = node.attrs['class'][0]
            elif typ == bs4.element.NavigableString:
                outcome = 'neutral'
            else:
                # unexpected node type: still emit an entry so later rows keep
                # their position in the returned list
                outcome = 'none'
        else:
            outcome = 'none'
        actual.append(outcome)
    return actual


def get_ff_alert(detail_dict):
    '''
    Flag which events carry an 'FF Alert' entry in their details.

    detail_dict : mapping whose values hold the event's detail dict at index 1

    Returns a list with 1 for every event whose details contain the key
    'FF Alert' and 0 otherwise, in the iteration order of detail_dict.
    '''
    return [
        1 if 'FF Alert' in list(entry[1].keys()) else 0
        for entry in detail_dict.values()
    ]


def symbols_to_use(symbol, event_ids, currencies):
    '''
    Pick the calendar rows that concern either currency of the pair.

    symbol     : currency pair string; the first three characters are the base
                 currency and the remainder is the quote currency
    event_ids  : per-row event ids, index-aligned with currencies
    currencies : per-row currency codes

    Returns a list of (row index, currency, event id) tuples for the rows whose
    currency equals the base or the quote currency.
    '''
    wanted = [symbol[:3], symbol[3:]]
    picked = []
    for position, code in enumerate(currencies):
        if code in wanted:
            picked.append((position, code, event_ids[position]))
    return picked


def event_sentiment(detail_dict):
    '''
    Label every event 'positive', 'negative' or 'neutral' from its texts.

    detail_dict : mapping whose values are indexed as (stories, details).
                  Every story, plus every details entry whose key contains
                  'traders' or 'ff notes' (case-insensitive), is cleaned with
                  text_process and scored with TextBlob.

    Scoring rules:
      - texts with subjectivity >= 0.7 are ignored
      - each remaining text votes positive, negative or neutral by the sign of
        its polarity
      - the event gets the polarity with the most votes; it is 'neutral' when
        there are no positive/negative votes, when neutral votes outnumber the
        leading polarity, or when positive and negative are tied

    Returns the list of labels in the iteration order of detail_dict.
    '''

    def get_sentiment(comment):
        cleaned = text_process(comment, tokenize=False)
        return TextBlob(cleaned).sentiment

    def vote(comment):
        # one vote per text; None means the text is too subjective to count
        sent = get_sentiment(comment)
        pol, subj = sent.polarity, sent.subjectivity
        if not subj < 0.7:
            return None
        if pol > 0:
            return 'positive'
        if pol < 0:
            return 'negative'
        if pol == 0:
            return 'neutral'
        return None

    def verdict(votes):
        tally = count(v for v in votes if v in ('positive', 'negative'))
        if not tally:
            return 'neutral'
        ranked = tally.most_common()
        top_label, top_count = ranked[0]
        if votes.count('neutral') > top_count:
            return 'neutral'
        # compare the vote counts themselves: a mix of polarities is only
        # neutral when the two counts are equal, otherwise the majority wins
        if len(ranked) > 1 and ranked[1][1] == top_count:
            return 'neutral'
        return top_label

    sentiments = []
    for entry in detail_dict.values():
        stories, details = entry[0], entry[1]
        texts = list(stories)
        texts.extend(
            text for spec, text in details.items()
            if 'traders' in spec.lower() or 'ff notes' in spec.lower()
        )
        sentiments.append(verdict([vote(text) for text in texts]))
    return sentiments


# this is the only function that we are pulling from
def FF_main_scrape(symbol):
    '''
    Scrape the Forex Factory calendar and collect the events of both currencies
    of ``symbol``.

    symbol : currency pair string such as 'EURUSD' (base = first three
             characters, quote = the rest)

    The request covers today only on a Friday and today plus tomorrow on any
    other day. One failed fetch is retried once after a two second pause; a
    second failure propagates to the caller.

    Returns a dict mapping each currency that has events to a list of dicts with
    the keys 'event_time', 'time_until', 'impact_level', 'actual_result',
    'forecast', 'ff_alert' and 'event_sentiment'.
    '''
    from calendar import month_abbr

    def url_day(moment):
        # calendar URL form of a day: '<abbreviated month, lower-case><day>.<year>'
        return '.'.join([month_abbr[moment.month].lower() + str(moment.day), str(moment.year)])

    def fetch_soup(url):
        scraper = cloudscraper.create_scraper()
        res = scraper.get(url)
        return bs4.BeautifulSoup(res.text, 'lxml')

    # scope the url to only view today and one day in advance
    now = datetime.datetime.now()
    tod = url_day(now)
    # weekday() == 4 is Friday and does not depend on the locale's day names
    if now.weekday() == 4:
        date_str = ''.join(['day=', tod])
    else:
        tom = url_day(now + datetime.timedelta(1))  # add a day
        date_str = ''.join(['range=', tod, '-', tom])
    search_url = ''.join(['https://www.forexfactory.com/calendar?', date_str])

    try:
        soup = fetch_soup(search_url)
    except Exception:
        print('Trying soup again...')
        time.sleep(2)
        soup = fetch_soup(search_url)
    print('Got the soup.')

    # the page shows no year; pass the year the URL was built with instead of
    # relying on find_date's literal '2020' default
    scrape_date = find_date(soup, year=str(now.year))
    event_times = find_event_time(soup, scrape_date)  # maybe useful
    time_til_events = time_until_event(event_times, scrape_date)  # useful
    # pull currencies and event ids
    currencies = currency_impacted(soup)  # symbol
    event_ids = find_eventid(soup)
    use_symbols = symbols_to_use(symbol, event_ids, currencies)
    # pull impact indication
    impact = event_impact(soup)  # indicator
    actuals = find_actual(soup, time_til_events)  # indicator
    detail_dict = detail_collection(use_symbols)
    forecasts = forecast_indicator(soup, detail_dict, use_symbols)  # indicator
    alerts = get_ff_alert(detail_dict)  # indicator
    sentiments = event_sentiment(detail_dict)  # indicator

    # per-row lists are indexed by table row (i); the lists built from
    # use_symbols are indexed by position within use_symbols (n)
    result_dict = {}
    for n, (i, sym, _) in enumerate(use_symbols):
        result_dict.setdefault(sym, []).append({
            'event_time': event_times[i],
            'time_until': time_til_events[i],
            'impact_level': impact[i],
            'actual_result': actuals[i],
            'forecast': forecasts[n],
            'ff_alert': alerts[n],
            'event_sentiment': sentiments[n],
        })
    return result_dict

# TWITTER SENTIMENT ANALYSIS
Performs a sentiment analysis of public tweets, and tallies traders' reported wins and losses, over the most recent `num_tweets` tweets fetched for each symbol.

[To top^^^](#Table-of-Contents:)

In [4]:
def text_process(comment, tokenize=True):
    '''
    Clean a piece of text before sentiment analysis.

    Steps, in order:
      1. drop every non-ASCII character (emojis, pictographs, symbols, ...)
      2. strip URLs, @user tags and digits from each space-separated token
      3. remove punctuation
      4. drop English stop words
      5. lemmatize the remaining words

    comment  : text to clean
    tokenize : True returns the list of words, False returns them joined by
               single spaces
    '''
    # strip emojis/pictographs/symbols/etc...
    comment = comment.encode('ascii', 'ignore').decode('ascii')
    cleaned_tokens = []
    for token in comment.split(' '):
        # clean out any url and user tag
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        # take out all numbers
        token = re.sub(r'\d+', '', token)
        cleaned_tokens.append(token)
    cleaned = ' '.join(cleaned_tokens)
    # remove punctuation
    nopunc = cleaned.translate(str.maketrans('', '', string.punctuation))
    # load the stop-word list once instead of once per word
    stop_words = set(stopwords.words('english'))
    kept = [word for word in nopunc.split() if word.lower() not in stop_words]
    # lemmatization - like stemming, trying to get the word to its root.
    # It is applied to whole words (iterating the joined string would hand the
    # lemmatizer single characters) and after the stop-word filter, so the
    # filter still sees the words as they were written
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in kept]
    if tokenize:
        return words
    return ' '.join(words)


# Twitter API access tokens, one entry per account.
# Each account is looked up by its key ('access_1', 'access_2') and must
# provide the four fields below; the values are left blank here.
access_dict = {
    'access_1': {
        'API_KEY': '',
        'API_SECRET': '',
        'ACCESS_TOKEN': '',
        'ACCESS_TOKEN_SECRET': '',
    },
    'access_2': {
        'API_KEY': '',
        'API_SECRET': '',
        'ACCESS_TOKEN': '',
        'ACCESS_TOKEN_SECRET': '',
    },
}

def set_api(key, keys):
    '''
    Return a tweepy API client for an account that still has search quota.

    key  : name of the account in ``access_dict`` to try first
    keys : list of account names still available; accounts found to be low on
           quota are removed from this list in place

    An account counts as exhausted when 30 or fewer '/search/tweets' calls
    remain. The next name in ``keys`` is then tried.

    Raises RuntimeError when every account in ``keys`` is exhausted.
    '''
    while True:
        creds = access_dict[key]
        auth = tw.OAuthHandler(creds['API_KEY'], creds['API_SECRET'])
        auth.set_access_token(creds['ACCESS_TOKEN'], creds['ACCESS_TOKEN_SECRET'])
        api = tw.API(auth)
        # check status of api
        status = api.rate_limit_status()['resources']['search']['/search/tweets']
        remaining, next_update = status['remaining'], status['reset']
        if remaining > 30:
            return api
        print('Changing api access tokens. Resetting in',round((next_update-time.time())/60, 2),'minutes\n')
        # drop the exhausted account so it is not tried again
        if key in keys:
            keys.remove(key)
        if not keys:
            raise RuntimeError('Every Twitter account in access_dict is at or below the search rate limit threshold.')
        key = keys[0]


def minute_tweet_freq(symbols, current_time, num_tweets=150):
    '''
    pass in symbols as a list
    200 tweets was roughly an hour of tweet backlog for 'EURUSD'

    Counts the fetched tweets created after the one-minute cutoff.

    symbols      : list of search terms; when several are given, the count of
                   the last one is returned
    current_time : ``datetime.datetime`` the cutoff is derived from
    num_tweets   : maximum number of tweets fetched per symbol

    Returns an int; 0 when ``symbols`` is empty.

    We will use this function to create our own NLP sentiment analysis.
    The process will be roughly collect tweets, collect past intraday data, and
    find a correlation between words and market trends
    '''
    # set api with first account in dict
    keys = list(access_dict.keys())
    api = set_api(keys[0], keys)
    # time difference is 6 hour for time zone, and subtract 60 seconds for past
    # minute of tweets (21540 s = 6 h - 60 s); the cutoff is the same for every
    # symbol, so it is computed once
    past_minute = pd.Timestamp(
        (current_time+datetime.timedelta(0,21540)).strftime('%Y-%m-%d %H:%M'))
    tweet_count = 0
    for symbol in symbols:
        # when using tweet_mode='extended', the 'text' attribute becomes 'full_text'
        public_tweets = tw.Cursor(api.search, q=symbol, lang='en', tweet_mode='extended').items(num_tweets)
        tweet_count = sum(1 for tweet in public_tweets if tweet.created_at > past_minute)
    return tweet_count


# function for machine
def twitter_sentiment(symbols, current_time, num_tweets=350):
    '''
    pass in symbols as a list

    symbols      : list of search terms, one result entry is produced per term
    current_time : ``datetime.datetime`` the time cutoffs are derived from
    num_tweets   : maximum number of tweets fetched per symbol

    Returns (public_sentiment, trader_sentiment, tweet_freq):
      public_sentiment - per symbol (label, mean score) of the TextBlob polarity
                         votes (+1 / 0 / -1) of tweets with subjectivity < 0.7;
                         ('neutral', 0) when no tweet qualifies
      trader_sentiment - per symbol (label, mean score) of the win (+1) / loss
                         (-1) outcomes read from "Closed ..." tweets;
                         ('none', 0) when there are none
      tweet_freq       - number of tweets inside the one-minute cutoff for the
                         last symbol; 0 when ``symbols`` is empty
    Labels are 'positive' above 0.1, 'negative' below -0.1, else 'neutral'.
    '''

    def summarize(scores, empty_label):
        # an empty list has no mean; report a zero score instead of dividing by 0
        if len(scores) == 0:
            return (empty_label, 0)
        mean = np.sum(scores)/len(scores)
        if mean > 0.1:
            return ('positive', mean)
        if mean < -0.1:
            return ('negative', mean)
        return ('neutral', mean)

    # set api with first account in dict
    keys = list(access_dict.keys())
    api = set_api(keys[0], keys)
    # set variables
    past_tweet = str()
    public_sentiment, trader_sentiment = [], []
    tweet_freq = 0
    # time difference is 6 hour for time zone, and subtract 60 seconds for past minute of tweets
    past_minute = pd.Timestamp(
        (current_time+datetime.timedelta(0,21540)).strftime('%Y-%m-%d %H:%M'))
    # collect past 2 hours of tweets to create sentiment from
    past_2hrs = pd.Timestamp(
        (current_time+datetime.timedelta(0,14400)).strftime('%Y-%m-%d %H:%M'))

    for symbol in symbols:
        # when using tweet_mode='extended', the 'text' attribute becomes 'full_text'
        public_tweets = tw.Cursor(api.search, q=symbol, lang='en', tweet_mode='extended').items(num_tweets)
        recent_tweets = [(tweet.full_text, tweet.created_at) for tweet in public_tweets
                         if tweet.created_at >= past_2hrs]
        # counting amount of tweets
        tweet_freq = len([created for _text, created in recent_tweets if created >= past_minute])
        results, trade_results, clean_tweets = [], [], []
        # sentiment analysis
        for text, _created in recent_tweets:
            # we will use the 'match' to determine if a trader won or lost a position
            # these traders were found during the analysis, a tweet whenever a position is closed
            # assuming the position was taken off of an analysis, the win or loss can help us identify sentiment
            match = re.findall(fr'Closed \w+[ \w+ ]?\w+ \d+.\d+ Lots [$#]?{symbol}|Closed \w+ {symbol}', text)
            if len(match) > 0:
                outcome_hits = re.findall(r'for \W?', text)
                # without a 'for ...' part there is no outcome to read
                if outcome_hits:
                    win_lose = outcome_hits[0].split()
                    if len(win_lose) > 1:
                        outcome = win_lose[1]
                        if outcome == '+': trade_results.append(1)
                        elif outcome == '-': trade_results.append(-1)
                    else: trade_results.append(-1)
                # we dont want to include this in the sentiment analysis so we will just continue to the next tweet
                continue
            # checking to see if a bot RT multiple times (found during analysis)
            if text == past_tweet:
                continue
            past_tweet = text
            cleanedtext = text_process(text, tokenize=False)
            # checking if a RT came up again from the same user
            if cleanedtext in clean_tweets:
                continue
            clean_tweets.append(cleanedtext)
            sentiment = TextBlob(cleanedtext).sentiment
            pol, subj = sentiment.polarity, sentiment.subjectivity
            if subj < 0.7:
                if pol > 0: results.append(1)
                elif pol < 0: results.append(-1)
                else: results.append(0)

        public_sentiment.append(summarize(results, 'neutral'))
        trader_sentiment.append(summarize(trade_results, 'none'))

    return public_sentiment, trader_sentiment, tweet_freq

# DATAMINING FUNCTION

[To top^^^](#Table-of-Contents:)

In [5]:
def create_training_data(SYMBOL, pos_time_int=3.0):
    '''
    pos_time_int == position time interval - indicates actual time interval used on Videforex
    '''
    global df, entry
    global past_str, BS_count, QS_count, recent_chg
    global st
    global event_time_dict
    
    # identify base/quote currencies
    base_cur = SYMBOL[:3]
    quote_cur = SYMBOL[3:]
    # create empty dictionary for tracking currency strength
    try:
        base_str, quote_str, str_rat, cur_dict = get_currency_strength(symbol=SYMBOL)
    except:
        e = sys.exc_info()
        print(f'livecharts.co.uk/currency-strength.php unavailable for the moment:\ndue to:\n{e[0]}\n{e[1]}')
        base_str, quote_str, str_rat = (0,0,0)
    past_str = {'base':base_str,'quote':quote_str}
    BS_count = 0
    QS_count = 0
    recent_chg = {'base':0, 'quote':0}
    # create empty dict for appending results
    store_entry = {}
    # create empty dict for adding FF event times and adding arbitrary large value to key 'update_time' for initial loop
    event_time_dict = {}
    # create empty dict for storing time to update twitter sentiment
    twitter_time = {}
    # initialize loop count
    i = 0
    
    print('CURRENCY PAIR:',SYMBOL,'\n')
    while True:
        st = time.time()
        print('START TIME:', datetime.datetime.now())
        if i==0:
            entry, df = get_current_data(SYMBOL=SYMBOL, data=None, first=True)
            # this is where we will create first instance of FF data
        else: entry, df = get_current_data(SYMBOL=SYMBOL, data=df.copy(), first=False)
        # twitter sentiment analysis
        # each symbol mined is roughly 5 seconds at 150 tweets
        # we can plan to pull data from this every 5 minutes
        if i==0:
            twitter_time['twitter_start'] = datetime.datetime.now()
            print('TWITTER SENTIMENT ANALYSIS BEGINING ...                                        Time elapsed:',round((time.time() - st), 2),'s')
            twit_sent_pub, twit_sent_trad, freq = twitter_sentiment([SYMBOL], datetime.datetime.now(), num_tweets=350)
        else:
            try:
                secs = 900 # 15 minutes
                if (datetime.datetime.now() - twitter_time['twitter_start']).seconds > secs: # 15 minutes
                    twitter_time['twitter_start'] = twitter_time['twitter_start'] + datetime.timedelta(0,secs)
                    print('TWITTER SENTIMENT ANALYSIS BEGINING ...                                        Time elapsed:',round((time.time() - st), 2),'s')
                    # 200 tweets is a little over an hour of tweets - on average?
                    try: twit_sent_pub, twit_sent_trad, freq = twitter_sentiment([SYMBOL], datetime.datetime.now(), num_tweets=350)
                    except:
                        print('Twitter not available.')
                        twit_sent_pub, twit_sent_trad, freq = [(0,0)],[(0,0)],0
                else:
                    print('Geting tweet frequency over the last minute.')
                    freq = minute_tweet_freq([SYMBOL], datetime.datetime.now(), num_tweets=150)
            except: 
                print('Twitter unavailable.')
                pass
        # we are only going to record the numerical polarity summation
        entry['ind__pub_twit_pol'] = twit_sent_pub[0][1]
        entry['ind__trad_twit_pol'] = twit_sent_trad[0][1]
        entry['ind__numtweets_1min'] = freq
        print('DONE ...                                                                       Time elapsed:',round((time.time() - st), 2),'s')
        
# -----------------------------------------------------------------------------------------------------------------------------        
        
        # FF data pull - keep this in here to keep 'event_time_dict' local
        ## ACCESS CURRENTLY DENIED - negative
        # ran into a 1020 Error where my IP was blocked, should look at rotating IP addresses or getting on a VPN
        
        # maybe make initial pull a function to use in 'else' statement as well
        # WE SHOULD HAVE THIS IF STATEMENT CHECK IF THERE IS INFORMATION IN event_time_dict...
        if i==0:
            # if we dont do anything with the time below, just replace in the print statement with datetime object
            print('SCRAPING FOREX FACTORY WEBSITE ...                                             Time elapsed:',round((time.time() - st), 2),'s')
            FF_dict = FF_main_scrape(SYMBOL)
            try: base_events = FF_dict[base_cur]
            except: base_events = 'empty'
            try: quote_events = FF_dict[quote_cur]
            except: quote_events = 'empty'
            # time until next events
            try:
                event_time_dict['next_event_base'] = min([time for time in [base['time_until'] for base in base_events] \
                                                      if type(time) in [int, float]])
            except:
                print('No more events in near future for base currency %s' % base_cur)
                event_time_dict['next_event_base'] = 'none'
            try:
                event_time_dict['past_event_base'] = min([time[1] for time in [base['time_until'] for base in base_events] \
                                                     if type(time)==tuple])
            except: 
                print('No past events for base currency %s' % base_cur)
                event_time_dict['past_event_base'] = 'none'
            try:
                event_time_dict['next_event_quote'] = min([time for time in [quote['time_until'] for quote in quote_events] \
                                                       if type(time) in [int, float]])
            except:
                print('No more events in near future for quote currency %s' % quote_cur)
                event_time_dict['next_event_quote'] = 'none'
            try:
                event_time_dict['past_event_quote'] = min([time[1] for time in [quote['time_until'] for quote in quote_events] \
                                                     if type(time)==tuple])
            except:
                print('No past events for quote currency %s' % quote_cur)
                event_time_dict['past_event_quote'] = 'none'
            # set time for updating scrape next
            try:
                if type(event_time_dict['next_event_base']) in [float, int] and type(event_time_dict['next_event_quote']) in [float, int]:
                    event_time_dict['update_time'] = min(event_time_dict['next_event_base'],event_time_dict['next_event_quote'])
                elif type(event_time_dict['next_event_base']) in [float, int]:
                    event_time_dict['update_time'] = event_time_dict['next_event_base']
                elif type(event_time_dict['next_event_quote']) in [float, int]:
                    event_time_dict['update_time'] = event_time_dict['next_event_quote']
                else:
                    print('No upcoming events ... %s' % SYMBOL)
                    event_time_dict['update_time'] = 'none'
            except:
                print('No upcoming events ... %s' % SYMBOL)
                event_time_dict['update_time'] = 'none'
        else:
            # subtract minute from event times because we will have waited 1 minute since updating dataframe
            if event_time_dict['update_time'] != 'none': event_time_dict['update_time'] -= 1
            if event_time_dict['next_event_quote'] != 'none': event_time_dict['next_event_quote'] -= 1
            if event_time_dict['past_event_quote'] != 'none': event_time_dict['past_event_quote'] += 1
            if event_time_dict['next_event_base'] != 'none': event_time_dict['next_event_base'] -= 1
            if event_time_dict['past_event_base'] != 'none': event_time_dict['past_event_base'] += 1
            # create a method around when to search for this, near a currency symbol effected time
            print('Minutes until next event: ',event_time_dict['update_time'])
            # time is in MINUTES until next event
            if event_time_dict['update_time'] == 'none':
                print('No upcoming events ... %s' % SYMBOL)
                #
                #
                # we should set a timer here to call function FF_main_scrape() after however many hours we want
                #
                #
                pass                
            elif event_time_dict['update_time']==10 or event_time_dict['update_time']<0:
                # reset all times for next event coming up
                print('SCRAPING FOREX FACTORY WEBSITE ...                                         Time elapsed:',round((time.time() - st), 2),'s')
                FF_dict = FF_main_scrape(SYMBOL)
                base_events = FF_dict[base_cur]
                quote_events = FF_dict[quote_cur]
                # time until next events
                try:
                    event_time_dict['next_event_base'] = min([time for time in [base['time_until'] for base in base_events] \
                                                          if type(time) in [int, float]])
                except:
                    print('No more events in near future for base currency %s' % base_cur)
                    event_time_dict['next_event_base'] = 'none'
                try:
                    event_time_dict['past_event_base'] = min([time[1] for time in [base['time_until'] for base in base_events] \
                                                         if type(time)==tuple])
                except: 
                    print('No past events for base currency %s' % base_cur)
                    event_time_dict['past_event_base'] = 'none'
                try:
                    event_time_dict['next_event_quote'] = min([time for time in [quote['time_until'] for quote in quote_events] \
                                                           if type(time) in [int, float]])
                except:
                    print('No more events in near future for quote currency %s' % quote_cur)
                    event_time_dict['next_event_quote'] = 'none'
                try:
                    event_time_dict['past_event_quote'] = min([time[1] for time in [quote['time_until'] for quote in quote_events] \
                                                         if type(time)==tuple])
                except:
                    print('No past events for quote currency %s' % quote_cur)
                    event_time_dict['past_event_quote'] = 'none'
                # set time for updating scrape next
                try:
                    if type(event_time_dict['next_event_base']) in [float, int] and type(event_time_dict['next_event_quote']) in [float, int]:
                        event_time_dict['update_time'] = min(event_time_dict['next_event_base'],event_time_dict['next_event_quote'])
                    elif type(event_time_dict['next_event_base']) in [float, int]:
                        event_time_dict['update_time'] = event_time_dict['next_event_base']
                    elif type(event_time_dict['next_event_quote']) in [float, int]:
                        event_time_dict['update_time'] = event_time_dict['next_event_quote']
                    else:
                        print('No upcoming events ... %s' % SYMBOL)
                        event_time_dict['update_time'] = 'none'
                except:
                    print('No upcoming events ... %s' % SYMBOL)
                    event_time_dict['update_time'] = 'none'
        # we shouldn't have conflicting datatypes (int, string)
        # what should 'none' be instead
        entry['ind__base_mins_to_next'] = event_time_dict['next_event_base']
        entry['ind__quote_mins_to_next'] = event_time_dict['next_event_quote']            
        # looking at trading view and FF, we will use 1hr as a benchmark for 'recent event' effecting the market
        # this value is based on how the market acted around recent events, but is also not a perfect time
        # maybe an analysis should be done on this to determine best time
        try:
            if event_time_dict['past_event_base'] <= 60: rec_base = 1
            else: rec_base = 0
        except: rec_base = 0
        try:
            if event_time_dict['past_event_quote'] <= 60: rec_quote = 1
            else: rec_quote = 0
        except: rec_quote = 0
        entry['ind__base_recentevent'] = rec_base
        entry['ind__quote_recentevent'] = rec_quote
        # add on rest of indicators
        if base_events != 'empty': entry['ind__base_numevents'] = len([event for event in base_events if \
                                                                      type(event['time_until']) in [float, int]])
        else: entry['ind__base_numevents'] = 0
        if quote_events != 'empty': entry['ind__quote_numevents'] = len([event for event in quote_events if \
                                                                      type(event['time_until']) in [float, int]])
        else: entry['ind__quote_numevents'] = 0
        # find IMPACT of upcoming event and most recent event, for both currencies
        # also get the forecast of these events, and the actual outcome of past events
        # we are going to string together the impact with the actual/forecast        
        # upcoming
        for cur in ['base','quote']:
            if cur=='base': cur_ = base_events
            else: cur_ = quote_events
            if event_time_dict['next_event_%s' % cur] == 'none': 
                ev_fore = 'none'
                ev_alert = 'none'
                ev_sent = 'none'
            else:
                # check if there are more than 1 events coming up
                upcoming = [item for item in cur_ if type(item['time_until']) in [int, float]]
                upcoming.sort(key=lambda val:val['time_until'])
                # now check upcoming forecast and ff alerts
                # pull index 1 from the tuple - index 1 is the forecast prediction, index 0 shows if previous was updated
                forecast = [(fore['impact_level'],fore['forecast'][1]) for fore in [event for event in upcoming \
                                                             if event['time_until']==upcoming[0]['time_until']]]
                ff_alert = [(alert['impact_level'],str(alert['ff_alert'])) for alert in [event for event in upcoming if event['time_until']==\
                                                            upcoming[0]['time_until']] if alert['ff_alert']==1]
                sentiment = [(sent['impact_level'],sent['event_sentiment']) for sent in [event for event in upcoming \
                                                             if event['time_until']==upcoming[0]['time_until']]]                
                # compile event indicators
                # check event forecast
                event_forecast = ['_'.join(forecast[i]) for i in range(len(forecast))]
                if len(event_forecast)==1: ev_fore = event_forecast[0]
                else:
                    if len(np.unique(event_forecast))==1: ev_fore = '_'.join(['multiple',event_forecast[0]])
                    else: ev_fore = '__'.join(event_forecast) # double underscore to seperate events
                # check ff alerts
                if len(ff_alert)==0: ev_alert = 'none'
                else: 
                    event_ffalert = ['_'.join(ff_alert[i]) for i in range(len(ff_alert))]
                    if len(event_ffalert)==1: ev_alert = event_ffalert[0]
                    else:
                        if len(np.unique(event_ffalert))==1: ev_alert = '_'.join(['multiple',event_ffalert[0]])
                        else: ev_alert = '__'.join(event_ffalert) # double underscore to seperate events
                # check sentiments
                event_sentiment = ['_'.join(sentiment[i]) for i in range(len(sentiment))]
                if len(event_sentiment)==1: ev_sent = event_sentiment[0]
                else:
                    if len(np.unique(event_sentiment))==1: ev_sent = '_'.join(['multiple',event_sentiment[0]])
                    else: ev_sent = '__'.join(event_sentiment) # double underscore to seperate events
            entry['ind__%s_foreimpact' % cur] = ev_fore
            entry['ind__%s_ffalert' % cur] = ev_alert
            entry['ind__%s_nextevent_sent' % cur] = ev_sent
        # past
        # need to identify if still in the 'recent event' range - currently an hour
        for cur in ['base','quote']:
            if cur=='base': cur_ = base_events
            else: cur_ = quote_events
            if event_time_dict['past_event_%s' % cur] == 'none': 
                ev_act = 'none'
                ev_sent = 'none'
            else:
                # first check if there was an event in the past hour
                # don't need to check like forecast, because 'ind__{cur}_recentevent' indicator will be 0 if 'none' is present
                if entry['ind__%s_recentevent'  % cur][0] == 0: ev_act = 'none'
                else:
                    most_recent = [item for item in cur_ if type(item['time_until']) in [tuple]]
                    most_recent.sort(key=lambda val:val['time_until'][1])
                    # check if there are more than 1 events coming up
                    actual = [(act['impact_level'],act['actual_result']) for act in [event for event in most_recent \
                                                               if event['time_until'][1]==most_recent[0]['time_until'][1]]]
                    sentiment = [(sent['impact_level'],sent['event_sentiment']) for sent in [event for event in most_recent \
                                                                 if event['time_until'][1]==most_recent[0]['time_until'][1]]]
                    # check acutal
                    event_actual = ['_'.join(actual[i]) for i in range(len(actual))]
                    if len(event_actual)==1: ev_act = event_actual[0]
                    else:
                        if len(np.unique(event_actual))==1: ev_act = '_'.join(['multiple',event_actual[0]])
                        else: ev_act = '__'.join(event_actual) # double underscore to seperate events
                    # check sentiment
                    event_sentiment = ['_'.join(sentiment[i]) for i in range(len(sentiment))]
                    if len(event_sentiment)==1: ev_sent = event_sentiment[0]
                    else:
                        if len(np.unique(event_sentiment))==1: ev_sent = '_'.join(['multiple',event_sentiment[0]])
                        else: ev_sent = '__'.join(event_sentiment) # double underscore to seperate events
            entry['ind__%s_pastimpact' % cur] = ev_act
            entry['ind__%s_pastevent_sent' % cur] = ev_sent
        print('Done ...                                                                       Time elapsed:',round((time.time() - st), 2),'s')

# ---------------------------------------------------------------------------------------------------------------------------
        
        store_entry['entry_'+str(i)] = entry
        # save entry as the latest value to predict
        entry.to_csv(f'data/predict/to_predict_{SYMBOL}_{pos_time_int}_min_pos.csv')
        print('Entry ready for prediction.')
        # initialize to_pop list
        to_pop = []
        for key in store_entry.keys():
            ent = store_entry[key]
            # check the last 3 data entries just in case of a missed time entry due to delay in program
            for check_time in df[-3:].index:
                if (check_time - ent.index[0]) / np.timedelta64(1,'m') == pos_time_int:
                    result = df.loc[check_time:].Close[0] - ent.Close[0]
                    ent['expiration'] = check_time
                    ent['exch_rate_dif'] = result
                    if result > 0: ent['direction'] = 'up'
                    elif result < 0: ent['direction'] = 'down'
                    else: ent['direction'] = 'N/A'
                    to_path = f'../data/full_data/updated_apicall_data/ext_{SYMBOL}_train_{pos_time_int}_min_pos.csv'
                    search_path = '../data/full_data/updated_apicall_data/*'
                    check_file = f'../data/full_data/updated_apicall_data\\ext_{SYMBOL}_train_{pos_time_int}_min_pos.csv'
                    if check_file in glob.glob(search_path):
                        training_df = pd.read_csv(to_path, index_col='date')
                        t_df = pd.concat([training_df, ent])
                        t_df.to_csv(to_path)
                        print(SYMBOL,'Training Data shape:',t_df.shape,'\n')
                    else:
                        ent.to_csv(to_path)
                        print('Dataframe',to_path,'initialized ... \n')
                    to_pop.append(key)
                    break
        for item in to_pop:
            store_entry.pop(item)
        
        print('RUNTIME:',round(time.time()-st, 2),'seconds\n')
        time.sleep(5)
        i+=1

# Execute Datamining
[To top^^^](#Table-of-Contents:)

# MAKE SURE TO START PROGRAM ON THE EARLY SIDE OF A MINUTE WHEN DATAMINING

In [None]:
from IPython.display import clear_output
# %pdb on ##-- use this for debugging

def main(SYMBOL='EURJPY', pos_time_int=3):  ## EURCAD AUDNZD
    """Start the datamining run for one currency pair.

    Forwards both settings, by keyword, to ``create_training_data``.

    Parameters
    ----------
    SYMBOL : str
        Currency pair to collect data for, e.g. ``'EURJPY'``.
    pos_time_int : int
        Length of the position in minutes; a stored entry is labelled once a
        price exactly this many minutes after it becomes available.

    Returns
    -------
    None
    """
    # NOTE(review): the pairs in the trailing comment on the def line look
    # like alternative symbols used in other runs - confirm.
    run_settings = {'SYMBOL': SYMBOL, 'pos_time_int': pos_time_int}
    create_training_data(**run_settings)

# Wait until livecharts is reachable before datamining, and start the run again
# one minute after any failure so an unattended session keeps collecting data.
initial = datetime.datetime.now()  # moment this cell started waiting
while True:
    clear_output(wait=True)
    # Probe the website on its own so that only a failed request is reported as
    # "not up yet". The timeout keeps a stalled connection from blocking forever.
    try:
        requests.get('http://livecharts.co.uk/', timeout=30)
    except requests.exceptions.RequestException as err:
        print('Not up yet, waiting one minute.')
        print('Reason:', repr(err))
        time.sleep(60)
        continue
    print('Website up.')
    # hold off until the early part of a minute (seconds 0-8) before starting;
    # sleep first, then re-read the clock, so the test never uses a stale time
    now = datetime.datetime.now()
    while now.second > 8:
        time.sleep(0.5)
        now = datetime.datetime.now()
    # for other notebooks, uncomment below line
    # time.sleep(n) ## n is number of seconds to wait so that we don't overload at 0 second
    print('STARTING ...\n')
    try:
        main()
    except Exception as err:
        # KeyboardInterrupt and SystemExit do not derive from Exception, so
        # interrupting the kernel still stops the loop. Every other failure is
        # reported with its cause and the run is started again after a minute.
        print('Datamining run failed:', repr(err))
        print('Restarting in one minute.')
        time.sleep(60)

### Adjusted from full_datamining_framework
- new trendline analysis and indicators
- surfing sensitivity adjusted to 85% - was 75%

### Adjustments made for the new API call, Twitter sentiment, and surfing indicator
- data from the new API calls will be more accurate in the database
- added a feature for the number of tweets in the past minute
- adjusted the surfing indicator feature -- it is now the position of the close price within the range of the Bollinger Band, expressed as a percent
    - made it rolling