In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import io
import requests
import datetime
import ondemand
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from scipy.stats import pearsonr
from plotly.offline import init_notebook_mode, iplot
from pandas.io.common import EmptyDataError
from requests.exceptions import HTTPError
from plotly.graph_objs import *
import plotly.io as pio
import time
import os

In [4]:
import argparse
from apiclient.discovery import build
#from oauth2client.service_account import ServiceAccountCredentials
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
####Google Analytics Module
class GA(object):
    """Fetch daily Google Analytics page views for one symbol's quote page.

    After ``main(symbol)`` succeeds, ``page_view`` holds a DataFrame with the
    columns ``date`` (``YYYY-MM-DD`` strings) and ``page_views`` (the raw
    metric values as returned by the API). It stays ``None`` when the report
    contains no rows for the symbol.
    """

    def __init__(self, start_date, end_date):
        self.page_view = None
        self.start_date = start_date
        self.end_date = end_date

    def initialize_analyticsreporting(self):
        """Initializes the analyticsreporting service object.

        Returns:
            analytics: an authorized analyticsreporting service object.
        """
        SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
        DISCOVERY_URI = ('https://analyticsreporting.googleapis.com/$discovery/rest')
        CLIENT_SECRETS_PATH = 'client_secrets.json' # Path to client_secrets.json file.
        # Parse command-line arguments. An explicit empty list is parsed, so
        # the arguments of the hosting process (e.g. the notebook kernel) are
        # never looked at.
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            parents=[tools.argparser])
        flags = parser.parse_args([])

        # Set up a Flow object to be used if we need to authenticate.
        flow = client.flow_from_clientsecrets(
            CLIENT_SECRETS_PATH, scope=SCOPES,
            message=tools.message_if_missing(CLIENT_SECRETS_PATH))

        # Prepare credentials, and authorize HTTP object with them.
        # If the credentials don't exist or are invalid run through the native client
        # flow. The Storage object will ensure that if successful the good
        # credentials will get written back to a file.
        storage = file.Storage('analyticsreporting.dat')
        credentials = storage.get()
        if credentials is None or credentials.invalid:
            credentials = tools.run_flow(flow, storage, flags)
        http = credentials.authorize(http=httplib2.Http())

        # Build the service object.
        analytics = build('analytics', 'v4', http=http, discoveryServiceUrl=DISCOVERY_URI)
        return analytics

    def get_report(self, analytics, symbol):
        """Query the Reporting API v4 for per-day page views of ``symbol``.

        Args:
            analytics: service object from ``initialize_analyticsreporting``.
            symbol: ticker that is inserted into the page-path filter.

        Returns:
            The response of ``reports().batchGet(...).execute()``.
        """
        VIEW_ID = '108608708'#barchart.com view_id
        # Set filter pagePath according to different symbol.
        # NOTE(review): the pattern is not anchored after the symbol, so
        # presumably 'A' also matches the quote page of 'AAPL' - confirm.
        final_filter = 'ga:pagePath=~(?i)/stocks/quotes/{}'.format(symbol)
        # Use the Analytics Service Object to query the Analytics Reporting API V4.
        return analytics.reports().batchGet(
            body={
                'reportRequests': [
                    {
                        'viewId': VIEW_ID,
                        'dateRanges': [{'startDate': self.start_date, 'endDate': self.end_date}],
                        'metrics': [{'expression': 'ga:pageviews'}],
                        'filtersExpression': final_filter,
                        'dimensions': [{'name': 'ga:date'}],
                    }
                ]
            }
        ).execute()

    def main(self, symbol):
        """Download the report for ``symbol`` and store it in ``page_view``.

        ``page_view`` is reset to ``None`` first and is only assigned when the
        report holds at least one row; otherwise a message is printed. Errors
        raised by the API request itself are not caught here.
        """
        self.page_view = None
        no_data_message = symbol + ' does not have pageview!!!'
        analytics = self.initialize_analyticsreporting()
        response = self.get_report(analytics, symbol)
        reports = response.get('reports', [])
        try:
            rows = reports[0].get('data', {}).get('rows', []) if reports else []
            daily_views = {}
            for row in rows:
                raw_date = row['dimensions'][0]  # 'YYYYMMDD'
                iso_date = raw_date[:4] + '-' + raw_date[4:6] + '-' + raw_date[6:]
                daily_views[iso_date] = row['metrics'][0]['values']
            if not daily_views:
                # An empty report would otherwise produce a frame without a
                # 'page_views' column, which breaks every consumer.
                print(no_data_message)
                return
            frame = pd.DataFrame.from_dict(daily_views, orient='index')
            frame = frame.reset_index()
            frame = frame.rename(index=str, columns={'index': 'date', 0: "page_views"})
        except Exception as exc:
            # Best effort: a malformed response skips this symbol, but the
            # reason is reported and KeyboardInterrupt/SystemExit still
            # propagate (a bare ``except:`` would swallow them).
            print(no_data_message + ' (' + repr(exc) + ')')
            return
        self.page_view = frame

In [3]:
from multiprocessing import Pool as ThreadPool
from multiprocessing import Manager
class PAGE_VIEW(object):
    """Collect price history and page views per symbol and join them by date.

    Page views are fetched by ``GA`` in a ``multiprocessing`` pool; the
    workers write their results into ``page_views``, a ``Manager().dict()``
    proxy shared with the parent process.
    """

    # Symbols handed to the worker pool per batch, and the pause between two
    # batches in seconds.
    # NOTE(review): presumably chosen to stay below an API quota - confirm.
    BATCH_SIZE = 80
    BATCH_PAUSE_SECONDS = 100
    # Number of worker processes used for the page-view requests.
    POOL_SIZE = 10

    def __init__(self, start_date, end_date):
        # TODO: validate the dates and report an error if they are illegal.
        self.start_date = start_date
        self.end_date = end_date
        self.page_views = Manager().dict()

    def get_history(self, symbols):
        """Download the daily price history of every symbol.

        Returns:
            dict mapping symbol -> DataFrame with ``tradingDay`` renamed to
            ``date`` and ``close`` renamed to ``close_price($)``. Symbols
            whose download fails are left out; symbols whose download is
            empty are kept, and a message is printed in both cases.
        """
        mydata = {}
        for t in symbols:
            url_history = 'https://ondemand.websol.barchart.com/getHistory.csv?apikey=OnDemand&symbol={}&type=daily&startDate={}&endDate={}&maxRecords=1000&interval=60&order=asc&sessionFilter=EFK&splits=true&dividends=true&volume=sum&nearby=1&jerq=true&exchange=NYSE%2CAMEX%2CNASDAQ&backAdjust=false&daysToExpiration=1&contractRoll=expiration'.\
                format(t, self.start_date, self.end_date)
            try:
                data_history = pd.read_csv(url_history)
            except Exception:
                # Best effort per symbol, but do not swallow
                # KeyboardInterrupt/SystemExit like a bare ``except:`` would.
                print(t + ' has no history data during this period')
                continue
            data_history = data_history.rename(
                columns={'tradingDay': 'date', 'close': 'close_price($)'})
            mydata[t] = data_history
            if data_history.empty:
                print(t + ' has no history data during this period')
        return mydata

    ### multiprocessing
    def parallel(self, item):
        """Worker task: fetch the page views of one symbol and publish them."""
        ga = GA(self.start_date, self.end_date)
        ga.main(item)
        if ga.page_view is None:
            return
        self.page_views[item] = ga.page_view

    def get_page_view(self, all_symbols):
        """Run ``parallel`` for every symbol in a pool of worker processes."""
        pool = ThreadPool(self.POOL_SIZE)
        try:
            pool.map(self.parallel, all_symbols)
        finally:
            # Release the workers even when one of the tasks raised.
            pool.close()
            pool.join()

    ### combine history and pageview into one dataframe
    def main(self, symbols, re_run = False):
        """Build one merged (history + page views) DataFrame per symbol.

        Args:
            symbols: list of tickers. It is modified IN PLACE: every symbol
                without both a price history and page views is removed, so
                afterwards it lists exactly the keys of the returned dict.
            re_run: accepted for compatibility; not used here.

        Returns:
            dict mapping symbol -> history merged with page views on ``date``.
        """
        history = self.get_history(symbols)
        pending = symbols[:]
        for start in range(0, len(pending), self.BATCH_SIZE):
            self.get_page_view(pending[start:start + self.BATCH_SIZE])
            if start + self.BATCH_SIZE < len(pending):
                # Pause only when another batch follows.
                time.sleep(self.BATCH_PAUSE_SECONDS)

        final_result = {}
        to_remove = []
        # No "is there any page view" guard: symbols must also be pruned when
        # nothing at all came back, otherwise callers iterate over symbols
        # that are missing from the result.
        for item in symbols:
            try:
                prices = history[item]
                # Fetch once: indexing the manager proxy returns a copy, so
                # changes made to a second lookup would be lost.
                views = self.page_views[item]
                prices['date'] = prices['date'].apply(str).str[:10]
                views['date'] = views['date'].apply(str)
            except KeyError:
                to_remove.append(item)
                print('Skip ' + item+' !!!')
                continue
            final_result[item] = pd.merge(prices, views, on='date')
        for item in to_remove:
            symbols.remove(item)
        return final_result

In [4]:
### Entry module: we use it to plot the graphs and save them as local files.
class Barchart(object):
    def __init__(self):
        self.myrawdata = None
        self.start_date = None
        self.end_date = None
    def get_earning_date(self, symbol):
        url_earning = 'https://ondemand.websol.barchart.com/getCorporateActions.csv?apikey=ondemand&symbols={}&startDate={}&endDate={}&eventType=earnings&maxRecords=5'.\
                       format(symbol, self.start_date, self.end_date)
        try:
            earning_date = pd.read_csv(url_earning)
            return earning_date
        except EmptyDataError as ede:
            print(symbol + ' does not have earning_date during this period!!!')
            return pd.DataFrame([])
    # Get the outstanding shares for the market value calculation.
    def get_shares(self, symbol):
        """Return the ``sharesOutstanding`` value reported for ``symbol``.

        Returns:
            The value of the first row's ``sharesOutstanding`` column, or
            ``None`` (after printing a message) when the response holds no
            data or the request fails.
        """
        # pandas fetches URLs through urllib, so download failures surface as
        # urllib.error.URLError (HTTPError is a subclass), not as the
        # requests exception of the same name.
        from urllib.error import URLError

        url = 'https://ondemand.websol.barchart.com/getFinancialHighlights.csv?apikey=OnDemand&symbols={}&fields=lastQtrEPS%2CannualEPS%2CttmEPS'.\
                format(symbol)
        try:
            fundmantal = pd.read_csv(url)
        except EmptyDataError:
            # A completely empty response raises instead of giving an empty frame.
            print(symbol + ' does not have public market value!!!')
            return None
        except URLError:
            print('getFinancialHighlights API Error!!! Run again')
            return None
        if fundmantal.empty:
            print(symbol + ' does not have public market value!!!')
            return None
        return fundmantal.loc[0, 'sharesOutstanding']
    # To plot the symbols with the highest and the lowest correlation coefficient, we first need to
    # separate the data by sign and sort each part.
    def separate_data(self, my_csv, key):
        """Split ``my_csv`` by the sign of column ``key`` and rank both parts.

        Returns:
            ``(plot_positive, plot_negative)``: rows whose ``key`` is not
            below zero, sorted from highest to lowest, and rows whose ``key``
            is below zero, sorted from lowest to highest. Both frames get a
            fresh 0-based index; ``my_csv`` itself is left untouched.
        """
        below_zero = my_csv[key] < 0
        ranked_positive = (
            my_csv[~below_zero]
            .sort_values(by=key, ascending=False)
            .reset_index(drop=True)
        )
        ranked_negative = (
            my_csv[below_zero]
            .sort_values(by=key)
            .reset_index(drop=True)
        )
        return (ranked_positive, ranked_negative)
    def plot_graph(self, plot_positive, plot_negative, x, y1, y2, corr = None, key = None, pageviewfiltered = False):
        root = ''
        if pageviewfiltered:
            root = 'pageViewFiltered/'
        if not plot_positive.empty:
            plot2 = plot_positive.loc[0, 'symbol']
            earning_date = self.get_earning_date(plot2)
            data_list = []
            shapes = []
            annotations = []
            if  not earning_date.empty:
                data_list = list(earning_date['eventDate'])
            else:
                shapes = None
                annotations = None   
            for item in data_list:
                temp_shape = {
                        'type': 'line',
                        'xref': 'x',
                        'yref': 'paper',
                        'x0': item,
                        'y0': 0,
                        'x1': item,
                        'y1': 1,
                        'line': {
                            'color': 'rgb(193, 191, 191)',
                            'width': 1.5,
                                }
                    
                    }
                temp_annotations = {
                         'x' : item,
                         'y' : 1,
                        'xref' : 'x',
                        'yref' : 'paper',
                        'text' : 'Earning Date: ' + item,
                        'showarrow' : True,
                        'arrowhead' : 7,
                        'ax' : 0,
                        'ay' : -40
                    
                                    }
                shapes.append(temp_shape)
                annotations.append(temp_annotations)
            #print(shapes) #test1
            mydf = self.myrawdata[plot2].copy()
            mydf['date_time']= pd.to_datetime(mydf['date'])
            #myrawdata[plot2] = myrawdata[plot2].set_index(myrawdata[plot2]['date_time'])
            trace1 = go.Scatter(
                x = mydf[x],
                y = mydf[y1],
                name= y1
            )
            trace2 = go.Scatter(
                x = mydf[x],
                y = mydf[y2],
                name= y2,
                yaxis='y2'
            )
            data = [trace1, trace2]
            layout = go.Layout(
                autosize=True,
                title='Correlation Compare for ' + plot2 + ' R = '+\
                           str(plot_positive.loc[0, corr])[:7],
                xaxis = dict(title = 'Date',
                             autorange=True,
                            showline=True,
                            showticklabels=True),
                yaxis=dict(
                        title= y1
                        ),
                yaxis2=dict(
                        title= y2,
                        titlefont=dict(
                            color='rgb(148, 103, 189)'
                            ),
                        tickfont=dict(
                        color='rgb(148, 103, 189)'
                            ),
                        overlaying='y',
                        side='right'
                        ),
                shapes = shapes,
                annotations = annotations
            )
            fig = go.Figure(data=data, layout=layout)
            path = root + self.start_date + ' to ' + self.end_date + ' ' + key +'/Top Positive'+ '/Correlation Compare for ' + plot2 + ' R = '+ str(plot_positive.loc[0, corr])[:7] + '/'
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            url = plotly.offline.plot(fig, filename= path + plot2 + ' ' + corr +'.html', auto_open=False)
           
        else:
            path = root + self.start_date + ' to ' + self.end_date + ' ' + key + '/Top Positive' + '/'+ 'We did not find positive correlation coefficient!!!' +'/'
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print('We did not find positive correlation coefficient!!!')
            
        if not plot_negative.empty: 
            plot1 = plot_negative.loc[0, 'symbol']
            earning_date = self.get_earning_date(plot1)
            data_list = []
            shapes = []
            annotations = []
            if not earning_date.empty:
                data_list = list(earning_date['eventDate'])
            else:
                shapes = None
                annotations = None 
            for item in data_list:
                temp_shape = {
                        'type': 'line',
                        'xref': 'x',
                        'yref': 'paper',
                        'x0': item,
                        'y0': 0,
                        'x1': item,
                        'y1': 1,
                        'line': {
                            'color': 'rgb(193, 191, 191)',
                            'width': 1.5,
                                }
                    
                            }
                temp_annotations = {
                         'x' : item,
                         'y' : 1,
                        'xref' : 'x',
                        'yref' : 'paper',
                        'text' : 'Earning Date: ' + item,
                        'showarrow' : True,
                        'arrowhead' : 7,
                        'ax' : 0,
                        'ay' : -40
                    
                                    }
                shapes.append(temp_shape)
                annotations.append(temp_annotations)
            mydf = self.myrawdata[plot1].copy()
            mydf['date_time']= pd.to_datetime(mydf['date'])
            trace1 = go.Scatter(
                x = mydf[x],
                y = mydf[y1],
                name= y1
            )
            trace2 = go.Scatter(
                x = mydf[x],
                y = mydf[y2],
                name= y2,
                yaxis='y2'
            )
            data = [trace1, trace2]
            layout = go.Layout(
                title='Correlation Compare for ' + plot1 + ' R = '+\
                           str(plot_negative.loc[0, corr])[:7],
                xaxis = dict(title = 'Date',
                            autorange=True,
                            showline=True,
                            showticklabels=True),
                yaxis=dict(
                        title= y1
                        ),
                yaxis2=dict(
                        title= y2,
                        titlefont=dict(
                            color='rgb(148, 103, 189)'
                            ),
                tickfont=dict(
                        color='rgb(148, 103, 189)'
                            ),
                overlaying='y',
                side='right'
                ),
                shapes = shapes,
                annotations = annotations
            )
            fig = go.Figure(data=data, layout=layout)
            path = root + self.start_date + ' to ' + self.end_date + ' ' + key +'/Lowest Negative'+'/Correlation Compare for ' + plot1 + ' R = '+ str(plot_negative.loc[0, corr])[:7] +'/'
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            url = plotly.offline.plot(fig, filename= path + plot1 + ' ' + corr + '.html', auto_open=False)
        else:
            path = root + self.start_date + ' to ' + self.end_date + ' ' + key +'/Lowest Negative'+'/'+'We did not find negative correlation coefficient!!!'+'/'
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print('We did not find negative correlation coefficient!!!')
    def plot_graph_re_run(self, symbols, x, y1, y2, my_csv, corr = None, key = None, pageviewfiltered = False):
        root = ''
        if pageviewfiltered:
            root = 'pageViewFiltered/'
        count = 0
        for item in symbols:
            mydf = self.myrawdata[item].copy()
            mydf['date_time']= pd.to_datetime(mydf['date'])
            trace1 = go.Scatter(
                x = mydf[x],
                y = mydf[y1],
                name= y1
            )
            trace2 = go.Scatter(
                x = mydf[x],
                y = mydf[y2],
                name= y2,
                yaxis='y2'
            )
            data = [trace1, trace2]
            layout = go.Layout(
                autosize=True,
                title='Correlation Compare for ' + item + ' R = '+\
                           str(my_csv.loc[count, corr])[:7],
                xaxis = dict(title = 'Date',
                             autorange=True,
                            showline=True,
                            showticklabels=True),
                yaxis=dict(
                        title= y1
                        ),
                yaxis2=dict(
                        title= y2,
                        titlefont=dict(
                            color='rgb(148, 103, 189)'
                            ),
                        tickfont=dict(
                        color='rgb(148, 103, 189)'
                            ),
                        overlaying='y',
                        side='right'
                        ),
            )
            fig = go.Figure(data=data, layout=layout)
            path = root + self.start_date + ' to ' + self.end_date + ' ' + key + '/Correlation Compare for ' + item + ' R = '+ str(my_csv.loc[count, corr])[:7] + '/'
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            url = plotly.offline.plot(fig, filename = path + '/' + item + ' ' + corr +'.html', auto_open=False)
            count += 1
    def main(self, symbols, start_date, end_date, re_run = False, pageviewfiltered = False):
        """
        
        symbols = input('Please input symbols, example: AMZN, AAPL...: ')
        temp = symbols.split(',')
        symbols = []
        for item in temp:
            symbols.append(item.strip().upper())
        start_date = input('Please input start date, example:2018-06-06: ')
        end_date = input('Please input end date, example:2018-06-06: ')
        self.start_date = start_date.strip()
        self.end_date = end_date.strip()
        now = datetime.datetime.utcnow()
        if len(start_date) != 10 or len(end_date) != 10:
            print('Your date format is wrong, please try again.')
            return
        check_start = datetime.datetime(int(start_date[:4]), int(start_date[5:7]), int(start_date[8:]))
        check_end = datetime.datetime(int(end_date[:4]), int(end_date[5:7]), int(end_date[8:]))
        if check_start > check_end or check_end > now:
            print('Your date is wrong, please try again.')
            return
        """
        self.start_date = start_date
        self.end_date = end_date
        pv = PAGE_VIEW(self.start_date, self.end_date)
        #getting page_view and history data
        if not pv:
            return
        self.myrawdata = pv.main(symbols)
        for item in symbols:
            self.myrawdata[item]['page_views'] = self.myrawdata[item]['page_views'].apply(int)
            df = self.myrawdata[item]
            delta_close = df['close_price($)'].diff()
            delta_pageview = df['page_views'].diff()
            #delta_volume = df['volume'].diff()
            # I decide to use mean to fill the first Nan value
            delta_close[0] = delta_close.mean()
            delta_pageview[0] = delta_pageview.mean()
            #delta_volume[0] = delta_volume.mean()
            #mean_close = df['close_price($)'].apply(float).mean()
            #close_price_account_for_avg = df['close_price($)'].apply(float)/mean_close
            #mean_volume = df['volume'].apply(float).mean()
            #volume_account_for_avg = df['volume'].apply(float)/mean_volume
            #mean_pageviews = df['page_views'].apply(int).mean()
            #pageviews_account_for_avg = df['page_views'].apply(int)/mean_pageviews
            df['delta_close_price($)'] = abs(delta_close)
            df['delta_pageview'] = delta_pageview
            #df['delta_volume'] = abs(delta_volume)
            #df['close_price_account_for_avg'] = close_price_account_for_avg
            #df['volume_account_for_avg'] = volume_account_for_avg
            #df['pageviews_account_for_avg'] = pageviews_account_for_avg
        if pageviewfiltered:### filter rawdata
            to_remove = []
            for item in symbols:
                mean_pageviews = self.myrawdata[item]['page_views'].apply(int).mean()
                if mean_pageviews < 30:
                    to_remove.append(item)
            for symb in to_remove:
                symbols.remove(symb)
                self.myrawdata.pop(symb, None)
        my_csv = pd.DataFrame([], columns = ['symbol', 'corr_pageview_close', 'corr_pageview_volume', \
                                             'corr_delta_close_pageview', 'corr_delta_pageview_volume', \
                                             'corr_delta_pageview_delta_close', 'mean_pageviews', \
                                             'start_date', 'end_date',\
                                             'frequency', 'price at the end_date', 'mkt_val(billion)'])
        if re_run:# remove first 7 days and last 7 days
            for symb in symbols:
                earning_date = self.get_earning_date(symb)
                earning_date = list(earning_date.eventDate)
                for item in earning_date:
                    date_type = datetime.datetime(int(item[:4]), int(item[5:7]), int(item[8:]))
                    mid = date_type
                    left = mid
                    right = mid
                    self.myrawdata[symb] = self.myrawdata[symb][self.myrawdata[symb].date != str(mid)[:10]]
                    for i in range(7):
                        left -= datetime.timedelta(days=1)
                        right += datetime.timedelta(days=1)
                        self.myrawdata[symb] = self.myrawdata[symb][self.myrawdata[symb].date != str(left)[:10]]
                        self.myrawdata[symb] = self.myrawdata[symb][self.myrawdata[symb].date != str(right)[:10]]
                self.myrawdata[symb] = self.myrawdata[symb].reset_index(drop = True)
                    
        init_notebook_mode(connected=True)
        ### building my_csv
        for item in symbols:
            shares = self.get_shares(item)
            if shares is None:
                return
            shares = float(shares)/1000000
            price = self.myrawdata[item].loc[self.myrawdata[item].shape[0]-1, 'close_price($)']
            price_float = float(price)
            mkt_val = str(shares*price_float)[:6]
            if self.myrawdata[item].empty:
                continue
            mean_pageviews = self.myrawdata[item]['page_views'].apply(int).mean()
            corr_page_close = pearsonr(self.myrawdata[item]['page_views'], self.myrawdata[item]['close_price($)'])
            corr_page_volume = pearsonr(self.myrawdata[item]['page_views'], self.myrawdata[item]['volume'])
            corr_delta_close_pageview = pearsonr(self.myrawdata[item]['delta_close_price($)'], self.myrawdata[item]['page_views'])
            ### 'corr_delta_pageview_delta_close', 'corr_delta_pageview_volume'
            corr_delta_pageview_delta_close = pearsonr(self.myrawdata[item]['delta_pageview'], self.myrawdata[item]['delta_close_price($)'])
            corr_delta_pageview_volume = pearsonr(self.myrawdata[item]['delta_close_price($)'], self.myrawdata[item]['volume'])
            #corr_volume_pageviews_account_for_avg = pearsonr(self.myrawdata[item]['delta_volume'],\
                                                                 #self.myrawdata[item]['pageviews_account_for_avg'])
            #corr_close_price_account_for_avg_pageviews_account_for_avg = pearsonr(self.myrawdata[item]['close_price_account_for_avg'],\
                                                                 #self.myrawdata[item]['pageviews_account_for_avg'])
            #corr_volume_account_for_avg_pageviews_account_for_avg = pearsonr(self.myrawdata[item]['volume_account_for_avg'],\
                                                                 #self.myrawdata[item]['pageviews_account_for_avg'])
            my_csv = my_csv.append({'symbol': item, 'corr_pageview_close' : corr_page_close[0], \
                          'corr_pageview_volume' : corr_page_volume[0], 'corr_delta_close_pageview' : corr_delta_close_pageview[0],\
                                    'corr_delta_pageview_delta_close' : corr_delta_pageview_delta_close[0],\
                                    'corr_delta_pageview_volume' : corr_delta_pageview_volume[0],\
                                    'mean_pageviews' : mean_pageviews, 'start_date' : start_date, \
                          'end_date': end_date, 'frequency' : 'daily', 'price at the end_date':\
                                    price, 'mkt_val(billion)':\
                                   mkt_val}, ignore_index=True)
        if not re_run:
            print('Symbols in four graphs below chosen based on data sorted by corr_pageview_close!!!')
            data_sort_by_corr_pageview_close = self.separate_data(my_csv, key = 'corr_pageview_close')
            self.plot_graph(data_sort_by_corr_pageview_close[0], data_sort_by_corr_pageview_close[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'close_price($)', corr = 'corr_pageview_close', key = 'sorted by corr_pageview_close', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_pageview_close[0], data_sort_by_corr_pageview_close[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'volume', corr = 'corr_pageview_volume', key = 'sorted by corr_pageview_close', pageviewfiltered = pageviewfiltered)
            #print(data_sort_by_corr_pageview_close[0])# test 1
            if data_sort_by_corr_pageview_close[0].empty:
                sorted_by_corr_pageview_close_top_postive = None
            else:
                sorted_by_corr_pageview_close_top_postive = data_sort_by_corr_pageview_close[0].loc[0, 'symbol']
            if data_sort_by_corr_pageview_close[1].empty:
                sorted_by_corr_pageview_close_negative = None
            else:
                sorted_by_corr_pageview_close_negative = data_sort_by_corr_pageview_close[1].loc[0, 'symbol']
            print('Symbols in four graphs below chosen based on data sorted by corr_pageview_volume!!!')
            data_sort_by_corr_pageview_volume = self.separate_data(my_csv, key = 'corr_pageview_volume')
            self.plot_graph(data_sort_by_corr_pageview_volume[0], data_sort_by_corr_pageview_volume[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'close_price($)', corr = 'corr_pageview_close', key = 'sorted by corr_pageview_volume', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_pageview_volume[0], data_sort_by_corr_pageview_volume[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'volume', corr = 'corr_pageview_volume', key = 'sorted by corr_pageview_volume', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_pageview_volume[0], data_sort_by_corr_pageview_volume[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'delta_close_price($)', corr = 'corr_delta_close_pageview', key = 'sorted by corr_pageview_volume', pageviewfiltered = pageviewfiltered)
            if data_sort_by_corr_pageview_volume[0].empty:
                sorted_by_corr_pageview_volume_top_postive = None
            else:
                sorted_by_corr_pageview_volume_top_postive = data_sort_by_corr_pageview_volume[0].loc[0, 'symbol']
            if data_sort_by_corr_pageview_volume[1].empty:
                sorted_by_corr_pageview_volume_negative = None
            else:
                sorted_by_corr_pageview_volume_negative = data_sort_by_corr_pageview_volume[1].loc[0, 'symbol']
            print('Symbols in four graphs below chosen based on data sorted by corr_delta_close_pageview!!!')
            data_sort_by_corr_delta_close_pageview = self.separate_data(my_csv, key = 'corr_delta_close_pageview')
            self.plot_graph(data_sort_by_corr_delta_close_pageview[0], data_sort_by_corr_delta_close_pageview[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'delta_close_price($)', corr = 'corr_delta_close_pageview', key = 'sorted by corr_delta_close_pageview', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_delta_close_pageview[0], data_sort_by_corr_delta_close_pageview[1], x = 'date_time',\
                            y1 = 'page_views', y2 = 'volume', corr = 'corr_pageview_volume', \
                            key = 'sorted by corr_delta_close_pageview', pageviewfiltered = pageviewfiltered)
            if data_sort_by_corr_delta_close_pageview[0].empty:
                sorted_by_corr_delta_close_pageview_top_postive = None
            else:
                sorted_by_corr_delta_close_pageview_top_postive = data_sort_by_corr_delta_close_pageview[0].loc[0, 'symbol']
            if data_sort_by_corr_delta_close_pageview[1].empty:
                sorted_by_corr_delta_close_pageview_negative = None
            else:
                sorted_by_corr_delta_close_pageview_negative = data_sort_by_corr_delta_close_pageview[1].loc[0, 'symbol']
            print('Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_delta_close!!!')
            data_sort_by_corr_delta_pageview_delta_close = self.separate_data(my_csv, key = 'corr_delta_pageview_delta_close')
            self.plot_graph(data_sort_by_corr_delta_pageview_delta_close[0], data_sort_by_corr_delta_pageview_delta_close[1], x = 'date_time',\
                            y1 = 'delta_pageview', y2 = 'delta_close_price($)', corr = 'corr_delta_pageview_delta_close', key = 'sorted by corr_delta_pageview_delta_close', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_delta_pageview_delta_close[0], data_sort_by_corr_delta_pageview_delta_close[1], x = 'date_time',\
                            y1 = 'delta_pageview', y2 = 'volume', corr = 'corr_delta_pageview_volume', \
                            key = 'sorted by corr_delta_pageview_delta_close', pageviewfiltered = pageviewfiltered)
            if data_sort_by_corr_delta_pageview_delta_close[0].empty:
                sorted_by_corr_delta_pageview_delta_close_top_postive = None
            else:
                sorted_by_corr_delta_pageview_delta_close_top_postive = data_sort_by_corr_delta_pageview_delta_close[0].loc[0, 'symbol']
            if data_sort_by_corr_delta_pageview_delta_close[1].empty:
                sorted_by_corr_delta_pageview_delta_close_negative = None
            else:
                sorted_by_corr_delta_pageview_delta_close_negative = data_sort_by_corr_delta_pageview_delta_close[1].loc[0, 'symbol']
            print('Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_volume!!!')
            data_sort_by_corr_delta_pageview_volume = self.separate_data(my_csv, key = 'corr_delta_pageview_volume')
            self.plot_graph(data_sort_by_corr_delta_pageview_volume[0], data_sort_by_corr_delta_pageview_volume[1], x = 'date_time',\
                            y1 = 'delta_pageview', y2 = 'delta_close_price($)', corr = 'corr_delta_pageview_delta_close', key = 'sorted by corr_delta_pageview_volume', pageviewfiltered = pageviewfiltered)
            self.plot_graph(data_sort_by_corr_delta_pageview_delta_close[0], data_sort_by_corr_delta_pageview_delta_close[1], x = 'date_time',\
                            y1 = 'delta_pageview', y2 = 'volume', corr = 'corr_delta_pageview_volume', \
                            key = 'sorted by corr_delta_pageview_volume', pageviewfiltered = pageviewfiltered)
            if data_sort_by_corr_delta_pageview_volume[0].empty:
                sorted_by_corr_delta_pageview_volume_top_postive = None
            else:
                sorted_by_corr_delta_pageview_volume_top_postive = data_sort_by_corr_delta_pageview_volume[0].loc[0, 'symbol']
            if data_sort_by_corr_delta_pageview_volume[1].empty:
                sorted_by_corr_delta_pageview_volume_negative = None
            else:
                sorted_by_corr_delta_pageview_volume_negative = data_sort_by_corr_delta_pageview_volume[1].loc[0, 'symbol']
            root = ''
            if pageviewfiltered:
                if not os.path.exists(os.path.dirname('pageViewFiltered/')):
                    os.makedirs(os.path.dirname('pageViewFiltered/'))
                root = 'pageViewFiltered/new_'
            path = root + 'results_total.csv'
            my_csv.to_csv(path)
            return (my_csv, sorted_by_corr_pageview_close_top_postive, sorted_by_corr_pageview_close_negative,\
                    sorted_by_corr_pageview_volume_top_postive, sorted_by_corr_pageview_volume_negative,\
                   sorted_by_corr_delta_close_pageview_top_postive, sorted_by_corr_delta_close_pageview_negative,\
                   sorted_by_corr_delta_pageview_delta_close_top_postive, sorted_by_corr_delta_pageview_delta_close_negative,\
                   sorted_by_corr_delta_pageview_volume_top_postive, sorted_by_corr_delta_pageview_volume_negative)
        else:
            self.plot_graph_re_run(symbols, my_csv = my_csv, x = 'date_time', y1 = 'page_views', y2 = 'close_price($)', corr = 'corr_pageview_close', key = 're_run', pageviewfiltered = pageviewfiltered)
            self.plot_graph_re_run(symbols, my_csv = my_csv, x = 'date_time', y1 = 'page_views', y2 = 'volume', corr = 'corr_pageview_volume', key = 're_run', pageviewfiltered = pageviewfiltered)
            self.plot_graph_re_run(symbols, my_csv = my_csv, x = 'date_time', y1 = 'page_views', y2 = 'delta_close_price($)', corr = 'corr_delta_close_pageview', key = 're_run', pageviewfiltered = pageviewfiltered)
            self.plot_graph_re_run(symbols, my_csv = my_csv, x = 'date_time', y1 = 'delta_pageview', y2 = 'delta_close_price($)', corr = 'corr_delta_pageview_delta_close', key = 're_run', pageviewfiltered = pageviewfiltered)
            self.plot_graph_re_run(symbols, my_csv = my_csv, x = 'date_time', y1 = 'delta_pageview', y2 = 'volume', corr = 'corr_delta_pageview_volume', key = 're_run', pageviewfiltered = pageviewfiltered)
            root = ''
            if pageviewfiltered:
                if not os.path.exists(os.path.dirname('pageViewFiltered/')):
                    os.makedirs(os.path.dirname('pageViewFiltered/'))
                root = 'pageViewFiltered/new_'
            path = root + 'results_re_run.csv'
            my_csv.to_csv(path)  
            
            
            

In [5]:
# Download the constituents of the ETF with symbol SPY from the Barchart
# OnDemand getETFConstituents endpoint and parse the CSV response.
# NOTE(review): the API key is embedded in the URL — consider loading it from
# the environment if this is not a public demo key.
SP500_CONSTITUENTS_URL = (
    'https://ondemand.websol.barchart.com/getETFConstituents.csv'
    '?apikey=ondemand&symbol=SPY'
)
sp500 = pd.read_csv(SP500_CONSTITUENTS_URL)

In [6]:
# Build the ticker list from the constituents table, keeping only entries that
# are real strings (non-string cells — presumably missing values — are dropped).
symbols = [ticker for ticker in list(sp500.symbol) if isinstance(ticker, str)]

In [9]:
# Full run over every symbol for the analysis window (dates are passed as
# 'YYYY-MM-DD' strings). main() prints which correlation column each group of
# graphs was sorted by and returns a tuple whose first element is the results
# frame, followed by the selected top-positive / negative symbols (or None).
start, end = '2017-08-09', '2018-08-10'
test = Barchart()
results_total = test.main(symbols, start, end)

Symbols in four graphs below chosen based on data sorted by corr_pageview_close!!!
Symbols in four graphs below chosen based on data sorted by corr_pageview_volume!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_close_pageview!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_delta_close!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_volume!!!
We did not find negative correlation coefficient!!!


In [10]:
# Summary statistics of the corr_delta_pageview_delta_close column of the
# results frame (results_total[0] is the first element returned by main()).
results_total[0]['corr_delta_pageview_delta_close'].describe()

count    503.000000
mean       0.242265
std        0.149409
min       -0.103476
25%        0.130347
50%        0.228659
75%        0.348418
max        0.719004
Name: corr_delta_pageview_delta_close, dtype: float64

In [11]:
# Summary statistics of the corr_delta_pageview_volume column of the
# results frame (results_total[0] is the first element returned by main()).
results_total[0]['corr_delta_pageview_volume'].describe()

count    503.000000
mean       0.484101
std        0.151704
min        0.000245
25%        0.383173
50%        0.494681
75%        0.591100
max        0.834469
Name: corr_delta_pageview_volume, dtype: float64

In [27]:
# Collect the distinct symbols that main() returned after the results frame.
# results_total[0] is the results frame; every later entry is either a symbol
# or None (None when the corresponding positive/negative group was empty), so
# falsy entries are skipped.
# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# list(set(...)) produced an order that depends on string hash randomization,
# so the re-run (and its plots) could be ordered differently after every
# kernel restart.
re_run_symbol = list(dict.fromkeys(
    symbol for symbol in results_total[1:] if symbol
))

In [28]:
# Re-run the analysis for only the selected symbols, reusing the start/end
# window defined in an earlier cell.
# With re_run = True, main() appears to take the branch that plots each symbol
# via plot_graph_re_run and writes results_re_run.csv; that branch has no
# visible return statement, so re_run_res is presumably None — TODO confirm.
re_run_test = Barchart()
re_run_res = re_run_test.main(re_run_symbol, start, end, re_run = True)

In [17]:
### page_view filtered
# Same symbols and date window as the unfiltered run, but with
# pageviewfiltered = True: main() forwards the flag to its plotting helpers
# and writes its CSV under pageViewFiltered/ (new_results_total.csv).
start, end = '2017-08-09', '2018-08-10'
new_test = Barchart()
new_results_total = new_test.main(symbols, start, end, pageviewfiltered = True)

Symbols in four graphs below chosen based on data sorted by corr_pageview_close!!!
Symbols in four graphs below chosen based on data sorted by corr_pageview_volume!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_close_pageview!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_delta_close!!!
Symbols in four graphs below chosen based on data sorted by corr_delta_pageview_volume!!!
We did not find negative correlation coefficient!!!


In [18]:
# Summary statistics of corr_delta_pageview_delta_close for the
# pageviewfiltered run (new_results_total[0] is the results frame from main()).
new_results_total[0]['corr_delta_pageview_delta_close'].describe()

count    179.000000
mean       0.305836
std        0.132981
min       -0.098406
25%        0.214360
50%        0.307800
75%        0.392746
max        0.698904
Name: corr_delta_pageview_delta_close, dtype: float64

In [19]:
# Summary statistics of corr_delta_pageview_volume for the
# pageviewfiltered run (new_results_total[0] is the results frame from main()).
new_results_total[0]['corr_delta_pageview_volume'].describe()

count    179.000000
mean       0.565890
std        0.122111
min        0.173567
25%        0.496752
50%        0.582894
75%        0.644516
max        0.834469
Name: corr_delta_pageview_volume, dtype: float64

In [20]:
# Collect the distinct symbols returned by the pageviewfiltered run.
# new_results_total[0] is the results frame; every later entry is either a
# symbol or None (None when the corresponding positive/negative group was
# empty), so falsy entries are skipped.
# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# list(set(...)) produced an order that depends on string hash randomization,
# so the re-run (and its plots) could be ordered differently after every
# kernel restart.
new_re_run_symbol = list(dict.fromkeys(
    symbol for symbol in new_results_total[1:] if symbol
))

In [24]:
# Display the de-duplicated symbols selected for the filtered re-run.
new_re_run_symbol

['EFX', 'GIS', 'SYMC', 'FDX', 'ABMD', 'ICE']

In [23]:
# Re-run the analysis for only the selected symbols with pageviewfiltered = True,
# reusing the start/end window defined in an earlier cell.
# With re_run = True, main() appears to take the branch that plots each symbol
# via plot_graph_re_run and writes pageViewFiltered/new_results_re_run.csv;
# that branch has no visible return statement, so new_re_run_res is presumably
# None — TODO confirm.
new_re_run_test = Barchart()
new_re_run_res = new_re_run_test.main(new_re_run_symbol, start, end, re_run = True, pageviewfiltered = True)