### Processing the files

In [None]:
import os
import utils
import pandas as pd
import DB.db as db
import prices
from IPython.display import clear_output

# Cut-off date (1 January 2019); processBatch compares message dates against it.
lastFiveYears = pd.Timestamp(year=2019, month=1, day=1).date()
# Names of the CSV files that must not be processed (again).
files_processed = set()

In [None]:
# List the batch files of the data folder in sorted order.
folder_path = "../../Data_Formatted/csv/"
files = sorted(os.listdir(folder_path))
files_count = len(files)

# The first column of every row returned by db.getFilesTable() is turned into a
# "<code>-1000.csv" file name and marked as processed.
files_processed_codes = [row[0] for row in db.getFilesTable()]
files_processed.update(str(code) + '-1000.csv' for code in files_processed_codes)
# Every file from position 350000 onwards is marked as processed as well.
files_processed.update(files[350000:])

In [None]:
# Tickers already registered by updateSets: one set for symbols without 'USD',
# one for symbols containing 'USD'.
stocks_set, crypts_set = set(), set()

def updateSets(df):
  """Register the tickers of df['symbols'] that have not been seen before.

  Symbols containing '.' are ignored. Symbols containing 'USD' are tracked in
  crypts_set, all other symbols in stocks_set. The list of newly seen tickers
  (possibly empty) is passed to db.createStockPriceTables and then to
  db.insertToStockPriceTables.
  """
  fresh_tickers = []
  for row_symbols in df['symbols']:
    for ticker in row_symbols:
      if '.' in ticker:
        continue
      # Pick the set this ticker belongs to, then register it only once.
      seen = crypts_set if 'USD' in ticker else stocks_set
      if ticker in seen:
        continue
      seen.add(ticker)
      fresh_tickers.append(ticker)

  db.createStockPriceTables(fresh_tickers)
  db.insertToStockPriceTables(fresh_tickers)

def getAveragePriceForDate(ticker: str, date):
  """Return the price stored for `ticker` on `date`, or -1 when there is none.

  The value is the second column of the first row returned by
  db.getPriceForDate. -1 is the sentinel that callers in this file test for.
  """
  response = db.getPriceForDate(ticker, date)
  # Identity test: `== None` would go through the result object's __eq__.
  if response is None or len(response) == 0:
    return -1
  return response[0][1]

def getAveragePriceForWeek(ticker, initial_date, weekNumber):
  """Return the price of `ticker` (weekNumber + 1) * 7 days after `initial_date`.

  The lookup is delegated to getAveragePriceForDate, so -1 is returned when no
  price is stored for the computed date.
  """
  days_ahead = 7 * (weekNumber + 1)
  shifted = initial_date + pd.DateOffset(days = days_ahead)
  return getAveragePriceForDate(ticker, shifted.date())

def user_exists(user_id: str):
  """Return True when db.selectAccuracy finds at least one "accuracy_table" row for `user_id`.

  `user_id` is converted with str() before the lookup.
  """
  response = db.selectAccuracy(str(user_id), "accuracy_table")
  # Identity test instead of `!= None`, which would go through __ne__/__eq__.
  return response is not None and len(response) > 0

def processBatch(df):
  """Fold every row of `df` into the per-user accuracy, alpha and count tables.

  Each row is read positionally as
  (message_id, user_id, sentiment, symbols, date). For every symbol of a row
  the price on the message date is compared with the prices of the following
  weeks, and the user's running averages are updated. The three tables are
  written back with db.insert only when every examined symbol of the row had a
  usable (neither missing nor zero) initial price.
  """
  def processSingleRow(rowNumber):
    message_id, user_id, sentiment, symbs, initDate = df.iloc[rowNumber].tolist()[:5]
    user_key = str(user_id)

    # Index 0 of every table is the user id; indexes 1..50 hold the weekly values.
    # Rows loaded from the DB are assumed to have the same layout - TODO confirm.
    if user_exists(user_id):
      accuracy_table = list(db.selectAccuracy(user_key, 'accuracy_table')[0])
      alpha_table = list(db.selectAccuracy(user_key, 'alpha_table')[0])
      counts_table = list(db.selectAccuracy(user_key, 'counts_table')[0])
    else:
      accuracy_table = [user_key] + [0.0] * 50
      alpha_table = [user_key] + [0.0] * 50
      counts_table = [user_key] + [0] * 50

    process = True
    for symb in symbs:
      if '.' in symb:
        continue
      initialPrice = float(getAveragePriceForDate(symb, initDate))
      if initialPrice == -1 or initialPrice == 0:
        # No usable reference price: nothing of this row is written back.
        process = False
        break
      if (symb in prices.topcryptos or symb in prices.topstocks) and initDate > lastFiveYears:
        # File the message under the ticker that actually passed the check above,
        # not under the first symbol of the message.
        db.insertIntoStockTable(symb, str(message_id), str(user_id), str(sentiment), str(initDate))

      previousMissingWeek = None
      for weekNumber in range(1, 51):
        averageForWeek = getAveragePriceForWeek(symb, initDate, weekNumber)
        if averageForWeek == -1:
          # Two consecutive weeks without a price end the scan for this symbol.
          if previousMissingWeek is not None and weekNumber - previousMissingWeek == 1:
            break
          previousMissingWeek = weekNumber
          continue
        # Coerce to float, as is done for initialPrice, so both operands of the
        # subtraction below have the same type.
        averageForWeek = float(averageForWeek)

        writtenCount = counts_table[weekNumber]
        currentAlphaPoint = float(sentiment) * (averageForWeek - initialPrice) / initialPrice
        currentAccuracyPoint = 1 if currentAlphaPoint > 0 else -1

        # Running means: previous mean * previous count + new point, over the new count.
        summationForAccuracy = (writtenCount * accuracy_table[weekNumber]) + currentAccuracyPoint
        summationForAlpha = (writtenCount * alpha_table[weekNumber]) + currentAlphaPoint

        counts_table[weekNumber] = writtenCount + 1
        accuracy_table[weekNumber] = summationForAccuracy / counts_table[weekNumber]
        alpha_table[weekNumber] = summationForAlpha / counts_table[weekNumber]

    if process:
      # One db.insert call per table; the other two positions are passed as None.
      db.insert(accuracy_table, None, None)
      db.insert(None, alpha_table, None)
      db.insert(None, None, counts_table)

  for rowNumber in range(len(df)):
    processSingleRow(rowNumber)



In [None]:
# Scratch cell: load the first batch file and unpack its first row into named values.
folder_path = "../../Data_Formatted/csv/"
files = sorted(os.listdir(folder_path))
files_count = len(files)

df = utils.readData(folder_path + files[0])

# The first five positions of a row, in this order.
currentRow = df.iloc[0].tolist()
message_id, user_id, sentiment, symbs, initDate = currentRow[:5]


In [None]:
import concurrent.futures

def worker(index):
  """Process the batch file at position `index` of `files`.

  Nothing happens when the file name is already in files_processed. Otherwise
  the name is added to that set, the file is read with utils.readData and,
  unless the resulting frame is empty, handed to updateSets and processBatch.
  A progress line replaces the current cell output afterwards.
  """
  file_name = files[index]
  if file_name in files_processed:
    return
  files_processed.add(file_name)

  batch = utils.readData(folder_path + file_name)
  if batch.empty:
    return

  updateSets(batch)
  processBatch(batch)
  clear_output(wait=False)
  print("Processed file: " + str(index))

# Fan the files out to a thread pool. The callable and its argument are passed to
# submit() separately: `pool.submit(worker(index))` would run the worker in the
# calling thread and submit its return value (None) as the task.
# NOTE(review): worker() calls updateSets(), which mutates module-level sets without
# locking - confirm this is acceptable once the tasks really run in parallel.
highest_index = min(350000, len(files) - 1)  # never index past the end of `files`
futures = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=1000) as pool:
  # Walk from the highest index down to and including index 0; files that are
  # already in files_processed are skipped by worker() itself.
  for index in range(highest_index, -1, -1):
    futures[pool.submit(worker, index)] = index
# Leaving the `with` block waits for every task, like pool.shutdown(wait=True).

# An exception raised inside a task is stored on its future; report it instead of
# dropping it silently.
for future, failed_index in futures.items():
  error = future.exception()
  if error is not None:
    print("Worker failed for file index " + str(failed_index) + ": " + repr(error))

print("Main thread continuing to run")

In [None]:
import DB.db as db
import prices
import pandas as pd

In [None]:
# List copies of the ticker collections exposed by the prices module.
cryptos500 = [*prices.topcryptos]
stocks500 = [*prices.topstocks]

In [None]:
# Spot check: print what the DB returns for the first stock ticker on 28 January 2022.
# The date string is day-first, so the format is spelled out instead of being inferred.
print(db.getPriceForDate(stocks500[0], pd.to_datetime("28.01.2022", format="%d.%m.%Y")))

In [None]:
# Combine the crypto and stock ticker collections of the prices module with `|`
# (presumably a set union - TODO confirm the container type) and print the size.
toptickers = prices.topcryptos | prices.topstocks
print(len(toptickers))

In [None]:
# For every top ticker call db.dropStockPriceTable, db.createStockPriceTables and
# db.insertToStockPriceTables in that order, printing a running count after each one.
progressCounter = 0
for progressCounter, ticker in enumerate(toptickers, start=1):
  db.dropStockPriceTable(ticker)
  db.createStockPriceTables([ticker])
  db.insertToStockPriceTables([ticker])
  print(progressCounter)