### Formatting the files

In [None]:
import pandas as pd
import json
import os

def readRawData(path: str):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    Parsing is delegated to ``pd.read_csv`` with its default options, and
    the resulting frame is returned unchanged.
    """
    return pd.read_csv(path)

def cleanRawData(df):
    """Drop unused columns and rows that carry no sentiment or no symbols.

    A row is discarded when its 'sentiment' or 'symbols' value is missing,
    an empty string, or the literal text '[]'.

    Args:
        df: Raw frame. It must contain the columns 'message_body',
            'parent_message_id', 'in_reply_to_message_id', 'prices',
            'sentiment' and 'symbols'; a KeyError is raised otherwise.

    Returns:
        A new frame with a fresh 0..n-1 index. If any null value is still
        present in any remaining column after filtering, an empty
        DataFrame is returned instead. The input frame is not modified.
    """
    unused_columns = [
        'message_body',
        'parent_message_id',
        'in_reply_to_message_id',
        'prices',
    ]
    trimmed = df.drop(columns=unused_columns)

    # Values that mean "nothing was recorded" besides a real null.
    placeholders = ['', '[]']
    sentiment = trimmed['sentiment']
    symbols = trimmed['symbols']
    keep = (
        sentiment.notna()
        & ~sentiment.isin(placeholders)
        & symbols.notna()
        & ~symbols.isin(placeholders)
    )

    # Filter by position with a boolean mask instead of dropping by index
    # label: when labels repeat, dropping a label would also remove every
    # valid row that happens to share it.
    cleaned = trimmed.loc[keep.to_numpy()].reset_index(drop=True)

    # Any remaining null invalidates the whole file.
    if cleaned.isnull().values.any():
        return pd.DataFrame()
    return cleaned
  
def clearSymbolsColumn(df):
    """Replace the JSON text in 'symbols' with a list of symbol strings.

    Each cell of 'symbols' is decoded with ``json.loads`` and is expected
    to yield a sequence of objects that each have a 'symbol' key. Every
    occurrence of '.X' inside a symbol is rewritten to '-USD'.

    Returns:
        A copy of ``df`` whose 'symbols' column holds one list of strings
        per row, or an empty DataFrame when ``df`` has no 'symbols'
        column. The input frame is left untouched.
    """
    if 'symbols' not in df.columns:
        return pd.DataFrame()

    tickers_per_row = [
        [
            entry['symbol'].replace('.X', '-USD')
            if '.X' in entry['symbol']
            else entry['symbol']
            for entry in json.loads(raw_json)
        ]
        for raw_json in df['symbols']
    ]

    result = df.copy()
    result['symbols'] = tickers_per_row
    return result

def refactorSentimentColumn(df):
    """Encode the 'sentiment' column as +1 / -1.

    A value containing the substring 'Bullish' becomes 1; otherwise a
    value containing 'Bearish' becomes -1. 'Bullish' is tested first, so
    it wins when both substrings are present.

    Returns:
        A copy of ``df`` with the encoded 'sentiment' column. An empty
        DataFrame is returned when the column is absent, or (after
        printing an error message) as soon as a value matches neither
        substring. The input frame is left untouched.
    """
    if 'sentiment' not in df.columns:
        return pd.DataFrame()

    scores = []
    for raw_value in df['sentiment']:
        score = 1 if 'Bullish' in raw_value else (-1 if 'Bearish' in raw_value else None)
        if score is None:
            # One unrecognised value rejects the whole frame.
            print("Error: sentiment column contains unknown value")
            return pd.DataFrame()
        scores.append(score)

    result = df.copy()
    result['sentiment'] = scores
    return result

def refactorDateColumn(df):
    """Replace 'created_at' with a weekday-only calendar 'date' column.

    'created_at' is parsed with the format '%Y-%m-%dT%H:%M:%SZ'. Dates that
    fall on a Saturday or Sunday are moved back to the preceding Friday,
    and the time of day is discarded.

    Args:
        df: Frame holding a 'created_at' column of timestamp strings.

    Returns:
        A new frame without 'created_at' and with a 'date' column of
        ``datetime.date`` objects, or an empty DataFrame when 'created_at'
        is absent. The input frame is not modified.
    """
    if 'created_at' not in df.columns:
        return pd.DataFrame()

    stamps = pd.to_datetime(df['created_at'], format='%Y-%m-%dT%H:%M:%SZ')

    # weekday(): Monday=0 ... Friday=4, Saturday=5, Sunday=6. Subtracting 4
    # and clipping at zero gives 0 for Mon-Fri, 1 for Saturday and 2 for
    # Sunday, i.e. the number of days to step back to reach Friday.
    days_past_friday = (stamps.dt.weekday - 4).clip(lower=0)
    shifted = stamps - pd.to_timedelta(days_past_friday, unit='D')

    # Build the result on a new frame so the caller's frame never gains a
    # 'date' column as a side effect.
    finalDF = df.drop(columns=['created_at'])
    # Assign a plain list so values are placed by position, not by label.
    finalDF['date'] = shifted.dt.date.tolist()
    return finalDF

In [None]:
from IPython.display import clear_output

# Run every raw CSV through the formatting pipeline and write the result,
# under the same file name, into the destination folder.
original_folder_path = "../Data_Recent/csv/"
destination_folder_path = "../Data_Formatted/csv/"
files = os.listdir(original_folder_path)
files_count = len(files)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(destination_folder_path, exist_ok=True)

files_processed = set()

created_count = 0
processed_count = 0
for file in files:
    files_processed.add(file)
    processed_count = len(files_processed)
    try:
        # Every stage signals "skip this file" by returning an empty frame.
        raw_dataframe = readRawData(os.path.join(original_folder_path, file))
        if raw_dataframe.empty:
            continue
        clean_dataframe = cleanRawData(raw_dataframe)
        if clean_dataframe.empty:
            continue
        clear_dataframe = clearSymbolsColumn(clean_dataframe)
        if clear_dataframe.empty:
            continue
        ready_dataframe = refactorSentimentColumn(clear_dataframe)
        if ready_dataframe.empty:
            continue
        mainDF = refactorDateColumn(ready_dataframe)
        if mainDF.empty:
            continue
        mainDF = mainDF.set_index('message_id').sort_index()
        mainDF.to_csv(os.path.join(destination_folder_path, file), index=True)
        created_count += 1
    finally:
        # Reported from `finally` so that a file skipped through `continue`
        # still triggers the progress line every 1000 files.
        if processed_count % 1000 == 0:
            clear_output(wait=False)
            print(f"Processed: {processed_count}/{files_count} Created: {created_count}")

In [None]:
# Preview one formatted file: whichever entry os.listdir returns first
# (the listing order is arbitrary, so this is not necessarily the
# alphabetically first file).
formatted_folder_path = '../Data_Formatted/csv/'
first_formatted_file = os.listdir(formatted_folder_path)[0]
raw_dataframe = readRawData(formatted_folder_path + first_formatted_file)
raw_dataframe