In [None]:
# #############################################################
# DATA SCIENCE CHALLENGE
# 2021
# Lorena Gril, David Rackl
# #############################################################

# =================
# Load libraries 
# =================

install.packages('tidyverse')
install.packages('dplyr')

library('ggplot2')
library('dplyr')

# =================
# Load data 
# =================

# _ Read both CSV exports (paths are relative to the working directory)
tweets_sentiment <- read.csv("tweet_sentiment.csv")
stockerbot <- read.csv("stockerbot-export1.csv")

# __ Put the columns of both tables side by side; rows are paired by position
tweets <- cbind(tweets_sentiment, stockerbot)

# __ Print the first and the last rows of the combined table
head(tweets)
tail(tweets)

# __ Keep only the columns that the rest of the script works with
data <- select(tweets, timestamp, sentiment, company_names, symbols, source)


# =========================================================
# Preprocessing the time stamps of the data 
# =========================================================

# Print the current time in the layout the tweet timestamps are parsed with
format(Sys.time(), "%a %b %d %T %z %Y")

# Remember the current LC_TIME locale in `lct` and switch to "C" so that the
# abbreviated weekday/month names (%a, %b) are matched in English.
# NOTE(review): the locale is not switched back in this script; it can be
# restored with Sys.setlocale("LC_TIME", lct) once no later step depends on it.
lct <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "C")

# _ Parse the raw timestamp strings; `new_time` is a POSIXlt vector that later
#   sections of the script use directly
new_time <- strptime(data$timestamp,
                     "%a %b %d %H:%M:%S %z %Y")
head(new_time)

# __ Add the parsed time stamps and remove the raw ones.
#    The column is stored as POSIXct: POSIXlt is a list-based class and is not
#    a suitable column type for a data frame.
data <- data %>%
  mutate(time = as.POSIXct(new_time)) %>%
  select(-timestamp)

# __ Check time range: row positions of the latest and earliest parsed time
time_max <- which.max(as.POSIXct(new_time))
time_min <- which.min(as.POSIXct(new_time))

# _ Round the time to full hours and keep it as "YYYY-mm-dd HH:MM:SS" text

new_time_rounded <- format(round(new_time, units="hours"), format="%Y-%m-%d %H:%M:%S")
head(new_time_rounded)

# __ Add the hourly time stamps to the data

data <- data %>%
  mutate(time_rounded = new_time_rounded)

# _ Delete data without time stamp

sum(is.na(data$time_rounded))
which(is.na(data$time_rounded))

# filter() keeps all rows when nothing is missing. The former
# slice(-which(...)) received an empty index in that case and returned an
# empty table.
data <- data %>%
  filter(!is.na(time_rounded))

if (nrow(data) == 0) {
  stop("No row with a parsable time stamp is left.", call. = FALSE)
}

# Row positions of the latest and earliest hourly time stamp
time_max <- which.max(as.POSIXct(data$time_rounded))
time_min <- which.min(as.POSIXct(data$time_rounded))

# _ Hourly time range
data$time_rounded[time_min]
data$time_rounded[time_max]

# Vector of all hourly times in the range
unique_time_stamps <- seq(as.POSIXct(data$time_rounded[time_min]),
                          as.POSIXct(data$time_rounded[time_max]),
                          by = "hour")
head(unique_time_stamps)

# Append a numeric column `id` that is 0 in every row
data <- mutate(data, id = 0)


# Id according to the dates 
# This can be done but is at the moment not needed
# Very slow 

# for(i in 1:dim(data)[1]){
#   for(j in 1:length(unique_time_stamps)){
#     if(data$time_rounded[i] == unique_time_stamps[j]){
#       data$id[i] <- j
#     }
#   }
# }

# ======================================================
# Data selection
# ======================================================

# Select the symbols whose tweets span more than 6 days

# _ Group by company symbol and keep the earliest and latest hourly time stamp
min_max_entry_each <-
  data %>%
  group_by(symbols) %>%
  summarise("min_time" = time_rounded[which.min(as.POSIXct(time_rounded))],
            "max_time" = time_rounded[which.max(as.POSIXct(time_rounded))])

# _ Time span per symbol, always measured in days.
#   Without explicit units difftime() picks secs/mins/hours/days depending on
#   the magnitude, so e.g. a span of 7 hours would also compare as "> 6".
span_in_days <- as.numeric(difftime(as.POSIXct(min_max_entry_each$max_time),
                                    as.POSIXct(min_max_entry_each$min_time),
                                    units = "days"))

# _ Symbols with a span of more than 6 days (character vector, possibly empty)
more_than_7_days_info <-
  as.character(min_max_entry_each$symbols[abs(span_in_days) > 6])

# Empty result table; the column names match the rows appended below
# (the symbol column is called `symb`)
sentiments_list_more_than_7_days <- data.frame(
  time_rounded = character(),
  sent_round = numeric(),
  symb = character()
)

# Symbols that pass the selection criterion, in the order they are processed
symb_with_const <- c()

# Selection criterion: the absolute rounded hourly sentiment means of a symbol
# must sum to more than 20

for (current_symbol in more_than_7_days_info) {

  tmp_data <- data %>% filter(symbols == current_symbol)

  # Rounded mean sentiment per hour, tagged with the symbol
  mean_sentiments_tmp <- tmp_data %>%
    group_by(time_rounded) %>%
    summarise("sent_round" = round(mean(sentiment))) %>%
    mutate("symb" = current_symbol)

  if (sum(abs(mean_sentiments_tmp$sent_round)) > 20) {
    # Only a few symbols are appended, so growing the table here stays cheap
    sentiments_list_more_than_7_days <- rbind(sentiments_list_more_than_7_days,
                                              mean_sentiments_tmp)
    symb_with_const <- c(symb_with_const, current_symbol)
  }
}
# ==========================
# Consider the data per hour 
# ==========================

# Hourly plots for the selected data

# Company names of the selected symbols
stock_names_selected <- c()

# Data frame that is filled iteratively; column names match the appended rows
sentiment_hourly <- data.frame(
  time_rounded = character(),
  sent_round = numeric(),
  symb = character(),
  id = numeric()
)

# Iterate over the selected companies (criteria: > 6 days, sentiment sum > 20)
for (sym in symb_with_const) {

  # __ Temporary data frame with the hourly means of this symbol
  tmp_data <- sentiments_list_more_than_7_days %>%
    filter(symb == sym) %>%
    mutate("id" = 0)

  # For plotting, id is the position of each hour within unique_time_stamps.
  # match() replaces the element-wise scan of the hourly sequence; hours that
  # are not part of the sequence keep id 0.
  hour_position <- match(as.numeric(as.POSIXct(tmp_data$time_rounded)),
                         as.numeric(unique_time_stamps))
  tmp_data$id <- ifelse(is.na(hour_position), 0, hour_position)

  sentiment_hourly <- rbind(sentiment_hourly, tmp_data)

  # Company name: taken from the first row of `data` with this symbol
  which_sym <- which(data$symbols == sym)
  stock_name <- data$company_names[which_sym[1]]

  # Sentiments per hour for the given company
  hourly_plot <- ggplot(data = tmp_data, aes(x = id, y = sent_round)) +
    geom_point() +
    geom_line() +
    ggtitle(paste("Means of Sentiments of", stock_name))

  # Pass the plot explicitly instead of relying on the last plot created
  ggsave(filename = paste0("means_sentiments_", stock_name, "_id", ".png"),
         plot = hourly_plot)

  stock_names_selected <- c(stock_names_selected, stock_name)


  print(sym)
}


# ==========================
# Consider the data per day 
# ==========================
# __ as we could not find hourly stock price data

# _ Only consider the preselected data
sentiment_daily <- sentiment_hourly %>% select(time_rounded,
                                               sent_round,
                                               symb)

# _ Transform the hourly time stamps to calendar days ("YYYY-mm-dd")
sentiment_daily$daily <- format(as.POSIXct(sentiment_hourly$time_rounded), '%Y-%m-%d')

# __ Range of the data: row positions of the latest and earliest day
time_max <- which.max(as.POSIXct(sentiment_daily$daily))
time_min <- which.min(as.POSIXct(sentiment_daily$daily))

# __ Sequence of all days in that range.
#    "DSTday" keeps the same clock time (local midnight) on every step, whereas
#    "day" advances by 24 hours and leaves midnight after a daylight-saving
#    change, so the days would no longer compare equal to parsed dates.
time_stamps_daily <- seq(as.POSIXct(sentiment_daily$daily[time_min]),
                         as.POSIXct(sentiment_daily$daily[time_max]),
                         by = "DSTday")

# Empty daily data frame; `id` is numeric like the ids assigned later
mean_senti_daily <- data.frame(
  id = numeric(),
  sent_round = numeric(),
  symb = character(),
  daily = character()
)


# _ Iterate over all chosen companies
for (sym in symb_with_const) {

  # Rows of this symbol; id is filled in below
  tmp_data <- sentiment_daily %>%
    filter(symb == sym) %>%
    mutate("id" = 0)

  # Add id for plotting: position of each day within time_stamps_daily.
  # Days that are not part of the sequence keep id 0.
  day_position <- match(as.numeric(as.POSIXct(tmp_data$daily)),
                        as.numeric(time_stamps_daily))
  tmp_data$id <- ifelse(is.na(day_position), 0, day_position)

  # Daily mean of the hourly values. The day label is taken from the rows of
  # each group, so it always belongs to its id; it no longer depends on
  # unique() returning the days in the same number and order as the groups.
  mean_sentiments_tmp <- tmp_data %>%
    group_by(id) %>%
    summarise("sent_round" = mean(sent_round),
              "daily" = first(daily)) %>%
    mutate("symb" = sym) %>%
    select(id, sent_round, symb, daily)

  # Fill the data frame iteratively
  mean_senti_daily <- rbind(mean_senti_daily, mean_sentiments_tmp)

  # Generate the daily sentiment plot
  daily_plot <- ggplot(data = mean_sentiments_tmp, aes(x = id, y = sent_round)) +
    geom_point() +
    geom_line() +
    ggtitle(paste("Means of Sentiments of", sym, " daily consideration"))

  # Pass the plot explicitly instead of relying on the last plot created
  ggsave(filename = paste0("mean_senti_", sym, "_id_daily", ".png"),
         plot = daily_plot)


  print(sym)
}


# ==========================================
# STOCK DATA ANALYSIS 
# ==========================================

# How are the sentiments of the tweets and 
# the stock data related

# Delete ARRY as no stock data was found for it.
# It is removed by name, so the result does not depend on its position in the
# vector and running the line again does not remove another symbol.
symb_with_const <- symb_with_const[!(symb_with_const %in% "ARRY")]


# Iterate over the preselected stocks: align daily sentiment means with the
# daily stock data of the same symbol and save two plots per symbol
for (sym in symb_with_const) {

  # Filter sentiment data
  data_sentiment <- mean_senti_daily %>%
    filter(symb == sym)

  # Read in the stock data; symbols without a price file are skipped
  stock_file <- paste0(sym, '.csv')
  if (!file.exists(stock_file)) {
    warning("No stock data file found for ", sym, ": ", stock_file,
            call. = FALSE)
    next
  }
  data_stock <- read.csv(file = stock_file)

  # _ Parse the dates of the stock data
  data_stock$Date <- as.POSIXct(strptime(data_stock$Date, '%Y-%m-%d'))

  # For every stock row: position of the sentiment row of the same day (or NA)
  senti_for_stock <- match(format(data_stock$Date, "%Y-%m-%d"),
                           data_sentiment$daily)
  indices_stock <- which(!is.na(senti_for_stock))
  indices_senti <- senti_for_stock[indices_stock]

  # The price change of a day needs the preceding stock row, so a match on the
  # very first row of the file cannot be used
  has_previous_row <- indices_stock > 1
  indices_stock <- indices_stock[has_previous_row]
  indices_senti <- indices_senti[has_previous_row]

  # Only continue when at least one day is present in both data sets
  if (length(indices_stock) > 0) {

    # _ Keep the matching rows of both data sets; the stock data additionally
    #   keeps the row before the first match as reference for the first change

    data_sentiment <- data_sentiment %>%
      slice(indices_senti)

    data_stock <- data_stock %>%
      slice(c(min(indices_stock) - 1, indices_stock))

    # Mid price per row and its change from one kept row to the next
    data_stock_mean <- (data_stock$High + data_stock$Low) / 2
    data_stock_diff <- diff(data_stock_mean)

    # Weight the sentiment by the size of the price change
    data_sentiment$sent_round <- data_sentiment$sent_round * abs(data_stock_diff)

    data_conv <- convolve(data_sentiment$sent_round,
                          data_stock_diff,
                          type = "o")

    # Offset that centres the shorter series under the convolution
    shift <- round((length(data_conv) - length(data_stock_diff)) / 2)

    data_conv.df <- data.frame(V1 = seq_along(data_conv),
                               V2 = data_conv)

    data_plot.df <- data.frame(
      V1 = (shift + 1):(length(data_stock_diff) + shift),
      V2 = data_stock_diff,
      V3 = data_sentiment$sent_round
    )

    conv_plot <- ggplot() +
      geom_line(data = data_conv.df, aes(x = V1, y = V2, color = "Convolution")) +
      geom_line(data = data_plot.df, aes(x = V1, y = V2, color = "Stock")) +
      geom_line(data = data_plot.df, aes(x = V1, y = V3, color = "Sentiments")) +
      ggtitle(paste0("Overview of ", sym, " Data")) +
      labs(x = NULL, y = NULL, color = "Legend") +
      guides(x = "none")

    # Pass the plot explicitly instead of relying on the last plot created
    ggsave(filename = paste0("Convolution_of_", sym, "_data.png"),
           plot = conv_plot)

    # Stock rows (without the reference row) next to their sentiment rows
    plot_data <- cbind(data_stock %>% slice(-1), data_sentiment)
    plot_data <- plot_data %>% mutate("Mean" = (plot_data$Low + plot_data$High) / 2)

    # The names in `values` must equal the interval labels produced by cut()
    points_plot <- ggplot(data = plot_data, aes(x = Date, y = Mean)) +
      geom_line() +
      geom_point(aes(colour = cut(sent_round, c(-Inf, -0.5, 0.5, Inf))), size = 5) +
      scale_color_manual(name = "Sentiment Mean",
                         values = c("(-Inf,-0.5]" = "red",
                                    "(-0.5,0.5]" = "yellow",
                                    "(0.5, Inf]" = "green")
      ) +
      ggtitle(paste0("Stock value of ", sym, " with sentiments")) +
      labs(y = "Daily Stock Mean")

    ggsave(filename = paste0("Col_Points_Stock_", sym, "_data.png"),
           plot = points_plot)

  }

  print(sym)
}