In [1]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
def process_and_output_csvs(input_folder, output_folder):
    """Merge every CSV in ``input_folder`` and write one CSV per subreddit.

    All files whose name ends in ``.csv`` are read, stacked into a single
    frame and de-duplicated on the full row. The result is split on the
    ``subreddit`` column and each group is written to
    ``<output_folder>/<subreddit>.csv``. If that file already exists, its rows
    are merged with the new ones (again dropping exact duplicates) before it
    is rewritten, so the function can be re-run as more input files arrive.

    Parameters
    ----------
    input_folder : str
        Folder that holds the CSV files to combine.
    output_folder : str
        Folder that receives the per-subreddit CSV files. Created if missing.

    Returns
    -------
    None
        Results are written to disk; progress and problems are printed.

    Raises
    ------
    KeyError
        If the combined data has no ``subreddit`` column.
    """
    # exist_ok=True replaces the separate "exists?" check and is safe on re-runs
    os.makedirs(output_folder, exist_ok=True)

    all_data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # of the output files reproducible from run to run
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(input_folder, filename)
        try:
            all_data.append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error processing file {filename}: {e}")

    # Nothing was readable: report it and stop before touching any output file
    if not all_data:
        print("No data to process. Check if the folder contains CSV files or if they are readable.")
        return

    # Stack all inputs and drop rows that are identical in every column
    concatenated_df = pd.concat(all_data, ignore_index=True).drop_duplicates()

    # groupby skips rows whose key is missing, so make that loss visible
    missing_subreddit = int(concatenated_df['subreddit'].isna().sum())
    if missing_subreddit:
        print(f"Skipping {missing_subreddit} row(s) with no subreddit value.")

    # One output file per subreddit, merged with whatever is already on disk
    for subreddit, group in concatenated_df.groupby('subreddit'):
        output_file = os.path.join(output_folder, f"{subreddit}.csv")
        if os.path.exists(output_file):
            existing_data = pd.read_csv(output_file)
            group = pd.concat([existing_data, group], ignore_index=True).drop_duplicates()
        group.to_csv(output_file, index=False)

In [3]:
# Collected CSVs are read from input_folder; one CSV per subreddit is written to output_folder.
input_folder = '../collected_data/data'
output_folder = '../EDA/data/'
# Re-running this cell merges the input rows into any per-subreddit files that
# already exist in output_folder (exact duplicate rows are dropped).
process_and_output_csvs(input_folder, output_folder)

## Stock Statistics

In [4]:
# Load the per-subreddit file for "stocks" from the folder process_and_output_csvs wrote to
stocks = pd.read_csv('../EDA/data/stocks.csv')

In [5]:
# Preview the first two rows to check the columns loaded as expected
stocks.head(2)

Unnamed: 0,id,subreddit,title,posttext,time_posted,time_now,time_difference,num_comments,upvote_ratio,url
0,18888ww,stocks,Rate My Portfolio - r/Stocks Quarterly Thread ...,Please use this thread to discuss your portfol...,2023-12-01 10:00:24,2024-02-28 22:08:57.699800,89 days 12:08:33.699801,621,0.99,https://www.reddit.com/r/stocks/comments/18888...
1,1b22xgu,stocks,"r/Stocks Daily Discussion Wednesday - Feb 28, ...",These daily discussions run from Monday to Fri...,2024-02-28 10:30:09,2024-02-28 22:08:57.699812,0 days 11:38:48.699813,387,0.88,https://www.reddit.com/r/stocks/comments/1b22x...


In [6]:
# (rows, columns) of the stocks frame
stocks.shape

(6082, 10)

In [7]:
# Column dtypes. The time_* columns come back as object (plain strings) because
# read_csv was not asked to parse dates; convert them before doing any time arithmetic.
stocks.dtypes

id                  object
subreddit           object
title               object
posttext            object
time_posted         object
time_now            object
time_difference     object
num_comments         int64
upvote_ratio       float64
url                 object
dtype: object

## Investing

In [8]:
# Load the per-subreddit file for "investing" from the same output folder
investing = pd.read_csv('../EDA/data/investing.csv')

In [9]:
# (rows, columns) of the investing frame
investing.shape

(13753, 10)

In [10]:
# Preview the first five rows.
# NOTE(review): in the saved output these rows have no num_comments / upvote_ratio / url
# values — check how many rows are affected before using those columns.
investing.head(5)

Unnamed: 0,id,subreddit,title,posttext,time_posted,time_now,time_difference,num_comments,upvote_ratio,url
0,1axwwm1,investing,Daily General Discussion and Advice Thread - F...,Have a general question? Want to offer some c...,2024-02-23 10:01:08,2024-02-24 00:37:44.155873,0 days 14:36:36.155876,,,
1,1ay852f,investing,Do all stocks go up because one stock goes up?,"NVDA is a $2 trillion company, which is the th...",2024-02-23 18:40:11,2024-02-24 00:37:44.155886,0 days 05:57:33.155886,,,
2,1ay5t1j,investing,Anyone looking to live just off dividends and ...,Planning to work as long as I can to build up ...,2024-02-23 17:10:12,2024-02-24 00:37:44.155890,0 days 07:27:32.155891,,,
3,1ayclmn,investing,Should I dump them or hold?,Have two very bad pick stocks (don't we all?) ...,2024-02-23 21:41:08,2024-02-24 00:37:44.155895,0 days 02:56:36.155895,,,
4,1axo37b,investing,"For people that rode the NVDA pump ""all the wa...",NVDA is blowing my mind. I'm certainly more of...,2024-02-23 01:32:09,2024-02-24 00:37:44.155900,0 days 23:05:35.155900,,,


In [11]:
# Stack both subreddit frames into one master frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of `stocks` and
# `investing` are both kept and repeat, which makes label-based lookups
# (master_df.loc[...]) ambiguous.
master_df = pd.concat([stocks, investing], ignore_index=True)
# Sanity check: concatenation must neither drop nor add rows
assert len(master_df) == len(stocks) + len(investing)
print(f'Master DF: {master_df.shape},\n Stocks: {stocks.shape}, \n Investing: {investing.shape}')


Master DF: (19835, 10),
 Stocks: (6082, 10), 
 Investing: (13753, 10)


In [12]:
# NOTE: master_df (stocks + investing) is created in the previous cell; this cell contains no code.

In [13]:
# Persist the combined frame next to the per-subreddit files;
# index=False keeps the row index out of the CSV.
master_df.to_csv(
    '../EDA/data/master_df.csv',
    index=False,
)