In [1]:
!pip install pymongo
import json

# Load connection settings from a local JSON file so the MongoDB URI is not
# hardcoded in the notebook. The encoding is pinned to UTF-8 so the file is
# decoded the same way regardless of the platform's locale default.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
# Raises KeyError if the config file has no 'MONGO_URI' entry.
mongo_uri = config['MONGO_URI']


from pymongo import MongoClient
import pandas as pd
import hashlib


Collecting pymongo
  Downloading pymongo-4.8.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.8.0-cp311-cp311-macosx_10_9_x86_64.whl (645 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m645.6/645.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.8.0


In [2]:
# Connect using the URI read from the config file, then obtain handles to the
# database and the two collections used throughout the notebook.
atlas_client = MongoClient(mongo_uri)
db = atlas_client.get_database('stock_db')
# 'stocks' receives the CSV rows; 'metadata' receives one record per loaded file.
collection = db.get_collection('stocks')
metadata_collection = db.get_collection('metadata')

In [3]:
def calculate_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the MD5 hex digest of a file's contents.

    The file is read in fixed-size chunks so that large files are hashed
    without loading them into memory all at once. The digest is identical to
    hashing the whole file in a single call.

    Parameters
    ----------
    file_path : str or path-like
        Path of the file to hash; opened in binary mode.
    chunk_size : int, optional
        Number of bytes read per iteration (default 1 MiB). Must be positive.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file's bytes.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    OSError
        If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) would return b'' immediately and silently hash nothing.
        raise ValueError("chunk_size must be a positive integer")

    hasher = hashlib.md5()
    with open(file_path, 'rb') as file:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: file.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


In [4]:
def file_already_loaded(file_hash):
    """Return True if the metadata collection holds a record with this file hash.

    Looks up a single document in the module-level ``metadata_collection``
    whose ``file_hash`` field equals ``file_hash``.
    """
    existing_record = metadata_collection.find_one({"file_hash": file_hash})
    if existing_record is None:
        return False
    return True

def save_file_metadata(file_path, file_hash):
    """Insert one record describing a loaded file into ``metadata_collection``.

    The record stores the path as ``file_name``, the given hash as
    ``file_hash``, and the current local time as ``loaded_at``.
    """
    metadata_collection.insert_one(
        dict(
            file_name=file_path,
            file_hash=file_hash,
            loaded_at=pd.Timestamp.now(),
        )
    )


In [5]:
def load_csv_to_mongodb(file_path):
    """Load a CSV file into the ``collection`` once, keyed by its content hash.

    The file's hash is computed first; if a metadata record with that hash
    already exists, the file is skipped. Otherwise every CSV row is inserted
    as one document and a metadata record is saved for the file.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to load.

    Returns
    -------
    None
        Progress is reported via ``print``.
    """
    file_hash = calculate_file_hash(file_path)
    if file_already_loaded(file_hash):
        print(f"File {file_path} has already been loaded previously.")
        return

    records = pd.read_csv(file_path).to_dict('records')
    if not records:
        # pymongo's insert_many raises TypeError when given an empty list, so a
        # CSV with a header but no data rows is reported instead of crashing.
        print(f"File {file_path} contains no data rows; nothing was loaded.")
        return

    collection.insert_many(records)
    # Metadata is written only after the insert, so a failed insert is not
    # recorded as a completed load.
    save_file_metadata(file_path, file_hash)
    print(f"Data from the file {file_path} has been loaded successfully.")


In [20]:
## Start the process

In [6]:
# Path is relative to the notebook's working directory.
csv_path = './all_stocks_5yr.csv'
# The loader skips the file when its hash is already recorded in the metadata.
load_csv_to_mongodb(file_path=csv_path)

File ./all_stocks_5yr.csv has already been loaded previously.


In [26]:
def get_data_from_mongodb():
    """Return every document in the module-level ``collection`` as a DataFrame.

    The cursor from an unfiltered ``find()`` is materialised into a list and
    passed to ``pd.DataFrame``.
    """
    documents = list(collection.find())
    return pd.DataFrame(documents)

# Pull the full collection, then drop every row that has at least one missing value.
df_mongo = get_data_from_mongodb()
cleaned_df = df_mongo.dropna(axis=0, how='any')
# Preview the first five rows as the cell's rich output.
cleaned_df.head(5)


Unnamed: 0,_id,date,open,high,low,close,volume,Name
0,66972da16ad46174a727aa86,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
1,66972da16ad46174a727aa84,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
2,66972da16ad46174a727aa8e,2013-02-25,13.6,13.76,13.0,13.02,7186400,AAL
3,66972da16ad46174a727aa95,2013-03-06,14.52,14.68,14.25,14.57,13243200,AAL
4,66972da16ad46174a727aa98,2013-03-11,14.85,15.15,14.71,15.13,6961800,AAL
