In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

import numpy as np
import sys
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment.
load_dotenv()

# ROOT is expected to hold the project root so that project-local packages
# (imported below) can be resolved. Only extend sys.path when ROOT is actually
# set -- os.getenv returns None otherwise, and a None entry on sys.path is
# useless -- and skip the append when the entry is already present so that
# re-running this cell does not pile up duplicates.
project_root = os.getenv("ROOT")
if project_root and project_root not in sys.path:
    sys.path.append(project_root)

from connector.cassandra_connector import get_session, create_and_set_keyspace
# Open a Cassandra session via the project connector.
# NOTE(review): create_and_set_keyspace presumably creates the keyspace when it
# is missing and makes it the session default -- confirm in the connector module.
session = get_session()
create_and_set_keyspace(session)

# Never truncate columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Columns fetched from proximity_table, in the order they appear in the query.
feature_pool = [
    'id',
    'absolute_magnitude_h',
    'close_approach_date',
    'epoch_date_close_approach',
    'estimated_diameter_max',
    'estimated_diameter_min',
    'is_potentially_hazardous_asteroid',
    'miss_distance_lunar',
    'relative_velocity',
]

# The column list is a fixed constant defined just above, so assembling the
# statement by string concatenation is safe here.
select_query = "SELECT " + ", ".join(feature_pool) + " FROM proximity_table"

# Materialise the query result as a DataFrame.
df = pd.DataFrame(session.execute(select_query))


In [2]:
# Parse the approach dates (expected as YYYY-MM-DD) into pandas datetimes so
# the column supports the .dt accessor and date-based grouping in later cells.
df['close_approach_date'] = pd.to_datetime(
    df['close_approach_date'],
    format='%Y-%m-%d',
)

In [3]:
# Number of close approaches per calendar day: group the rows by the daily
# Period of their approach date and count the rows in each group.
daily_counts = (
    df
    .groupby(df['close_approach_date'].dt.to_period("D"))
    .size()
)

In [4]:
# Turn the per-day counts (a Series keyed by a daily Period) into a two-column
# frame -- close_approach_date, count -- ordered newest day first.
# rename_axis pins the key column's name explicitly, and reset_index(name=...)
# names the value column directly, replacing the earlier rename mapping whose
# 'index' key never matched anything when the index was already named.
daily_counts = (
    daily_counts
    .sort_index(ascending=False)
    .rename_axis('close_approach_date')
    .reset_index(name='count')
)

# Convert the Period values straight to timestamps (start of each day) rather
# than round-tripping through strings; the column has to be datetime so that
# it can be used as the merge key against df['close_approach_date'].
daily_counts['close_approach_date'] = daily_counts['close_approach_date'].dt.to_timestamp()

In [5]:
# Order the observations from the most recent approach date to the oldest.
df = df.sort_values('close_approach_date', ascending=False)

In [6]:
# Attach each day's approach count to every observation from that day.
# A left join keeps all rows of df regardless of whether a count is found.
df = df.merge(daily_counts, how='left', on='close_approach_date')

In [7]:
# Inspect the merged frame: every row now carries the `count` column joined in from daily_counts.
df

Unnamed: 0,id,absolute_magnitude_h,close_approach_date,epoch_date_close_approach,estimated_diameter_max,estimated_diameter_min,is_potentially_hazardous_asteroid,miss_distance_lunar,relative_velocity,count
0,54416841,23.336,2023-12-15,1702644360000,127.890999,57.194593,False,44.6371284947,21227.475006,7
1,54415301,24.251,2023-12-15,1702611600000,83.915073,37.527962,False,67.2630463295,11927.751740,7
2,54414587,24.585,2023-12-15,1702635900000,71.951521,32.177698,False,40.368278555,6419.140202,7
3,54414144,25.044,2023-12-15,1702648920000,58.242496,26.046836,False,9.6105848693,13379.769912,7
4,2137671,18.710,2023-12-15,1702627920000,1076.564312,481.454197,False,88.2889503703,11028.422196,7
...,...,...,...,...,...,...,...,...,...,...
26797,3102728,20.940,2014-12-26,1419567780000,385.513844,172.407032,True,129.1726759038,15672.330037,14
26798,2442177,19.940,2014-12-26,1419572040000,610.998267,273.246732,False,186.2784383978,21920.617348,14
26799,2416002,20.480,2014-12-26,1419563640000,476.474846,213.086029,False,180.1899317269,16491.835965,14
26800,3678562,21.020,2014-12-26,1419588720000,371.569432,166.170902,False,141.8268580017,18068.040287,14


In [8]:
# Index the frame by approach date in ascending order so the split below is
# chronological. The guard keeps this cell re-runnable: once the column has
# been moved into the index, a second set_index call would raise KeyError.
if 'close_approach_date' in df.columns:
    df = df.set_index('close_approach_date')
df = df.sort_index()

# Features are every remaining column; the target is the per-day approach count.
X = df.drop('count', axis=1)
y = df['count']

# Chronological hold-out split.
# Label slices such as X[:cutoff] and X[cutoff:] on a DatetimeIndex include the
# endpoint on BOTH sides, so rows dated exactly on the cutoff would land in the
# training set and the test set at the same time. Boolean masks keep the two
# sets disjoint: train is strictly before the cutoff, test is the cutoff date
# and everything after it.
cutoff = '2023-10-01'
cutoff_ts = pd.Timestamp(cutoff)
train_mask = df.index < cutoff_ts
test_mask = df.index >= cutoff_ts

X_train = X.loc[train_mask]
X_test = X.loc[test_mask]
y_train = y.loc[train_mask]
y_test = y.loc[test_mask]

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test period influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Re-display df: the preceding cell moved close_approach_date into the index and sorted it ascending.
df

Unnamed: 0_level_0,id,absolute_magnitude_h,epoch_date_close_approach,estimated_diameter_max,estimated_diameter_min,is_potentially_hazardous_asteroid,miss_distance_lunar,relative_velocity,count
close_approach_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-12-26,54356031,24.050,1419581400000,92.053431,41.167546,False,194.2647351729,17985.932099,14
2014-12-26,3704151,24.700,1419624480000,68.240151,30.517923,False,11.8034325238,7777.683519,14
2014-12-26,2417419,20.920,1419569400000,389.080960,174.002295,True,117.0719300576,6572.372629,14
2014-12-26,3703011,26.100,1419587520000,35.812940,16.016034,False,0.908685717,15136.435783,14
2014-12-26,2523747,18.820,1419552480000,1023.387195,457.672667,False,143.7248348648,16899.735691,14
...,...,...,...,...,...,...,...,...,...
2023-12-15,2137671,18.710,1702627920000,1076.564312,481.454197,False,88.2889503703,11028.422196,7
2023-12-15,54414144,25.044,1702648920000,58.242496,26.046836,False,9.6105848693,13379.769912,7
2023-12-15,54414587,24.585,1702635900000,71.951521,32.177698,False,40.368278555,6419.140202,7
2023-12-15,54415301,24.251,1702611600000,83.915073,37.527962,False,67.2630463295,11927.751740,7
