<a href="https://colab.research.google.com/github/shreyapande1/NEOData/blob/main/NEO_Earth_Close_Approaches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
import math

In [3]:
# The original dataset was retrieved from Kaggle.com; it is read here from a
# CSV file hosted on GitHub (raw.githubusercontent.com).
url = 'https://raw.githubusercontent.com/shreyapande1/NEOData/main/NEO%20Earth%20Close%20Approaches.csv'

# Load the CSV into a DataFrame; every later cell derives from `df`.
df = pd.read_csv(url)

# **Data Exploration and Data Cleaning**

In [4]:
# Count the missing entries in each column.
df.isna().sum()

Object                       0
Close-Approach (CA) Date     0
CA DistanceNominal (au)      0
CA DistanceMinimum (au)      0
V relative(km/s)             0
V infinity(km/s)            22
H(mag)                       9
Diameter                     8
Rarity                       9
dtype: int64

In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Object                       object
Close-Approach (CA) Date     object
CA DistanceNominal (au)     float64
CA DistanceMinimum (au)     float64
V relative(km/s)            float64
V infinity(km/s)            float64
H(mag)                      float64
Diameter                     object
Rarity                      float64
dtype: object

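Handle the null values by mean imputation for the numeric columns that contain them (`V infinity(km/s)`, `H(mag)`, `Rarity`); the text column `Diameter` is filled with a placeholder instead.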

In [6]:
# Columns to mean-impute: the float64 columns that reported missing values
# in the null check.
cols = ['V infinity(km/s)', 'H(mag)', 'Rarity']

In [7]:
# Mean-impute every listed column that still has missing values.
#
# The filled column is assigned back to df[col]. Calling
# df[col].fillna(..., inplace=True) acts on an intermediate object (chained
# assignment): pandas warns about it, and under Copy-on-Write it leaves `df`
# unchanged.
for col in cols:
    if df[col].isnull().any():
        mean_val = df[col].mean()
        df[col] = df[col].fillna(mean_val)

In [8]:
# Diameter is a string column (dtype object), so missing entries get a
# placeholder label rather than a mean.
# Assign the result back instead of using fillna(inplace=True) on df['Diameter']:
# the in-place call on the selected column is chained assignment and does not
# update `df` under Copy-on-Write.
df['Diameter'] = df['Diameter'].fillna("Unknown")

In [9]:
# Confirm that no column has missing values left after imputation.
df.isna().sum()

Object                      0
Close-Approach (CA) Date    0
CA DistanceNominal (au)     0
CA DistanceMinimum (au)     0
V relative(km/s)            0
V infinity(km/s)            0
H(mag)                      0
Diameter                    0
Rarity                      0
dtype: int64

In [10]:
# Swap the long, unit-bearing headers for short identifiers.
# NOTE(review): 'V relative(km/s)' becomes 'V_real' -- possibly meant as
# 'V_rel'; the name is kept as-is.
df = df.rename(
    columns={
        'CA DistanceNominal (au)': "Nominal_Distance",
        "CA DistanceMinimum (au)": "Min_Distance",
        "V relative(km/s)": "V_real",
        "V infinity(km/s)": "V_inf",
    }
)

In [11]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric columns.
df.describe(include = 'all')

Unnamed: 0,Object,Close-Approach (CA) Date,Nominal_Distance,Min_Distance,V_real,V_inf,H(mag),Diameter,Rarity
count,35899,35899,35899.0,35899.0,35899.0,35899.0,35899.0,35899,35899.0
unique,13848,35894,,,,,,820,
top,(2022 YG),2021-Oct-06 01:57 ± < 00:01,,,,,,15 m - 34 m,
freq,50,2,,,,,,689,
mean,,,0.027882,0.024017,10.407677,10.384499,24.860669,,0.361354
std,,,0.01384,0.014188,5.393484,5.399292,2.65357,,0.657493
min,,,5e-05,0.0,0.14,0.22,13.9,,0.0
25%,,,0.0167,0.01181,6.59,6.57,23.4,,0.0
50%,,,0.029,0.02385,9.36,9.34,25.3,,0.0
75%,,,0.03979,0.03605,13.16,13.13,26.7,,1.0


In [12]:
# Work on a copy so that columns added to `neo` later do not alter `df`.
neo = df.copy()

In [13]:
# Display the frame; pandas renders a truncated preview of the rows.
neo

Unnamed: 0,Object,Close-Approach (CA) Date,Nominal_Distance,Min_Distance,V_real,V_inf,H(mag),Diameter,Rarity
0,509352 (2007 AG),1900-Jan-04 22:25 ± 00:02,0.00963,0.00962,8.69,8.65,20.2,250 m - 550 m,2.0
1,(2014 SC324),1900-Jan-11 01:07 ± 00:18,0.03997,0.03991,10.65,10.65,24.3,37 m - 82 m,0.0
2,(2012 UK171),1900-Jan-12 23:07 ± 00:13,0.04982,0.04950,7.16,7.15,24.4,34 m - 77 m,0.0
3,4660 Nereus (1982 DB),1900-Jan-29 18:10 ± 00:35,0.02080,0.02077,5.54,5.52,18.6,0.33±0.05 km,2.0
4,(2015 RW83),1900-Feb-04 03:50 ± 14:49,0.03312,0.03258,3.14,3.12,24.1,40 m - 90 m,0.0
...,...,...,...,...,...,...,...,...,...
35894,(2017 UH5),2200-Oct-29 07:16 ± 00:02,0.04891,0.04890,6.60,6.59,26.6,13 m - 28 m,0.0
35895,475534 (2006 TS7),2200-Nov-01 07:43 ± < 00:01,0.01148,0.01148,17.73,17.72,21.3,150 m - 330 m,2.0
35896,413577 (2005 UL5),2200-Nov-23 02:10 ± 00:09,0.01482,0.01457,18.17,18.16,20.3,230 m - 510 m,2.0
35897,(2014 WT202),2200-Nov-23 03:36 ± 00:04,0.04197,0.04170,12.00,11.99,21.1,160 m - 360 m,1.0


In [14]:
# Standard-library Unicode helpers, used to normalize the close-approach date strings
import unicodedata

In [15]:
# For each close-approach string keep only the text before the '±'
# uncertainty suffix and NFKD-normalize it (presumably to fold non-ASCII
# characters into plain equivalents -- confirm against the raw CSV).
results = [
    unicodedata.normalize('NFKD', raw.split('±')[0])
    for raw in neo['Close-Approach (CA) Date']
]

# Parse the cleaned strings into timestamps.
neo['Closest Date'] = pd.to_datetime(results)

# Build the DatetimeIndex once and read each calendar/time component from it.
closest = pd.DatetimeIndex(neo['Closest Date'])
neo['year'] = closest.year
neo['month'] = closest.month
neo['date'] = closest.day
neo['time'] = closest.time

In [16]:
# String-based split of the raw close-approach text using fixed character
# positions (the text starts like "1900-Jan-04 22:25 ...").
sample_data = neo.copy()
ca_text = sample_data['Close-Approach (CA) Date']

year = [stamp[0:4] for stamp in ca_text]     # characters 0-3: year
month = [stamp[5:8] for stamp in ca_text]    # characters 5-7: month abbreviation
date = [stamp[9:11] for stamp in ca_text]    # characters 9-10: day of month
time = [stamp[12:17] for stamp in ca_text]   # characters 12-16: HH:MM

sample_data['Year_Close_Approach'] = year
sample_data['Month_Close_Approach'] = month
sample_data['Date_Close_Approach'] = date
sample_data['Time_Close_Approach'] = time

In [17]:
# Preview the first rows, including the parsed and the string-sliced date columns.
sample_data.head()

Unnamed: 0,Object,Close-Approach (CA) Date,Nominal_Distance,Min_Distance,V_real,V_inf,H(mag),Diameter,Rarity,Closest Date,year,month,date,time,Year_Close_Approach,Month_Close_Approach,Date_Close_Approach,Time_Close_Approach
0,509352 (2007 AG),1900-Jan-04 22:25 ± 00:02,0.00963,0.00962,8.69,8.65,20.2,250 m - 550 m,2.0,1900-01-04 22:25:00,1900,1,4,22:25:00,1900,Jan,4,22:25
1,(2014 SC324),1900-Jan-11 01:07 ± 00:18,0.03997,0.03991,10.65,10.65,24.3,37 m - 82 m,0.0,1900-01-11 01:07:00,1900,1,11,01:07:00,1900,Jan,11,01:07
2,(2012 UK171),1900-Jan-12 23:07 ± 00:13,0.04982,0.0495,7.16,7.15,24.4,34 m - 77 m,0.0,1900-01-12 23:07:00,1900,1,12,23:07:00,1900,Jan,12,23:07
3,4660 Nereus (1982 DB),1900-Jan-29 18:10 ± 00:35,0.0208,0.02077,5.54,5.52,18.6,0.33±0.05 km,2.0,1900-01-29 18:10:00,1900,1,29,18:10:00,1900,Jan,29,18:10
4,(2015 RW83),1900-Feb-04 03:50 ± 14:49,0.03312,0.03258,3.14,3.12,24.1,40 m - 90 m,0.0,1900-02-04 03:50:00,1900,2,4,03:50:00,1900,Feb,4,03:50


# **Visualizing our data**

In [18]:
import plotly.express as px 

**Visualization 1**

In [19]:
# Histogram of the close-approach records over the 'year' column.
count_date = px.histogram(data_frame=neo, x='year')
count_date.show()

**Visualization 2**

In [20]:
# Ten objects with the most recorded close approaches.
#
# The two columns are labelled by position instead of being looked up as
# 'index' / 'Object': value_counts().reset_index() yields ('index', 'Object')
# on pandas 1.x but ('Object', 'count') from pandas 2.0 on, so the name-based
# lookup fails on newer versions. The first column is the object name, the
# second its count, in both versions.
top_objects = (
    neo['Object']
    .value_counts()
    .reset_index()
    .head(10)
    .set_axis(['Near-Earth Object', 'Count of the object'], axis=1)
)

# The column names double as the axis titles, so no `labels` mapping is needed.
count_box = px.bar(
    top_objects,
    x='Near-Earth Object',
    y='Count of the object',
    color_discrete_sequence=['blue'],
    title='Top 10 near-earth objects',
)

count_box.show()

**Visualization 3**

In [40]:
# Drop any rows that still contain a missing value in any column.
neo_no_na = sample_data.dropna()

In [53]:
# Occurrences per object (most frequent first), turned into a two-column
# frame and cut to the first 20 rows.
neo_top20 = neo_no_na['Object'].value_counts().reset_index().head(20)

In [55]:
#neo_top20

In [56]:
# Label the top-20 table's columns: object name first, occurrence count second.
#
# Fixes two problems in the previous version of this cell:
#   * it read from `neo_top10`, which is not defined anywhere in the visible
#     notebook (NameError on a fresh kernel);
#   * its second rename started again from that same source, discarding the
#     result of the first.
# Labels are set by position so the cell is safe to re-run and does not depend
# on the column names value_counts().reset_index() produces, which differ
# between pandas 1.x and 2.x.
neo_top20 = neo_top20.set_axis(['Object', 'count'], axis=1)

In [57]:
# Keep only the rows whose object appears in the top-20 table (inner join on
# 'Object'), then order them by year, earliest first.
neo_todisplay = (
    neo_no_na
    .merge(neo_top20, on='Object', how='inner')
    .sort_values(by=['year'], ascending=[True], na_position='first')
)
neo_todisplay.head()

Unnamed: 0,Object,Close-Approach (CA) Date,Nominal_Distance,Min_Distance,V_real,V_inf,H(mag),Diameter,Rarity,Closest Date,year,month,date,time,Year_Close_Approach,Month_Close_Approach,Date_Close_Approach,Time_Close_Approach,count
0,(2019 AY3),1901-May-22 13:42 ± < 00:01,0.03187,0.03186,18.76,18.76,23.8,46 m - 100 m,0.0,1901-05-22 13:42:00,1901,5,22,13:42:00,1901,May,22,13:42,31
69,(2009 WY7),1902-Dec-02 22:39 ± 01:54,0.01728,0.01725,12.93,12.92,24.1,40 m - 90 m,1.0,1902-12-02 22:39:00,1902,12,2,22:39:00,1902,Dec,2,22:39,32
31,(2010 VQ),1902-Nov-17 20:07 ± 01:28,0.04327,0.04324,3.91,3.9,27.7,7.6 m - 17 m,0.0,1902-11-17 20:07:00,1902,11,17,20:07:00,1902,Nov,17,20:07,38
70,(2009 WY7),1903-Nov-03 22:37 ± 02:53,0.03226,0.03189,13.34,13.33,24.1,40 m - 90 m,0.0,1903-11-03 22:37:00,1903,11,3,22:37:00,1903,Nov,3,22:37,32
1,(2019 AY3),1905-May-22 05:16 ± < 00:01,0.03518,0.03518,19.06,19.05,23.8,46 m - 100 m,0.0,1905-05-22 05:16:00,1905,5,22,05:16:00,1905,May,22,05:16,31


In [59]:
# Animated scatter: nominal close-approach distance of the selected objects,
# one animation frame per year, one colour per object.
fig = px.scatter(
    neo_todisplay,
    x='Nominal_Distance',
    y='year',
    animation_frame="year",
    # animation_group identifies the same entity across frames, so it is keyed
    # on the object; keying it on the frame variable ('year') gives no
    # object constancy between frames.
    animation_group="Object",
    range_x=[-0.01, 0.1],
    range_y=[1900, 2200],
    color='Object',
    # Nominal_Distance was renamed from 'CA DistanceNominal (au)', hence the unit.
    labels={'Nominal_Distance': 'Nominal distance (au)', 'year': 'Year'},
    title='Nominal close-approach distance by year',
)

# Vertical reference line at zero distance, annotated as Earth's position.
fig.add_vline(x=0)
fig.add_annotation(
    x=0,
    text='Earth<br>Center',  # plain string: there are no placeholders to format
    yanchor='bottom',
    showarrow=True,
    arrowhead=1,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor="#636363",
    ax=-20,
    ay=-30,
    font=dict(size=20, color="green", family="Courier New, monospace"),
    align="left",
    bordercolor='green',
    borderwidth=2,
    bgcolor="#CFECEC",
    opacity=0.8,
)
fig.show()