# DS 203 Project
## Potentially Hazardous Asteroid Detection
_________

### II) Descriptive

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
from scipy import stats
sns.set(font_scale=1)

In [2]:
df = pd.read_csv("final.csv")
display(df)

Unnamed: 0.1,Unnamed: 0,Neo Reference ID,Absolute Magnitude,Relative Velocity km per sec,Miss Dist.(kilometers),orbit_id,Orbit Uncertainity,Minimum Orbit Intersection,Jupiter Tisserand Invariant,Eccentricity,...,Mean Motion,pha,diameter,albedo,rot_per,ma,class,data_arc,condition_code,rms
0,0,2000433,11.160,4.412295,54688084.0,611,0.0,0.150505,4.583,0.222589,...,0.559880,0.0,25.208245,0.250,5.270,271.071733,AMO,46330.0,0.0,0.28397
1,1,2000719,15.500,,,214,,0.203482,3.140,0.546558,...,0.229955,0.0,,,5.801,140.273422,AMO,39593.0,0.0,0.39148
2,2,2000887,13.800,,,311,,0.082217,3.221,0.570332,...,0.253323,0.0,4.200000,0.310,73.970,294.579167,AMO,37558.0,0.0,0.44039
3,3,2001036,9.400,,,823,,0.344956,3.035,0.533046,...,0.226582,0.0,37.675000,0.238,10.297,4.817744,AMO,35105.0,0.0,0.38018
4,4,2001221,17.700,,,104,,0.107450,3.781,0.435285,...,0.370615,0.0,1.000000,,,38.522411,AMO,32328.0,0.0,0.45839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24631,24631,54097467,21.000,,,2,,0.192704,2.924,0.546396,...,0.240741,0.0,,,,351.405894,AMO,7.0,9.0,0.23911
24632,24632,54097667,23.463,,,1,,0.045436,3.907,0.417971,...,0.421568,0.0,,,,356.724840,AMO,10.0,8.0,0.45779
24633,24633,54097668,25.282,,,1,,0.005352,3.141,0.621031,...,0.247466,0.0,,,,5.162592,APO,1.0,8.0,0.43094
24634,24634,54097669,21.967,,,1,,0.018582,4.201,0.399995,...,0.475470,1.0,,,,344.370686,APO,3.0,9.0,0.26581


* Computing the log-likelihoods for the columns of the dataset with respect to a Gaussian Distribution.

In [3]:
cols = ['Absolute Magnitude', 'Relative Velocity km per sec', 'Miss Dist.(kilometers)', 'Minimum Orbit Intersection',
             'Jupiter Tisserand Invariant', 'Eccentricity', 'Inclination', 'Asc Node Longitude',
             'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Mean Motion', 'albedo',
             'rot_per', 'ma', 'data_arc', 'rms', "diameter"]
for col in cols:
    series = df[col].dropna()
    LL = 0                                         # Log-Likelihood
    mean = np.mean(series)
    var = np.var(series)
    dist = scipy.stats.norm(mean, var)
    for value in series:
        L = dist.pdf(value)
        if L!=0:
            LL += np.log(L)
    print('The Log-likelihood of {} is: '.format(col), LL)
    print('-----------------------------------------------')

The Log-likelihood of Absolute Magnitude is:  -77669.67408563258
-----------------------------------------------
The Log-likelihood of Relative Velocity km per sec is:  -18231.42261057408
-----------------------------------------------
The Log-likelihood of Miss Dist.(kilometers) is:  -128251.68237431378
-----------------------------------------------
The Log-likelihood of Minimum Orbit Intersection is:  -1017393.6219126708
-----------------------------------------------
The Log-likelihood of Jupiter Tisserand Invariant is:  -36704.2979152465
-----------------------------------------------
The Log-likelihood of Eccentricity is:  -330858.0253021057
-----------------------------------------------
The Log-likelihood of Inclination is:  -141391.70308878552
-----------------------------------------------
The Log-likelihood of Asc Node Longitude is:  -251335.63042784564
-----------------------------------------------
The Log-likelihood of Orbital Period is:  -497888.820991351
---------------

#### Hypothesis Testing:

$H_0$ : Larger asteroids have the same mean Minimum Orbit Intersection with respect to smaller asteroids

$H_1$ : Larger asteroids have higher Minimum Orbit Intersection (a measure of is an asteroid is hazardous or not)

In [4]:
tdf = df[['Minimum Orbit Intersection', 'diameter']].dropna()
larger = tdf.where(tdf['diameter']>np.mean(tdf['diameter']))['Minimum Orbit Intersection'].dropna()
smaller = tdf.where(tdf['diameter']<=np.mean(tdf['diameter']))['Minimum Orbit Intersection'].dropna()

t_wel, p_wel = stats.ttest_ind(larger,smaller,equal_var = False)
print("Welch's t test:\n----------------")
print("t-value = {}\np-value = {}\n".format(t_wel,p_wel))

l = min(len(larger), len(smaller))
s_wil, p_wil = stats.wilcoxon(larger[:l], smaller[:l])
print("Wilcoxon signed-rank test:\n----------------")
print("statistic = {}\np-value = {}\n".format(s_wil,p_wil))

l_m = np.mean(larger)
s_m = np.mean(smaller)
print('Mean Minimum Orbit Intersection of Larger asteroids = {}'.format(l_m))
print('Mean Minimum Orbit Intersection of Smaller asteroids = {}'.format(s_m))

Welch's t test:
----------------
t-value = 22.213005878371415
p-value = 9.279151940138131e-97

Wilcoxon signed-rank test:
----------------
statistic = 176572.0
p-value = 1.2178575629443109e-72

Mean Minimum Orbit Intersection of Larger asteroids = 0.15321112564914993
Mean Minimum Orbit Intersection of Smaller asteroids = 0.07309112890343995


* From the small p-values from both tests and comparing the mean Minimum Orbit Intersection of asteroids of the two categories, we reject $H_0$ and accept $H_1$. 
______________________

$H_0$ : Asteroids with larger Aphelion Distances have the same mean Minimum Orbit Intersection with respect to asteroids with smaller Aphelion Distances

$H_1$ : Asteroids with larger Aphelion Distances have the greater mean Minimum Orbit Intersection with respect to asteroids with smaller Aphelion Distances

In [5]:
tdf = df[['Minimum Orbit Intersection', 'Aphelion Dist']].dropna()
larger = tdf.where(tdf['Aphelion Dist']>np.mean(tdf['Aphelion Dist']))['Minimum Orbit Intersection'].dropna()
smaller = tdf.where(tdf['Aphelion Dist']<=np.mean(tdf['Aphelion Dist']))['Minimum Orbit Intersection'].dropna()

t_wel, p_wel = stats.ttest_ind(larger,smaller,equal_var = False)
print("Welch's t test:\n----------------")
print("t-value = {}\np-value = {}\n".format(t_wel,p_wel))

l = min(len(larger), len(smaller))
s_wil, p_wil = stats.wilcoxon(larger[:l], smaller[:l])
print("Wilcoxon signed-rank test:\n----------------")
print("statistic = {}\np-value = {}\n".format(s_wil,p_wil))

l_m = np.mean(larger)
s_m = np.mean(smaller)
print('Mean Minimum Orbit Intersection of asteroids with larger Aphelion distances = {}'.format(l_m))
print('Mean Minimum Orbit Intersection of asteroids with smaller Aphelion distances = {}'.format(s_m))

Welch's t test:
----------------
t-value = 25.583838844905568
p-value = 3.1477379298395522e-142

Wilcoxon signed-rank test:
----------------
statistic = 23776740.5
p-value = 5.824351464915653e-107

Mean Minimum Orbit Intersection of asteroids with larger Aphelion distances = 0.11130102440232453
Mean Minimum Orbit Intersection of asteroids with smaller Aphelion distances = 0.07778775213933085


* From the small p-values from both tests and comparing the mean Minimum Orbit Intersection of asteroids of the two categories, we reject $H_0$ and accept $H_1$. 
_________________________

$H_0$ : Asteroids with larger Perhelion Distances have the same mean Minimum Orbit Intersection with respect to asteroids with smaller Perhelion Distances

$H_1$ : Asteroids with larger Perhelion Distances have the greater mean Minimum Orbit Intersection with respect to asteroids with smaller Perhelion Distances

In [6]:
tdf = df[['Minimum Orbit Intersection', 'Perihelion Distance']].dropna()
larger = tdf.where(tdf['Perihelion Distance']>np.mean(tdf['Perihelion Distance']))['Minimum Orbit Intersection'].dropna()
smaller = tdf.where(tdf['Perihelion Distance']<=np.mean(tdf['Perihelion Distance']))['Minimum Orbit Intersection'].dropna()

t_wel, p_wel = stats.ttest_ind(larger,smaller,equal_var = False)
print("Welch's t test:\n----------------")
print("t-value = {}\np-value = {}\n".format(t_wel,p_wel))

l = min(len(larger), len(smaller))
s_wil, p_wil = stats.wilcoxon(larger[:l], smaller[:l])
print("Wilcoxon signed-rank test:\n----------------")
print("statistic = {}\np-value = {}\n".format(s_wil,p_wil))

l_m = np.mean(larger)
s_m = np.mean(smaller)
print('Mean Minimum Orbit Intersection of asteroids with larger Perihelion distances = {}'.format(l_m))
print('Mean Minimum Orbit Intersection of asteroids with smaller Perihelion distances = {}'.format(s_m))

Welch's t test:
----------------
t-value = 51.48049267429175
p-value = 0.0

Wilcoxon signed-rank test:
----------------
statistic = 10734689.0
p-value = 0.0

Mean Minimum Orbit Intersection of asteroids with larger Perihelion distances = 0.11835685615566005
Mean Minimum Orbit Intersection of asteroids with smaller Perihelion distances = 0.05696763587055239


* From the small p-values from both tests and comparing the mean Minimum Orbit Intersection of asteroids of the two categories, we reject $H_0$ and accept $H_1$. 
_________________________

$H_0$ : Asteroids of class AMO same mean Minimum Orbit Intersection with respect to asteroids of class IEO

$H_1$ : Asteroids of class AMO greater Minimum Orbit Intersection with respect to asteroids of class IEO

In [7]:
tdf = df[['Minimum Orbit Intersection', 'class']].dropna()
larger = tdf.where(tdf['class']=='AMO')['Minimum Orbit Intersection'].dropna()
smaller = tdf.where(tdf['class']=='IEO')['Minimum Orbit Intersection'].dropna()

t_wel, p_wel = stats.ttest_ind(larger,smaller,equal_var = False)
print("Welch's t test:\n----------------")
print("t-value = {}\np-value = {}\n".format(t_wel,p_wel))

l = min(len(larger), len(smaller))
s_wil, p_wil = stats.wilcoxon(larger[:l], smaller[:l])
print("Wilcoxon signed-rank test:\n----------------")
print("statistic = {}\np-value = {}\n".format(s_wil,p_wil))

l_m = np.mean(larger)
s_m = np.mean(smaller)
print('Mean Minimum Orbit Intersection of asteroids of class AMO = {}'.format(l_m))
print('Mean Minimum Orbit Intersection of asteroids of class IEO = {}'.format(s_m))

Welch's t test:
----------------
t-value = 2.3046342394523194
p-value = 0.030930615237572737

Wilcoxon signed-rank test:
----------------
statistic = 78.0
p-value = 0.06801654474625561

Mean Minimum Orbit Intersection of asteroids with larger Eccentricities = 0.1655346080786219
Mean Minimum Orbit Intersection of asteroids with smaller Eccentricities = 0.12243791000000002


* Since the p-values from both tests are relatively big, we accept $H_0$. 
_________________________

$H_0$ : Asteroids with orbits with Eccentricity more than 0.5 have the same mean Perihelion Distance compared to those with Eccentricity less than 0.5.

$H_1$ : Asteroids with orbits with Eccentricity more than 0.5 have lesser mean Perihelion Distance than those  with Eccentricity less than 0.5.

In [12]:
tdf = df[['Eccentricity', 'Perihelion Distance']].dropna()
larger = tdf.where(tdf['Eccentricity']<0.5)['Perihelion Distance'].dropna()
smaller = tdf.where(tdf['Eccentricity']>=0.5)['Perihelion Distance'].dropna()

t_wel, p_wel = stats.ttest_ind(larger,smaller,equal_var = False)
print("Welch's t test:\n----------------")
print("t-value = {}\np-value = {}\n".format(t_wel,p_wel))

l = min(len(larger), len(smaller))
s_wil, p_wil = stats.wilcoxon(larger[:l], smaller[:l])
print("Wilcoxon signed-rank test:\n----------------")
print("statistic = {}\np-value = {}\n".format(s_wil,p_wil))

l_m = np.mean(larger)
s_m = np.mean(smaller)
print('Mean Perihelion Distance of asteroids with Eccentricity < 0.5 : {}'.format(l_m))
print('Mean Perihelion Distance of asteroids with Eccentricity >= 0.5 : {}'.format(s_m))

Welch's t test:
----------------
t-value = 43.419064461212415
p-value = 0.0

Wilcoxon signed-rank test:
----------------
statistic = 13563151.0
p-value = 3.9276726167307e-307

Mean Perihelion Distance of asteroids with Eccentricity < 0.5 : 0.9681988803008162
Mean Perihelion Distance of asteroids with Eccentricity >= 0.5 : 0.8337217677038622


* From the small p-values from both tests and comparing the mean Minimum Orbit Intersection of asteroids of the two categories, we reject $H_0$ and accept $H_1$. 
_________________________