# Exoplanets confirmation values and habitability

In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Load the PHL exoplanet catalog into a DataFrame and preview its first rows.
catalog_path = "/kaggle/input/phl-exoplanet-catalog/phl_exoplanet_catalog_2019.csv"
df = pd.read_csv(catalog_path)
df.head()

In [None]:
# Pair every column name with the number of missing values it contains.
missing_counts = df.isnull().sum()
Null_values = [(column, int(count)) for column, count in missing_counts.items()]
Null_values

In [None]:
# Keep only the columns in which fewer than 12.25 % of the rows are missing.
# The counts are recomputed from the current frame, so the cell gives the same
# result when re-run and does not rely on an earlier list staying aligned with
# df.columns.
max_missing = df.shape[0] * 0.1225
missing_per_column = df.isnull().sum()
df = df.loc[:, (missing_per_column < max_missing).to_numpy()]

# Drop every column whose name contains 'ERROR'. The pandas string accessor
# replaces the private np.core.defchararray path, which is deprecated in
# NumPy 2.
is_error_column = df.columns.astype(str).str.contains('ERROR', regex=False)
df = df.loc[:, ~is_error_column]

# Drop every row that still has a missing value. No extra filter for the text
# 'nan' is needed: pd.read_csv parses it as NaN by default, so those rows are
# removed here as well.
df = df.dropna()
df

### Correlations

In [None]:
plt.figure(figsize=(16, 9))

# Correlations are only defined for numeric data. The frame still holds text
# columns at this point (several are label-encoded only later), and
# DataFrame.corr() raises on those in pandas >= 2.0, so restrict the input to
# numeric and boolean columns explicitly.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
heatmap = sns.heatmap(numeric_columns.corr(), vmin=-1, vmax=1, annot=False, cmap='viridis')

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)
plt.show()

Here we can observe that P_HABITABLE, which labels each exoplanet as non-habitable (0), conservatively habitable (1) or optimistically habitable (2), is strongly correlated with:
* P_HABZONE_OPT
* P_HABZONE_CON
* P_ESI

On the other hand, some of the predictors have correlations of 1 with each other, so the information they carry is duplicated. These groups are:

* P_SEMI_MAJOR_AXIS_EST, P_DISTANCE, P_APASTRON, P_DISTANCE_EFF
* P_FLUX, P_FLUX_MAX, P_FLUX_MIN
* P_TEMP_EQUIL, P_TEMP_EQUIL_MAX, P_TEMP_EQUIL_MIN
* S_RA, S_RA_H
* S_RADIUS, S_RADIUS_EST, S_HZ_OPT_MIN, S_HZ_OPT_MAX, S_HZ_CON0_MIN, S_HZ_CON0_MAX, S_HZ_CON1_MIN, S_HZ_CON1_MAX, S_SNOW_LINE

It is worth keeping only one column from each group.
The coordinates of the star will also be dropped, because they only give its location and no information about habitability:

* S_RA, S_RA_H, S_DEC, S_RA_T, S_DEC_T

In [None]:
# Columns that duplicate the information of another column, plus the sky
# coordinates of the star.
redundant_columns = [
    'P_DISTANCE', 'P_APASTRON', 'P_DISTANCE_EFF',
    'P_FLUX_MAX', 'P_FLUX_MIN',
    'P_TEMP_EQUIL_MAX', 'P_TEMP_EQUIL_MIN',
    'S_RA', 'S_DEC', 'S_RA_H', 'S_RA_T', 'S_DEC_T',
    'S_RADIUS_EST',
    'S_HZ_OPT_MIN', 'S_HZ_OPT_MAX',
    'S_HZ_CON_MIN', 'S_HZ_CON_MAX',
    'S_HZ_CON0_MIN', 'S_HZ_CON0_MAX',
    'S_HZ_CON1_MIN', 'S_HZ_CON1_MAX',
    'S_SNOW_LINE',
]
df = df.drop(columns=redundant_columns)

# Replace each categorical column with integer codes; every column gets its
# own freshly fitted encoder.
for categorical_column in ("P_TYPE_TEMP", "P_TYPE", "S_TYPE_TEMP", "P_DETECTION"):
    df[categorical_column] = LabelEncoder().fit_transform(df[categorical_column])

df.head()

In [None]:
plt.figure(figsize=(16, 9))

# Correlate numeric and boolean columns only: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, and the frame may still contain some
# (presumably the name/constellation columns that are excluded from the
# predictors later on).
numeric_columns = df.select_dtypes(include=['number', 'bool'])
heatmap = sns.heatmap(numeric_columns.corr(), vmin=-1, vmax=1, annot=False, cmap='viridis')

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)
plt.show()

Given the known correlations with 'P_HABITABLE' and 'P_ESI', it is reasonable to say that the most important factors for habitability are:

* S_MAG
* S_LOG_G
* S_TYPE_TEMP // S_TEMPERATURE
* P_TYPE_TEMP // P_TEMP_EQUIL
* P_TYPE

Together with the three previous values, the most important factors are some properties of the star and the type and temperature of the planet.

## Visualization

In [None]:
# Four interactive views of P_HABITABLE against the features singled out above:
# three density heatmaps and one scatter plot coloured by habitability.
fig1 = px.density_heatmap(data_frame=df, x='S_MAG', y='S_LOG_G', z='P_HABITABLE')
fig2 = px.density_heatmap(data_frame=df, x='P_TYPE', y='P_TYPE_TEMP', z='P_HABITABLE')
fig3 = px.scatter(data_frame=df, x='P_TEMP_EQUIL', y='P_ESI', color='P_HABITABLE')
fig4 = px.density_heatmap(data_frame=df, x='P_SEMI_MAJOR_AXIS_EST', y='S_TYPE_TEMP', z='P_HABITABLE')

# Render the figures in the order in which they were created.
for figure in (fig1, fig2, fig3, fig4):
    figure.show()

## Classification

In [None]:
# Show the names of the columns that are still present in the frame.
df.columns.tolist()

### Train Test Split

In [None]:
# Columns that are excluded from the predictors, including the target itself.
excluded_columns = [
    'P_NAME', 'P_YEAR', 'P_DETECTION', 'S_NAME', 'S_ALT_NAMES',
    'S_CONSTELLATION', 'S_CONSTELLATION_ABR', 'S_CONSTELLATION_ENG',
    'P_UPDATED', 'P_HABITABLE',
]
Target = df['P_HABITABLE']
Predictors = df.drop(columns=excluded_columns)

# Three independent train/test splits, one per random seed, so that every
# classifier below can be evaluated three times.
(
    (X_tr1, X_tst1, Y_tr1, Y_tst1),
    (X_tr2, X_tst2, Y_tr2, Y_tst2),
    (X_tr3, X_tst3, Y_tr3, Y_tst3),
) = [train_test_split(Predictors, Target, random_state=seed) for seed in (0, 10, 42)]

### Decision Tree Classifier

In [None]:
DTR = DecisionTreeClassifier()

# The three train/test splits created earlier, evaluated one after another.
splits = [
    (X_tr1, X_tst1, Y_tr1, Y_tst1),
    (X_tr2, X_tst2, Y_tr2, Y_tst2),
    (X_tr3, X_tst3, Y_tr3, Y_tst3),
]

sns.set_style(style='dark')
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

errors = []
for number, (axis, (x_train, x_test, y_train, y_test)) in enumerate(zip(axes, splits), start=1):
    # Refit the same estimator on this split and predict its held-out part.
    DTR.fit(x_train, y_train)
    predictions = DTR.predict(x_test)

    # normalize='all' turns every cell into a fraction of all test samples,
    # so the diagonal sums to the accuracy.
    conf_mat = confusion_matrix(y_test, predictions, normalize='all')

    # The colour scale spans the real minimum and maximum of the matrix.
    # (np.min(conf_mat.all()) would be the minimum of a single boolean.)
    sns.heatmap(ax=axis, data=conf_mat, vmin=np.min(conf_mat), vmax=np.max(conf_mat),
                annot=True, annot_kws={"fontsize": 20}, cmap='Spectral')
    axis.set_title('Confusion Matrix {}'.format(number), fontdict={'fontsize': 12}, pad=12)

    # Misclassification rate in percent: everything off the diagonal.
    errors.append((1 - np.diag(conf_mat).sum()) * 100)

mean_error = np.mean(errors)
print("Errors = ({0:.2f}, {1:.2f}, {2:.2f})%".format(*errors))
print("Mean Error = {:.2f} %".format(mean_error))

## Random Forest Classifier

In [None]:
RFC = RandomForestClassifier(n_jobs=2)

# The three train/test splits created earlier, evaluated one after another.
splits = [
    (X_tr1, X_tst1, Y_tr1, Y_tst1),
    (X_tr2, X_tst2, Y_tr2, Y_tst2),
    (X_tr3, X_tst3, Y_tr3, Y_tst3),
]

sns.set_style(style='dark')
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

errors = []
for number, (axis, (x_train, x_test, y_train, y_test)) in enumerate(zip(axes, splits), start=1):
    # Refit the same estimator on this split and predict its held-out part.
    RFC.fit(x_train, y_train)
    predictions = RFC.predict(x_test)

    # normalize='all' turns every cell into a fraction of all test samples,
    # so the diagonal sums to the accuracy.
    conf_mat = confusion_matrix(y_test, predictions, normalize='all')

    # The colour scale spans the real minimum and maximum of the matrix.
    # (np.min(conf_mat.all()) would be the minimum of a single boolean.)
    sns.heatmap(ax=axis, data=conf_mat, vmin=np.min(conf_mat), vmax=np.max(conf_mat),
                annot=True, annot_kws={"fontsize": 20}, cmap='Spectral')
    axis.set_title('Confusion Matrix {}'.format(number), fontdict={'fontsize': 12}, pad=12)

    # Misclassification rate in percent: everything off the diagonal.
    errors.append((1 - np.diag(conf_mat).sum()) * 100)

mean_error = np.mean(errors)
print("Errors = ({0:.2f}, {1:.2f}, {2:.2f})%".format(*errors))
print("Mean Error = {:.2f} %".format(mean_error))

## KNN Classifier

In [None]:
KNNC = KNeighborsClassifier(n_jobs=3)

# The three train/test splits created earlier, evaluated one after another.
splits = [
    (X_tr1, X_tst1, Y_tr1, Y_tst1),
    (X_tr2, X_tst2, Y_tr2, Y_tst2),
    (X_tr3, X_tst3, Y_tr3, Y_tst3),
]

sns.set_style(style='dark')
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

errors = []
for number, (axis, (x_train, x_test, y_train, y_test)) in enumerate(zip(axes, splits), start=1):
    # Refit the same estimator on this split and predict its held-out part.
    KNNC.fit(x_train, y_train)
    predictions = KNNC.predict(x_test)

    # normalize='all' turns every cell into a fraction of all test samples,
    # so the diagonal sums to the accuracy.
    conf_mat = confusion_matrix(y_test, predictions, normalize='all')

    # The colour scale spans the real minimum and maximum of the matrix.
    # (np.min(conf_mat.all()) would be the minimum of a single boolean.)
    sns.heatmap(ax=axis, data=conf_mat, vmin=np.min(conf_mat), vmax=np.max(conf_mat),
                annot=True, annot_kws={"fontsize": 20}, cmap='Spectral')
    axis.set_title('Confusion Matrix {}'.format(number), fontdict={'fontsize': 12}, pad=12)

    # Misclassification rate in percent: everything off the diagonal.
    errors.append((1 - np.diag(conf_mat).sum()) * 100)

mean_error = np.mean(errors)
print("Errors = ({0:.2f}, {1:.2f}, {2:.2f})%".format(*errors))
print("Mean Error = {:.2f} %".format(mean_error))

## Support Vector Classifier

In [None]:
SVCC = SVC(kernel='linear', C=100)

# The three train/test splits created earlier, evaluated one after another.
splits = [
    (X_tr1, X_tst1, Y_tr1, Y_tst1),
    (X_tr2, X_tst2, Y_tr2, Y_tst2),
    (X_tr3, X_tst3, Y_tr3, Y_tst3),
]

sns.set_style(style='dark')
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

errors = []
for number, (axis, (x_train, x_test, y_train, y_test)) in enumerate(zip(axes, splits), start=1):
    # Refit the same estimator on this split and predict its held-out part.
    SVCC.fit(x_train, y_train)
    predictions = SVCC.predict(x_test)

    # normalize='all' turns every cell into a fraction of all test samples,
    # so the diagonal sums to the accuracy.
    conf_mat = confusion_matrix(y_test, predictions, normalize='all')

    # The colour scale spans the real minimum and maximum of the matrix.
    # (np.min(conf_mat.all()) would be the minimum of a single boolean.)
    sns.heatmap(ax=axis, data=conf_mat, vmin=np.min(conf_mat), vmax=np.max(conf_mat),
                annot=True, annot_kws={"fontsize": 20}, cmap='Spectral')
    axis.set_title('Confusion Matrix {}'.format(number), fontdict={'fontsize': 12}, pad=12)

    # Misclassification rate in percent: everything off the diagonal.
    errors.append((1 - np.diag(conf_mat).sum()) * 100)

mean_error = np.mean(errors)
print("Errors = ({0:.2f}, {1:.2f}, {2:.2f})%".format(*errors))
print("Mean Error = {:.2f} %".format(mean_error))

### Methods and errors given:
* Random Forest Classifier: 0.12 % error
* Support Vector Classifier: 0.20 % error
* Decision Tree Classifier: 0.28 % error
* K Nearest Neighbours Classifier: 0.88 % error