In [1]:
# Import dependencies:
import getpass
from pathlib import Path

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


#from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.metrics import balanced_accuracy_score
#from imblearn.metrics import classification_report_imbalanced
#from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# Load the data:
# Read the cleaned Kepler CSV; the path is relative to the notebook's working directory.
file_path = Path("../data/CLEAN_KEPLER_DATASET.csv")
exoplanet_df = pd.read_csv(file_path)


# Print (rows, columns), then preview the first five rows.
print(exoplanet_df.shape)
exoplanet_df.head()

(9564, 141)


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,8/16/2018,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,8/16/2018,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,8/16/2018,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [3]:
# Count how many rows fall into each "koi_disposition" class
# (the recorded run shows three labels: FALSE POSITIVE, CONFIRMED, CANDIDATE):
exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4839
CONFIRMED         2669
CANDIDATE         2056
Name: koi_disposition, dtype: int64

In [4]:
# Columns kept for modelling: the KOI identifier, the planet and host-star
# measurements, and the disposition label. "rowid" is promoted to the index.
selected_columns = [
    "rowid",
    "kepoi_name",
    "koi_count",
    "koi_prad",
    "koi_teq",
    "koi_period",
    "koi_srad",
    "koi_steff",
    "koi_smass",
    "koi_slogg",
    "koi_ror",
    "koi_disposition",
]

# Subset the raw table, then use "rowid" as the row index.
new_exoplanet_df = exoplanet_df.loc[:, selected_columns].set_index("rowid")

# Clear the index label so the displayed table has no extra "rowid" header row.
new_exoplanet_df.index.name = None

print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9564, 11)


Unnamed: 0,kepoi_name,koi_count,koi_prad,koi_teq,koi_period,koi_srad,koi_steff,koi_smass,koi_slogg,koi_ror,koi_disposition
1,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,CONFIRMED
2,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,CONFIRMED
3,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,CANDIDATE
4,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,FALSE POSITIVE
5,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,CONFIRMED


In [5]:
# Dropna from new dataframe:
# Discard every row with a missing value in any of the selected columns
# (the recorded run goes from 9564 to 9201 rows). The result is rebound to the
# same name; re-running the cell is a no-op because the frame is already NaN-free.
new_exoplanet_df= new_exoplanet_df.dropna()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9201, 11)


Unnamed: 0,kepoi_name,koi_count,koi_prad,koi_teq,koi_period,koi_srad,koi_steff,koi_smass,koi_slogg,koi_ror,koi_disposition
1,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,CONFIRMED
2,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,CONFIRMED
3,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,CANDIDATE
4,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,FALSE POSITIVE
5,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,CONFIRMED


In [6]:
# Drop duplicates:
# Remove rows that are exact duplicates across all columns. In the recorded run
# nothing is removed (shape stays (9201, 11)), but the step yields a fresh frame
# that the following cells are free to relabel.
clean_exoplanet_df = new_exoplanet_df.drop_duplicates()
print(clean_exoplanet_df.shape)
clean_exoplanet_df.head()

(9201, 11)


Unnamed: 0,kepoi_name,koi_count,koi_prad,koi_teq,koi_period,koi_srad,koi_steff,koi_smass,koi_slogg,koi_ror,koi_disposition
1,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,CONFIRMED
2,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,CONFIRMED
3,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,CANDIDATE
4,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,FALSE POSITIVE
5,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,CONFIRMED


In [7]:
# Count distinct values in "koi_disposition" column after cleaning;
# compare with the raw counts to see how many rows each class lost to dropna:
clean_exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4582
CONFIRMED         2668
CANDIDATE         1951
Name: koi_disposition, dtype: int64

In [8]:
# Give columns proper, ETL-friendly names.
#
# An explicit old -> new mapping is used instead of assigning a list to
# `.columns`: a positional list silently mislabels the data if the upstream
# column selection or order ever changes, whereas `rename` matches by name.
# Re-running the cell is harmless because already-renamed labels are ignored.
#
# NOTE: "Equalibrium" (sic) is kept as spelled, since the exported CSV files use
# this column name.
COLUMN_NAMES = {
    "kepoi_name": "Kepler_OI_Name",
    "koi_count": "Number_of_Planets",
    "koi_prad": "Planet_Radius_Earth",
    "koi_teq": "Equalibrium_Temp_K",
    "koi_period": "Orbit_Period_Days",
    "koi_srad": "Star_Radius_Sun",
    "koi_steff": "Star_Temp_K",
    "koi_smass": "Star_Mass_Sun",
    # Stellar Surface Gravity (log10(cm s-2)):
    # the base-10 logarithm of the acceleration due to gravity at the surface of the star.
    "koi_slogg": "Star_Surface_Gravity",
    "koi_ror": "Planet_Star_Radius_Ratio",
    "koi_disposition": "Status",
}

clean_exoplanet_df = clean_exoplanet_df.rename(columns=COLUMN_NAMES)

clean_exoplanet_df.head()

Unnamed: 0,Kepler_OI_Name,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
1,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,CONFIRMED
2,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,CONFIRMED
3,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,CANDIDATE
4,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,FALSE POSITIVE
5,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,CONFIRMED


In [9]:
# Replace values in 'Status' with 0 , 1 , or 2:
#   FALSE POSITIVE -> 0, CONFIRMED -> 1, CANDIDATE -> 2
#
# Only the 'Status' column is encoded. A frame-wide `replace` would also rewrite
# any matching string found in other columns and depends on pandas inferring the
# new integer dtype on its own.
STATUS_CODES = {'CONFIRMED': 1, 'FALSE POSITIVE': 0, 'CANDIDATE': 2}

nm_value_df = clean_exoplanet_df.assign(
    # An unexpected label maps to NaN, which makes the int64 cast raise instead
    # of letting an unknown class slip into the model data.
    Status=clean_exoplanet_df['Status'].map(STATUS_CODES).astype('int64')
)
print(nm_value_df.shape)
nm_value_df.head()

(9201, 11)


Unnamed: 0,Kepler_OI_Name,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
1,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,1
2,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,1
3,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,2
4,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,0
5,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,1


In [10]:
# Check values in "Status" column: the per-code counts should match the
# per-label counts from before the encoding (0 = FALSE POSITIVE, 1 = CONFIRMED, 2 = CANDIDATE).
nm_value_df['Status'].value_counts()

0    4582
1    2668
2    1951
Name: Status, dtype: int64

## Upload "nm_value_df" to the S3 bucket as a .csv file:

In [11]:
# Export the Dataframe as a new CSV file without the index.
# index=False means the row labels (the former "rowid" values) are not written to the file.
nm_value_df.to_csv("../data/nm_value_df.csv", index=False)

In [12]:
# Upload the exported CSV to the S3 bucket.
# boto3 and getpass are imported in the dependency cell at the top of the
# notebook rather than here, so all requirements are visible in one place.

# Creating Session With Boto3:
# the keys are typed in at run time through getpass, so they are never stored
# in the notebook source or echoed in its output.
session = boto3.Session(
    aws_access_key_id=getpass.getpass('Enter aws_access_key_id'),
    aws_secret_access_key=getpass.getpass('Enter aws_secret_access_key'),
)

# Creating S3 Resource From the Session:
s3 = session.resource('s3')

# Upload file (local path -> object key in the bucket):
# upload_file has no meaningful return value (the recorded run prints None),
# so a printed None only indicates that the call completed.
result = s3.Bucket('testbucketps42').upload_file('../data/nm_value_df.csv', 'nm_value_df.csv')

print(result)

Enter aws_access_key_id········
Enter aws_secret_access_key········
None


In [14]:
# Load the data from S3 bucket:
# The object is fetched over plain HTTPS with no credentials, so this presumably
# requires the object to be publicly readable — TODO confirm.
# NOTE: this rebinds both `file_path` (now a URL string) and `nm_value_df`; the
# reloaded frame gets a fresh 0-based index because the CSV was written without one.
file_path = "https://testbucketps42.s3.us-east-2.amazonaws.com/nm_value_df.csv"
nm_value_df = pd.read_csv(file_path)
print(nm_value_df.shape)
nm_value_df.head()

(9201, 11)


Unnamed: 0,Kepler_OI_Name,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
0,K00752.01,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,1
1,K00752.02,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,1
2,K00753.01,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,2
3,K00754.01,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,0
4,K00755.01,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,1


In [15]:
# Check data types in columns after the CSV round trip
# (only the KOI name should be non-numeric):
nm_value_df.dtypes

Kepler_OI_Name               object
Number_of_Planets             int64
Planet_Radius_Earth         float64
Equalibrium_Temp_K          float64
Orbit_Period_Days           float64
Star_Radius_Sun             float64
Star_Temp_K                 float64
Star_Mass_Sun               float64
Star_Surface_Gravity        float64
Planet_Star_Radius_Ratio    float64
Status                        int64
dtype: object

In [16]:
# Create a new DataFrame that holds only the Kepler Object of Interest (KOI) names.
# The double brackets keep this a one-column DataFrame sharing nm_value_df's index,
# which is what allows the names to be re-attached to the predictions later on.
# NOTE: this must run before the cell that drops "Kepler_OI_Name" from nm_value_df.
names_df = nm_value_df[["Kepler_OI_Name"]]
print(names_df.shape)
names_df.head(10)

(9201, 1)


Unnamed: 0,Kepler_OI_Name
0,K00752.01
1,K00752.02
2,K00753.01
3,K00754.01
4,K00755.01
5,K00756.01
6,K00756.02
7,K00756.03
8,K00114.01
9,K00757.01


In [17]:
# Drop the 'Kepler_OI_Name' column since it is an identifier, not a model feature.
# errors="ignore" keeps the cell re-runnable: nm_value_df is rebound here, so a
# second execution would otherwise raise KeyError because the column is already gone.
nm_value_df = nm_value_df.drop(columns=["Kepler_OI_Name"], errors="ignore")
print(nm_value_df.shape)
nm_value_df.head(10)

(9201, 10)


Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
0,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,1
1,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,1
2,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,2
3,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,0
4,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,1
5,3,3.9,835.0,11.094321,0.972,6046.0,1.053,4.486,0.036779,1
6,3,2.77,1160.0,4.134435,0.972,6046.0,1.053,4.486,0.026133,1
7,3,1.59,1360.0,2.566589,0.972,6046.0,1.053,4.486,0.014983,1
8,1,39.21,1342.0,7.36179,1.958,6227.0,1.358,3.986,0.183387,0
9,3,5.76,600.0,16.068647,0.848,5031.0,0.801,4.485,0.062161,1


# Separate Candidate planets from Confirmed and False Positives:

In [18]:
# Create a DF for Verified Planets:
# keep every row whose Status is not 2 (CANDIDATE), i.e. the CONFIRMED and
# FALSE POSITIVE objects that carry a usable label.
is_candidate = nm_value_df["Status"] == 2
verified_planets_df = nm_value_df.loc[~is_candidate]

print(verified_planets_df.shape)
verified_planets_df.head()

(7250, 10)


Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
0,2,2.26,793.0,9.488036,0.927,5455.0,0.919,4.467,0.022344,1
1,2,2.83,443.0,54.418383,0.927,5455.0,0.919,4.467,0.027954,1
3,1,33.46,1395.0,1.736952,0.791,5805.0,0.836,4.564,0.387394,0
4,1,2.75,1406.0,2.525592,1.046,6031.0,1.095,4.438,0.024064,1
5,3,3.9,835.0,11.094321,0.972,6046.0,1.053,4.486,0.036779,1


In [19]:
# Check values in "Status" column: only 0 and 1 should remain in the labelled set.
verified_planets_df['Status'].value_counts()

0    4582
1    2668
Name: Status, dtype: int64

In [20]:
# Create a DF for Unverified Planets:
# keep every row whose Status is neither 1 (CONFIRMED) nor 0 (FALSE POSITIVE).
is_labelled = nm_value_df["Status"].isin([1, 0])
unverified_planets_df = nm_value_df.loc[~is_labelled]

print(unverified_planets_df.shape)
unverified_planets_df.head()

(1951, 10)


Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
2,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,2
37,1,12.21,1103.0,4.959319,1.082,5712.0,0.976,4.359,0.103379,2
58,1,7.51,467.0,40.419504,0.781,5446.0,0.714,4.507,0.088069,2
62,2,19.45,734.0,7.240661,0.765,5005.0,0.85,4.595,0.232818,2
63,3,0.55,1272.0,3.435916,1.087,5779.0,0.941,4.339,0.004612,2


In [21]:
# Check values in "Status" column: only 2 (CANDIDATE) should remain in the unlabelled set.
unverified_planets_df['Status'].value_counts()

2    1951
Name: Status, dtype: int64

In [22]:
# Check data types in columns: every column of the labelled set should be numeric
# before it is handed to the model.
verified_planets_df.dtypes

Number_of_Planets             int64
Planet_Radius_Earth         float64
Equalibrium_Temp_K          float64
Orbit_Period_Days           float64
Star_Radius_Sun             float64
Star_Temp_K                 float64
Star_Mass_Sun               float64
Star_Surface_Gravity        float64
Planet_Star_Radius_Ratio    float64
Status                        int64
dtype: object

In [23]:
# Check data types in columns: the unlabelled set should have the same numeric
# dtypes as the labelled set so the fitted model can score it.
unverified_planets_df.dtypes

Number_of_Planets             int64
Planet_Radius_Earth         float64
Equalibrium_Temp_K          float64
Orbit_Period_Days           float64
Star_Radius_Sun             float64
Star_Temp_K                 float64
Star_Mass_Sun               float64
Star_Surface_Gravity        float64
Planet_Star_Radius_Ratio    float64
Status                        int64
dtype: object

# Mockup/ Draft Model
# Split the Data into Training and Testing

### Create a supervised machine learning model: logistic regression for binary classification (1 = CONFIRMED, 0 = FALSE POSITIVE).

In [24]:
# Create our target: the binary Status label (1 = CONFIRMED, 0 = FALSE POSITIVE).
y = verified_planets_df.loc[:, "Status"]

# Create our features: every remaining column.
X = verified_planets_df.drop(columns=["Status"])

In [25]:
# Split into Train and Test Sets:
# hold out 33% of the labelled rows, stratified on the label so both splits
# keep the same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=1,
)

In [26]:
# Train the model:
# Binary logistic regression fitted directly on the feature columns.
# NOTE(review): no feature-scaling step is visible before this fit, and the
# features span very different magnitudes (e.g. Star_Temp_K in the thousands vs.
# Planet_Star_Radius_Ratio below 1); max_iter=300 presumably compensates for slow
# lbfgs convergence — consider adding a scaler and confirm the fit converges.
lm = LogisticRegression(solver='lbfgs',
                                max_iter=300,
                                random_state=1)
# The fitted estimator is the cell's last expression, so its repr is displayed.
lm.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [27]:
# Validate the model:
# predict a label for every held-out row. (classification_report, used for the
# report further down, is imported with the other dependencies at the top of
# the notebook instead of in the middle of the analysis.)
predict = lm.predict(X_test)

# Print all results:
# threshold=np.inf disables NumPy's "..." summarisation, so every prediction is shown.
with np.printoptions(threshold=np.inf):
    print(predict)

[0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1
 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0
 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1
 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1
 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 0
 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 0
 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1
 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0
 1 0 0 1 0 1 0 1 0 0 0 0 

In [28]:
# DF for output: predicted vs. true labels side by side, indexed like y_test.
t1 = y_test.to_frame(name="Actual")
t1.insert(0, "Prediction", predict)
t1.head(10)

Unnamed: 0,Prediction,Actual
4457,0,0
1668,1,1
1795,1,1
7259,0,0
2523,0,0
4958,0,0
4405,0,0
1117,0,0
1924,1,1
2601,1,0


In [29]:
# Per-class precision / recall / F1 on the held-out test set
# (0 = FALSE POSITIVE, 1 = CONFIRMED):
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1512
           1       0.77      0.83      0.80       881

    accuracy                           0.85      2393
   macro avg       0.83      0.84      0.84      2393
weighted avg       0.85      0.85      0.85      2393



# Predict results for unverified_planets_df:

In [30]:
# Display the 'unverified_planets_df' (Status is 2 = CANDIDATE for every row);
# these are the objects the trained model will now score:
unverified_planets_df.head()

Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Status
2,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,2
37,1,12.21,1103.0,4.959319,1.082,5712.0,0.976,4.359,0.103379,2
58,1,7.51,467.0,40.419504,0.781,5446.0,0.714,4.507,0.088069,2
62,2,19.45,734.0,7.240661,0.765,5005.0,0.85,4.595,0.232818,2
63,3,0.55,1272.0,3.435916,1.087,5779.0,0.941,4.339,0.004612,2


In [31]:
# Drop the 'Status' column:
# what remains are the same feature columns the model was trained on.
X_new = unverified_planets_df.drop(columns=["Status"])
X_new.head()

Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio
2,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046
37,1,12.21,1103.0,4.959319,1.082,5712.0,0.976,4.359,0.103379
58,1,7.51,467.0,40.419504,0.781,5446.0,0.714,4.507,0.088069
62,2,19.45,734.0,7.240661,0.765,5005.0,0.85,4.595,0.232818
63,3,0.55,1272.0,3.435916,1.087,5779.0,0.941,4.339,0.004612


In [32]:
# Predict results for 'X_new':
# one predicted label per candidate row, in the same row order as X_new.
unvpl = lm.predict(X_new)

In [33]:
# Build "planet_prediction_df": the candidate features plus the model output
# and the KOI name.
#
# `assign` returns a new frame, so X_new itself is left untouched. Adding the
# columns to X_new in place (and then aliasing it) would leave X_new with two
# extra columns, and re-running the prediction cell would then pass the wrong
# set of features to the model.
planet_prediction_df = X_new.assign(
    # Predictions are positional: row i of unvpl belongs to row i of X_new.
    Prediction=unvpl,
    # Names are matched on the shared row index inherited from nm_value_df.
    Kepler_OI_Name=names_df["Kepler_OI_Name"],
)
print(planet_prediction_df.shape)
planet_prediction_df.head(20)

(1951, 11)


Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Prediction,Kepler_OI_Name
2,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,0,K00753.01
37,1,12.21,1103.0,4.959319,1.082,5712.0,0.976,4.359,0.103379,0,K00760.01
58,1,7.51,467.0,40.419504,0.781,5446.0,0.714,4.507,0.088069,1,K00777.01
62,2,19.45,734.0,7.240661,0.765,5005.0,0.85,4.595,0.232818,0,K00780.02
63,3,0.55,1272.0,3.435916,1.087,5779.0,0.941,4.339,0.004612,1,K00115.03
84,1,7.73,812.0,10.181584,0.836,5988.0,0.885,4.541,0.084708,0,K00797.01
92,1,13.6,643.0,19.620347,0.905,5710.0,0.928,4.492,0.137595,0,K00802.01
112,1,7.61,511.0,34.843986,0.871,5509.0,0.84,4.482,0.080046,1,K00815.01
118,1,30.09,1155.0,4.6409,0.826,6463.0,0.786,4.5,0.333559,0,K00820.01
123,1,72.77,1947.0,1.028437,0.99,6228.0,0.959,4.428,0.673134,0,K00823.01


In [34]:
# Count values in 'Prediction' column: how many candidates the model labels
# 1 (CONFIRMED-like) versus 0 (FALSE-POSITIVE-like).
planet_prediction_df['Prediction'].value_counts()

1    1042
0     909
Name: Prediction, dtype: int64

In [35]:
# Distribution of the 'Number_of_Planets' value across the candidate rows:
planet_prediction_df['Number_of_Planets'].value_counts()

1    1431
2     351
3     101
4      52
5      12
7       3
6       1
Name: Number_of_Planets, dtype: int64

In [36]:
# Export the Dataframe as a new CSV file without the index.
# index=False drops the row labels; Kepler_OI_Name stays in the file as the identifier column.
planet_prediction_df.to_csv("../data/planet_prediction_df.csv", index=False)
# Show the column dtypes of the exported table.
planet_prediction_df.dtypes

Number_of_Planets             int64
Planet_Radius_Earth         float64
Equalibrium_Temp_K          float64
Orbit_Period_Days           float64
Star_Radius_Sun             float64
Star_Temp_K                 float64
Star_Mass_Sun               float64
Star_Surface_Gravity        float64
Planet_Star_Radius_Ratio    float64
Prediction                    int64
Kepler_OI_Name               object
dtype: object

In [40]:
# Upload the prediction CSV to the S3 bucket.
# boto3 and getpass are imported in the dependency cell at the top of the
# notebook rather than re-imported here.

# Creating Session With Boto3:
# the keys are typed in at run time through getpass, so they are never stored
# in the notebook source or echoed in its output.
session = boto3.Session(
    aws_access_key_id=getpass.getpass('Enter aws_access_key_id'),
    aws_secret_access_key=getpass.getpass('Enter aws_secret_access_key'),
)

# Creating S3 Resource From the Session:
s3 = session.resource('s3')

# Upload file (local path -> object key in the bucket):
# upload_file has no meaningful return value (the recorded run prints None),
# so a printed None only indicates that the call completed.
result = s3.Bucket('testbucketps42').upload_file('../data/planet_prediction_df.csv', 'planet_prediction_df.csv')

print(result)

Enter aws_access_key_id········
Enter aws_secret_access_key········
None


In [44]:
# Load the data from S3 bucket:
# Read the uploaded predictions back to confirm the round trip; the shape should
# match the frame that was exported. The object is fetched over plain HTTPS with
# no credentials, so it presumably has to be publicly readable — TODO confirm.
# NOTE: this rebinds `file_path` once more.
file_path = "https://testbucketps42.s3.us-east-2.amazonaws.com/planet_prediction_df.csv"
planet_prediction_df_s3 = pd.read_csv(file_path)
print(planet_prediction_df_s3.shape)
planet_prediction_df_s3.head()

(1951, 11)


Unnamed: 0,Number_of_Planets,Planet_Radius_Earth,Equalibrium_Temp_K,Orbit_Period_Days,Star_Radius_Sun,Star_Temp_K,Star_Mass_Sun,Star_Surface_Gravity,Planet_Star_Radius_Ratio,Prediction,Kepler_OI_Name
0,1,14.6,638.0,19.89914,0.868,5853.0,0.961,4.544,0.154046,0,K00753.01
1,1,12.21,1103.0,4.959319,1.082,5712.0,0.976,4.359,0.103379,0,K00760.01
2,1,7.51,467.0,40.419504,0.781,5446.0,0.714,4.507,0.088069,1,K00777.01
3,2,19.45,734.0,7.240661,0.765,5005.0,0.85,4.595,0.232818,0,K00780.02
4,3,0.55,1272.0,3.435916,1.087,5779.0,0.941,4.339,0.004612,1,K00115.03
