## 1. Importing Libraries

### 1.1 Importing necessary libraries and adding necessary configurations

In [1]:
# For numeric calculations
import numpy as np
import pandas as pd

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# For balancing the dataset
from imblearn.over_sampling import SMOTE

# For machine learning models
from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings

In [2]:
# Configurations
%matplotlib inline
warnings.filterwarnings('ignore')

## 2. Import data

### 2.1 Importing train and test data

In [3]:
# Locations of the train and test CSV files
train_dataset_path = "./dataset/exoTrain.csv"
test_dataset_path = "./dataset/exoTest.csv"

# Read both files with the same settings: train first, then test
train_df, test_df = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (train_dataset_path, test_dataset_path)
)

### 2.2 Combining the datasets

In [7]:
# Combining two dataframes for further investigation.
# ignore_index=True renumbers the rows 0..n-1; without it the index labels of
# train_df and test_df are kept as-is, so the combined frame would carry
# duplicate labels and any label-based lookup or alignment becomes ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)
df.head(5)

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


In [9]:
# Dimensions of the combined frame as (rows, columns)
df.shape

(5657, 3198)

In [10]:
# Report the size of the combined frame in words
n_rows, n_cols = df.shape[0], df.shape[1]
print(f'Dataset consists of {n_rows} rows and {n_cols} columns')

Dataset consists of 5657 rows and 3198 columns


### 2.3 Exploring basic statistics

In [23]:
# Summary statistics, transposed so that each dataframe column becomes one row
stats = df.describe().transpose()

# Show only the first 10 of those rows
stats.head(10)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LABEL,5657.0,1.007424,0.085852,1.0,1.0,1.0,1.0,2.0
FLUX.1,5657.0,181.877969,20781.963875,-227856.26,-39.84,-0.47,51.71,1439240.5
FLUX.2,5657.0,189.98796,21054.656517,-315440.76,-37.05,-0.76,46.15,1453319.0
FLUX.3,5657.0,185.974637,21157.44849,-284001.76,-35.73,-0.82,45.47,1468429.0
FLUX.4,5657.0,214.941396,21459.24865,-234006.87,-33.18,-0.42,42.5,1495750.0
FLUX.5,5657.0,193.912781,22247.554224,-423195.62,-30.5,-0.78,41.56,1510937.0
FLUX.6,5657.0,205.670115,23204.624477,-597552.12,-31.63,-1.16,35.88,1508152.5
FLUX.7,5657.0,157.200847,23230.965053,-672404.56,-27.41,-1.28,34.27,1465743.0
FLUX.8,5657.0,175.581633,22083.070436,-579013.56,-26.73,-0.94,31.97,1416827.0
FLUX.9,5657.0,158.951252,20323.48912,-397388.24,-26.44,-0.86,30.55,1342888.5


### Let's change the labels from 1 -> 0 and 2 -> 1 to maintain consistency

In [None]:
# Re-encode LABEL: 1 -> 0 and 2 -> 1
categories = {1: 0, 2: 1}

# Inspect the distinct labels first so the cell can be re-run safely and so
# that a mismatch is reported with the offending values rather than a bare
# exception class.
labels_seen = set(df["LABEL"].unique())

if labels_seen <= set(categories):
    # Every label is a known key: vectorised remap of the whole column
    df["LABEL"] = df["LABEL"].map(categories)
elif labels_seen <= set(categories.values()):
    # The cell was already executed on this frame; remapping again would be wrong
    print("LABEL column is already encoded as 0/1, nothing to do")
else:
    # Same best-effort behaviour as before: report and leave the column untouched
    unexpected = sorted(labels_seen - set(categories))
    print(f"Keys do not match in LABEL column, unexpected values: {unexpected}")

In [None]:
# Summary statistics for every column, one row per column
df.describe().transpose()

In [None]:
# Visualization #1 :- Heatmap for any null values
null_mask = df.isnull()

fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(null_mask.T, ax=ax)
ax.set_title("Missing values")
plt.show()

# The frame has far more cells than a 4x4 inch figure has pixels, so isolated
# nulls could be invisible in the picture; back it up with an exact count.
print(f"Total missing values: {int(null_mask.to_numpy().sum())}")

From the heatmap we can see that there are no missing values in the data.

## Class distribution

In [None]:
# Count the rows of each class once and reuse the result below
label_counts = df["LABEL"].value_counts()

# Single quotes inside the f-string: reusing the outer double quotes inside the
# replacement fields is a SyntaxError on Python versions before 3.12.
print(f"From {len(df['LABEL'])} rows, there are {label_counts[0]} non-exoplanets and {label_counts[1]} confirmed exoplanets in the dataset")

#### The class distribution looks imbalanced. Let's explore it more through visualizations

Pie Chart Distribution

In [None]:
# Visualization #2 :- Plotting class distribution using pie chart
# value_counts() orders by frequency, so the hand-written slice names below
# would silently swap if the class balance ever changed. Sorting by label
# pins the order to (lower label, higher label) = (non-exoplanet, exoplanet).
class_counts = df["LABEL"].value_counts().sort_index()

fig, ax = plt.subplots()
ax.pie(class_counts, labels=["Non-Exoplanet", "Exoplanet"], autopct='%1.1f%%')
ax.set_title("Class distribution")
plt.show()

In [None]:
# Visualization #3 :- Number of Stars with Confirmed Exoplanets or Not
# Text styling for the title, kept apart from the title string itself
title_style = dict(fontsize=20, font='times new roman', weight='bold', color='black')
plt.title('CLASS DISTRIBUTIONS \n (0: Not Exoplanet || 1: Exoplanet)', **title_style)

# One bar per LABEL value, each annotated with its count
ax = sns.countplot(x="LABEL", data=df)
first_container = ax.containers[0]
ax.bar_label(first_container)
plt.show()

In [None]:
def plot_flux_row(ax, row_index, title):
    """Plot the flux readings of one row of ``df`` on the given axes.

    Only columns 1 .. df.shape[1]//2 - 1 are drawn, i.e. column 0 (LABEL) is
    skipped and roughly the first half of the remaining columns is used.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    row_index : positional (iloc) index of the row to plot.
    title : title of the panel.

    Returns
    -------
    The plotted values as a NumPy array.
    """
    fluxes = df.iloc[row_index, 1:(df.shape[1] // 2)].values
    ax.plot(fluxes, color="red")
    ax.set_xlabel('Time')
    ax.set_ylabel('Flux Level')
    ax.set_title(title)
    return fluxes


# Two panels side by side in one 10x5 inch figure
fig, (ax_non_exo, ax_exo) = plt.subplots(1, 2, figsize=(10, 5))

# Non-exoplanet star light flux over time
row_index = 150  # hand-picked row; NOTE(review): assumed to be a non-exoplanet row - confirm
fluxes_non_exo = plot_flux_row(ax_non_exo, row_index, "Non Exoplanet")

# Exoplanet star light flux over time
row_index = 21  # hand-picked row; NOTE(review): assumed to be an exoplanet row - confirm
fluxes_exo = plot_flux_row(ax_exo, row_index, "Exoplanet")

# Lay out the panels first, then add the figure title and make room for it
fig.tight_layout()
fig.suptitle('Flux variation over time')
fig.subplots_adjust(top=0.88)
plt.show()

In [None]:
# Split the frame into features and target. X and y were not defined by any
# earlier cell, so this cell failed with a NameError on a fresh kernel.
X = df.drop(columns=["LABEL"])
y = df["LABEL"]

# Over sampling the imbalanced class.
# sampling_strategy=1 asks for a minority/majority ratio of 1 after resampling;
# random_state fixes the synthetic samples so the cell is reproducible.
# NOTE(review): df holds the train and test files concatenated, so the
# synthetic samples are derived from both. If a held-out evaluation set is
# needed, split first and resample only the training part.
smote = SMOTE(sampling_strategy=1, random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [None]:
# Class shares after resampling, drawn as a pie chart with percentage labels
resampled_counts = y_res.value_counts()
resampled_counts.plot.pie(autopct="%.2f")