In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

# Read data

In [None]:
# Load the dataset from its CSV file and preview the first rows.

DATA_PATH = '../input/orbitclassification/classast - pha.csv'
row_data = pd.read_csv(DATA_PATH)
row_data.head()

# Exploring data

In [None]:
# Duplicate-row check: show the distinct values of the per-row "is a duplicate" flag.
# An output containing only False means that no row repeats an earlier one.

is_duplicate_row = row_data.duplicated()
is_duplicate_row.unique()

In [None]:
# Missing-value check: number of null cells in every column.

row_data.isna().sum()

In [None]:
# Class balance check: bar chart with the number of rows per orbit class.

fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x=row_data['class'], ax=ax)
plt.show()

In [None]:
# The classes are highly unbalanced. This must be taken into account.

In [None]:
# Numerical features exploration: for each feature draw its distribution
# (histogram + KDE) followed by a boxplot that makes outliers visible.

# Positional selection: the first 11 columns are treated as the numerical features.
numerical_features = row_data.iloc[:, :11].columns.to_list()

for column_name in numerical_features:
    # sns.distplot is deprecated; histplot with stat='density' and kde=True
    # is the documented replacement that draws the same kind of figure.
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.histplot(x=row_data[column_name], stat='density', kde=True, ax=ax)
    ax.set_xlabel(column_name)
    plt.show()

    fig, ax = plt.subplots(figsize=(15, 3))
    sns.boxplot(x=row_data[column_name], ax=ax)
    plt.show()

In [None]:
# The boxplots show strong outliers in a (AU), Q (AU) and P (yr).
# Let's explore them in more detail.

In [None]:
# Rows whose a (AU) value exceeds 15, and the orbit classes they belong to.
outliers_a_AU = row_data[row_data['a (AU)'] > 15]
outlier_classes = map(str, outliers_a_AU['class'].unique())
print('Orbit class with a (AU) > 15:', ', '.join(outlier_classes))
print(outliers_a_AU)

In [None]:
# All outliers belong to the same object, so the outlier rows are removed.
#
# The filter is the exact complement of the inspected selection (a (AU) > 15),
# so a row with a (AU) == 15 is kept instead of being dropped without ever
# having been inspected. Rows with a missing a (AU) are still excluded.
# .copy() gives row_data its own data, so the column assignments in later
# cells do not trigger SettingWithCopyWarning.

row_data = row_data.loc[row_data['a (AU)'] <= 15].copy()

In [None]:
# Encoding the target variable: orbit class label -> integer code.
# The codes are the ones the rest of the notebook relies on
# (confusion-matrix tick labels are listed in this same order).
CLASS_CODES = {
    'APO*': 1,
    'ATE*': 2,
    'AMO*': 3,
    'APO': 4,
    'IEO*': 5,
    'ATE': 6,
}

# Values that are not keys of CLASS_CODES pass through unchanged, which keeps
# the cell safe to re-run on an already encoded column.
encoded_class = row_data['class'].map(lambda label: CLASS_CODES.get(label, label))

# Fail loudly on labels without a code instead of leaving strings (or NaN)
# mixed into the target column.
unknown_labels = set(encoded_class.unique()) - set(CLASS_CODES.values())
if unknown_labels:
    raise ValueError(f"Orbit classes without an integer code: {sorted(map(str, unknown_labels))}")

# Explicit integer dtype: do not depend on pandas inferring it from an object column.
row_data['class'] = encoded_class.astype(int)

In [None]:
# Features correlation exploration

# Pearson correlation, drawn as a heatmap with the upper triangle
# (diagonal included) hidden so that every pair appears only once.
corr_pearson = row_data.corr(method='pearson')
upper_triangle = np.triu(np.ones_like(corr_pearson, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_pearson, mask=upper_triangle, annot=True, fmt='.2f', cmap='RdBu', ax=ax)
plt.show()

In [None]:
# Spearman (rank) correlation, shown as a lower-triangle heatmap.
corr_spearman = row_data.corr(method='spearman')

# Boolean mask: True marks the cells to hide (upper triangle and diagonal).
hidden_cells = np.zeros(corr_spearman.shape, dtype=bool)
hidden_cells[np.triu_indices_from(hidden_cells)] = True

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_spearman, annot=True, fmt='.2f', cmap='RdBu', mask=hidden_cells, ax=ax)
plt.show()

In [None]:
# The data show a strong correlation between a(AU) and Q(AU); a(AU) and P(yr);
# Q(AU) and P(yr); e and Q(AU).

# The correlation between a(AU) (Semi-major axis) and Q(AU) (Aphelion distance) is explained by
# the fact that the semi-major axis (a) is the average of the aphelion (Q) and 
# perihelion (q) distances. Also Q can be calculated from Q = a(1+e).

# The correlation between a(AU) (Semi-major axis) and P(yr) (Orbital period) is explained by 
# the fact that they are related by the relationship P = 2*Pi*sqrt(a^3/μ)

# The correlation between Q(AU) (Aphelion distance) and P(yr) (Orbital period) is explained 
# by the fact that Q(AU) (as I wrote above) is related to a(AU), which is related to P(yr)

# The correlation between e (Eccentricity) and Q(AU) (Aphelion distance) is explained by 
# the fact that they are related by the relationship Q = a(1+e).

# From all of the above, it follows that P(yr) (Orbital period) and Q(AU) (Aphelion
# distance) can be dropped from the analysis: both of these parameters can be
# calculated from a(AU) (Semi-major axis) and e (Eccentricity). The same can be
# said about q(AU) (Perihelion distance), since q = a(1-e), but it does not show
# a strong correlation with the other parameters, so it is kept.

In [None]:
# Removing columns P(yr) and Q(AU)
#
# The frame is rebound instead of mutated in place, and errors='ignore' lets the
# cell be re-run after the columns are already gone (an in-place drop raises
# KeyError on the second run).

row_data = row_data.drop(columns=['Q (AU)', 'P (yr)'], errors='ignore')
row_data.head()

# Preparing data

In [None]:
data_X = row_data.iloc[:, :9]  # data without target variable (first nine columns)
data_y = row_data['class']  # target variable

# Split data into train and test sets BEFORE any resampling.
# Oversampling the whole dataset first would leak information: synthetic points
# interpolated from rows that later land in the test set would be used for
# training, and the test set itself would contain synthetic samples.
# The split is stratified on the original (imbalanced) class distribution.

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    data_X, data_y, test_size=0.33, random_state=42, stratify=data_y)

# Balance the training data only.
# SMOTE needs more samples in every class than k_neighbors, so the original
# value of 4 is capped by the size of the smallest training class.

smallest_class_size = int(y_train_raw.value_counts().min())
smote_k_neighbors = min(4, smallest_class_size - 1)
if smote_k_neighbors < 1:
    raise ValueError(
        "SMOTE needs at least 2 training samples in every class; "
        f"the smallest class has {smallest_class_size}.")

# random_state makes the synthetic samples reproducible between runs.
oversample = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
X_train_balanced, y_train = oversample.fit_resample(X_train_raw, y_train_raw.to_numpy())

# The test target is kept as a plain array, like the training target.
y_test = y_test_raw.to_numpy()

# Scaling data: the scaler is fitted on the training data only and then
# applied unchanged to the untouched test data.

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test_raw)

# Trying out different models using kFold cross-validation

In [None]:
# Candidate classifiers, each paired with the short name used in the report.
models = [
    ('KNN', KNeighborsClassifier(n_jobs=-1)),
    ('LR', LogisticRegression(random_state=42, n_jobs=-1)),
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('Bag_DT', BaggingClassifier(DecisionTreeClassifier(random_state=42), random_state=42, n_jobs=-1)),
    ('RF', RandomForestClassifier(random_state=42, n_jobs=-1)),
    ('GBC', GradientBoostingClassifier(random_state=42)),
]

# 5-fold shuffled cross-validation on the training data, scored with weighted F1.
# NOTE(review): X_train was oversampled with SMOTE before this point, so the
# validation folds contain synthetic samples related to the training folds;
# the scores are probably optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models:
    fold_scores = cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=kf, n_jobs=-1)
    mean_f1 = fold_scores.mean()
    f1_std = fold_scores.std()
    print(f"{name} : Mean F1 {round(mean_f1, 3)} STD:({round(f1_std, 3)})")

In [None]:
# The best results were shown by DecisionTreeClassifier, BaggingClassifier,
# RandomForestClassifier and GradientBoostingClassifier.
# Let's check them on the test set

In [None]:
# Try DecisionTreeClassifier on test set

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_predicted = dt.predict(X_test)

# Creating a confusion matrix
# Class names are listed in the order of their integer codes (1..6).
# Reindexing to the full code list keeps the matrix 6x6 even when a class is
# never predicted; otherwise crosstab drops that column and the tick labels
# below would be attached to the wrong classes.

class_names = ['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE']
class_codes = list(range(1, len(class_names) + 1))

conf_matrix = pd.crosstab(y_test, y_predicted).reindex(
    index=class_codes, columns=class_codes, fill_value=0)

# fmt='d' prints the counts as integers instead of the default '.2g' notation;
# square takes a boolean, not the string 'True'.
sns.heatmap(conf_matrix, cmap='Greys', annot=True, fmt='d',
            linecolor='black', square=True,
            linewidths=0.2, xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel("Real class of orbit")
plt.xlabel("Predicted class of orbit")
plt.show()

In [None]:
# Try BaggingClassifier on test set

# random_state is set on the ensemble as well, matching the configuration that
# was cross-validated; without it the result changes from run to run.
bg = BaggingClassifier(DecisionTreeClassifier(random_state=42), random_state=42)
bg.fit(X_train, y_train)
y_predicted_bg = bg.predict(X_test)

# Creating a confusion matrix
# Class names are listed in the order of their integer codes (1..6).
# Reindexing to the full code list keeps the matrix 6x6 even when a class is
# never predicted; otherwise crosstab drops that column and the tick labels
# below would be attached to the wrong classes.

class_names = ['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE']
class_codes = list(range(1, len(class_names) + 1))

conf_matrix = pd.crosstab(y_test, y_predicted_bg).reindex(
    index=class_codes, columns=class_codes, fill_value=0)

# fmt='d' prints the counts as integers instead of the default '.2g' notation;
# square takes a boolean, not the string 'True'.
sns.heatmap(conf_matrix, cmap='Greys', annot=True, fmt='d',
            linecolor='black', square=True,
            linewidths=0.2, xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel("Real class of orbit")
plt.xlabel("Predicted class of orbit")
plt.show()

In [None]:
# Try RandomForestClassifier on test set

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_predicted_rf = rf.predict(X_test)

# Creating a confusion matrix
# Class names are listed in the order of their integer codes (1..6).
# Reindexing to the full code list keeps the matrix 6x6 even when a class is
# never predicted; otherwise crosstab drops that column and the tick labels
# below would be attached to the wrong classes.

class_names = ['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE']
class_codes = list(range(1, len(class_names) + 1))

conf_matrix = pd.crosstab(y_test, y_predicted_rf).reindex(
    index=class_codes, columns=class_codes, fill_value=0)

# fmt='d' prints the counts as integers instead of the default '.2g' notation;
# square takes a boolean, not the string 'True'.
sns.heatmap(conf_matrix, cmap='Greys', annot=True, fmt='d',
            linecolor='black', square=True,
            linewidths=0.2, xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel("Real class of orbit")
plt.xlabel("Predicted class of orbit")
plt.show()

In [None]:
# Try GradientBoostingClassifier on test set

# NOTE(review): these hyperparameters differ from the default
# GradientBoostingClassifier that was cross-validated earlier in the notebook,
# and no search that produced them is shown - confirm they are intended.
gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=50, random_state=42)
gb.fit(X_train, y_train)
y_predicted_gb = gb.predict(X_test)

# Creating a confusion matrix
# Class names are listed in the order of their integer codes (1..6).
# Reindexing to the full code list keeps the matrix 6x6 even when a class is
# never predicted; otherwise crosstab drops that column and the tick labels
# below would be attached to the wrong classes.

class_names = ['APO*', 'ATE*', 'AMO*', 'APO', 'IEO*', 'ATE']
class_codes = list(range(1, len(class_names) + 1))

conf_matrix = pd.crosstab(y_test, y_predicted_gb).reindex(
    index=class_codes, columns=class_codes, fill_value=0)

# fmt='d' prints the counts as integers instead of the default '.2g' notation;
# square takes a boolean, not the string 'True'.
sns.heatmap(conf_matrix, cmap='Greys', annot=True, fmt='d',
            linecolor='black', square=True,
            linewidths=0.2, xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel("Real class of orbit")
plt.xlabel("Predicted class of orbit")
plt.show()

In [None]:
# The best result on the test set was shown by RandomForestClassifier.