# Space objects practice

In [6]:
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Locations of the CSV files this notebook reads and writes.
ORIGINAL_PATH, FILTRATED_PATH = 'data_original.csv', 'data_filtrated.csv'
INPUT_PATH, PREDICTED_PATH = 'data_input.csv', 'data_predicted.csv'

In [None]:
# Load the raw table from ORIGINAL_PATH.
df_orig = pd.read_csv(filepath_or_buffer=ORIGINAL_PATH)

In [None]:
# Work on a copy without the 'Row_id' and 'rerun_ID' columns; df_orig stays untouched.
df_filt = df_orig.drop(['Row_id', 'rerun_ID'], axis=1)

In [None]:
# Class name -> integer id, and the inverse lookup used to decode predictions.
class_labels = {'GALAXY': 0, 'STAR': 1, 'QSO': 2}
reversed_labels = {class_id: class_name for class_name, class_id in class_labels.items()}

In [None]:
# Encode the class names as the integer ids defined in class_labels.
df_filt['class'] = df_filt['class'].replace(to_replace=class_labels)

In [None]:
# Print the column names, dtypes and non-null counts of the filtered frame.
df_filt.info()

In [None]:
# One sub-frame per class name, selected by the encoded id in the 'class' column.
class_data = {
    class_label: df_filt[df_filt['class'] == class_id]
    for class_label, class_id in class_labels.items()
}

In [None]:
# Per class: fit a Local Outlier Factor model on that class's feature columns,
# report the rows it flags, and remove those rows from df_filt.
for class_label in class_labels:
    X = class_data[class_label].drop('class', axis=1)
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    y_pred = clf.fit_predict(X)
    # Rows labelled -1 by fit_predict are the ones treated as anomalies here.
    anomalies = X.loc[y_pred == -1]
    print(f"Anomalies in class {class_label}:", anomalies, sep='\n')
    # Filter by index label, so rows of the other classes are unaffected.
    df_filt = df_filt.loc[~df_filt.index.isin(anomalies.index)]

In [None]:
# Renumber the remaining rows from 0; drop=True discards the old index instead of
# keeping it as a column.
df_filt = df_filt.reset_index(drop=True)

In [None]:
def show_statistics(df):
    """Plot per-class histograms for every feature column of ``df``.

    One subplot is drawn for each column other than ``'class'`` (two subplots
    per figure row). Inside a subplot the rows of ``df`` are grouped by
    ``'class'`` and one histogram per group is overlaid on the same axes.

    Args:
        df: DataFrame holding a ``'class'`` column plus the columns to plot.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Pick the features by name; slicing with ``df.columns[1:]`` only works
    # when 'class' happens to be the first column.
    feature_columns = [column for column in df.columns if column != 'class']
    if not feature_columns:
        return

    grouped_data = df.groupby('class')

    num_rows = int(np.ceil(len(feature_columns) / 2))
    # squeeze=False keeps ``axs`` two-dimensional even with a single row,
    # so the [row, col] indexing below is always valid.
    fig, axs = plt.subplots(num_rows, 2, figsize=(10, 5 * num_rows), squeeze=False)

    for i, column in enumerate(feature_columns):
        ax = axs[i // 2, i % 2]

        for class_label, class_data in grouped_data:
            # stat='density' makes the bars match the 'Density' axis label.
            sns.histplot(data=class_data, x=column, label=class_label,
                         stat='density', ax=ax)

        ax.set_xlabel(column)
        ax.set_ylabel('Density')
        ax.legend()

    # With an odd number of features the last grid cell stays empty; hide it.
    for unused_ax in axs.flat[len(feature_columns):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

    plt.show()

In [None]:
# Plot the per-class feature histograms of the filtered data.
show_statistics(df_filt)

In [None]:
# Persist the filtered frame to FILTRATED_PATH without writing the index column.
df_filt.to_csv(path_or_buf=FILTRATED_PATH, index=False)

In [None]:
# Classifier with default settings.
# NOTE(review): no visible cell fits or uses this instance before `model` is
# rebound in the Testing section; `objective` builds its own classifiers.
model = LGBMClassifier()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both parts and uses the same arguments as the
# split in the Testing section, so (for an unchanged df_filt) both cells
# produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    df_filt.drop(columns=['class']), df_filt['class'],
    test_size=0.2, random_state=42, stratify=df_filt['class']
    )

In [None]:
# Candidate values per LightGBM parameter.
# NOTE(review): no visible cell reads this dict; the search below uses `space`.
hyperparams = dict(
    verbosity=[-1],
    num_class=[3],
    num_leaves=[62, 127],
    learning_rate=[0.01, 0.1],
    n_estimators=[50, 200],
    max_depth=[-1],
)

In [4]:
space = {
    'num_leaves': sample(scope.int(hp.quniform('num_leaves', 32, 128, 16))),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
    'n_estimators': sample(scope.int(hp.quniform('n_estimators', 50, 300, 10))),
    'max_depth': sample(scope.int(hp.quniform('n_estimators', 2, 20, 2)))
}

In [5]:
def objective(params):
    model = LGBMClassifier(verbosity=-1, num_class=3,**params)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return {'loss': -score, 'status': STATUS_OK}

In [None]:
trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

In [None]:
# Store the selected hyperparameters twice: as a pickle and as JSON.
with open('best_params.pkl', 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(best_params))
with open('best_params.json', 'w') as json_file:
    json_file.write(json.dumps(best_params))

# Testing

In [None]:
# Reload the hyperparameters from the JSON copy written earlier.
with open('best_params.json', 'r') as json_file:
    best_params = json.loads(json_file.read())

In [None]:
# Stratified 80/20 split of the filtered data (fixed seed) for the final model.
X_train, X_test, y_train, y_test = train_test_split(
    df_filt.drop('class', axis=1),
    df_filt['class'],
    test_size=0.2,
    random_state=42,
    stratify=df_filt['class'],
)

In [None]:
# Build the classifier from the stored hyperparameters and fit it on the training part.
model = LGBMClassifier(**dict(num_class=3, **best_params))
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# LGBMClassifier.predict returns class labels as a 1-D array, so argmax over
# axis 1 is only applied when y_pred holds per-class scores (2-D).
y_pred_labels = np.argmax(y_pred, axis=1) if np.ndim(y_pred) == 2 else y_pred
accuracy = accuracy_score(y_test, y_pred_labels)

In [None]:
# Report the accuracy computed on the test split.
print(f"Accuracy: {accuracy}")

In [None]:
# Serialise the fitted classifier to model.pkl.
with open('model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))

# Inputs

In [None]:
# Restore the classifier from model.pkl.
# NOTE: unpickling executes code embedded in the file; only load a model.pkl
# that this notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    model = pickle.loads(model_file.read())

In [None]:
# Load the rows to classify from INPUT_PATH.
df_input = pd.read_csv(filepath_or_buffer=INPUT_PATH)

In [None]:
# Predict on the loaded input frame (`df` is not defined anywhere in this notebook).
# The same columns that were removed before training are dropped here;
# errors='ignore' makes this a no-op for columns the input file does not have.
# NOTE(review): assumes the remaining columns match the training features — confirm
# against the input file's schema.
y_pred = model.predict(
    df_input.drop(columns=['Row_id', 'rerun_ID', 'class'], errors='ignore')
)

In [None]:
# Show the raw predictions for the input rows.
print(y_pred)

In [None]:
# LGBMClassifier.predict already yields class ids (1-D); argmax is only needed
# when y_pred holds per-class scores (2-D). A NumPy array has no `.apply`, so the
# ids are wrapped in a Series aligned with df_input and mapped back to class names.
predicted_ids = np.argmax(y_pred, axis=1) if np.ndim(y_pred) == 2 else np.asarray(y_pred)
df_input['class'] = pd.Series(predicted_ids, index=df_input.index).map(reversed_labels)

In [None]:
# Write the input rows together with their predicted class to PREDICTED_PATH.
df_input.to_csv(path_or_buf=PREDICTED_PATH, index=False)