In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, entry) for entry in entries)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

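# Star Type Classification

This notebook compares several machine-learning models on the star-type dataset; the fitted models can then be used to predict a star's type.

Features:

* Temperature -- temperature in kelvin (K)
* L -- luminosity relative to the Sun (L/Lo)
* R -- radius relative to the Sun (R/Ro)
* A_M -- absolute magnitude (Mv)
* Color -- general color of the spectrum
* Spectral_Class -- O, B, A, F, G, K, M / SMASS - https://en.wikipedia.org/wiki/Asteroid_spectral_types
* Type -- Red Dwarf, Brown Dwarf, White Dwarf, Main Sequence, Super Giants, Hyper Giants

TARGET: `Type`, encoded as an integer from 0 to 5:

* Red Dwarf - 0
* Brown Dwarf - 1
* White Dwarf - 2
* Main Sequence - 3
* Super Giants - 4
* Hyper Giants - 5

MATH:

* $L_o = 3.828 \times 10^{26}$ W (average luminosity of the Sun)
* $R_o = 6.9551 \times 10^{8}$ m (average radius of the Sun)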

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import pandas_profiling as pdp
import gc
gc.enable()
from scipy import stats
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# EDA

In [None]:
# Load the star dataset from the Kaggle input directory into a DataFrame.
data=pd.read_csv('../input/star-type-classification/Stars.csv')

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics, transposed so each described column becomes a row.
data.describe().T

In [None]:
# Display the target column (star type).
data.Type

In [None]:
# Temperature distribution colored by star type, with a KDE curve overlaid.
sns.histplot(data=data,hue='Type',palette="Set1", x="Temperature", kde=True)

In [None]:
# 2x2 grid of histograms (with KDE) colored by star type:
# L and R on the top row, A_M and Spectral_Class on the bottom row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18, 14))
(ax_lum, ax_rad), (ax_mag, ax_spec) = axes

# Options shared by all four panels.
hist_style = dict(data=data, hue='Type', palette="Set1", kde=True)

sns.histplot(x="L", ax=ax_lum, **hist_style)
sns.histplot(x="R", ax=ax_rad, **hist_style)
sns.histplot(x="A_M", ax=ax_mag, **hist_style)
sns.histplot(x="Spectral_Class", ax=ax_spec, **hist_style)

In [None]:
# Distribution of the Color column by star type on a wide figure.
fig, axes = plt.subplots(figsize=(20, 10))
sns.histplot(data=data,hue='Type',palette="Set1", x="Color", kde=True)
# Rotate the tick labels so the color names do not overlap.
plt.xticks(rotation=45)

In [None]:
# Build a minimal pandas-profiling report of the star dataset.
# The title names this dataset rather than an unrelated one.
report = pdp.ProfileReport(data, title="Star Type Classification Report", minimal=True)
# Last expression of the cell: renders the report inline.
report

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(16,8))
# Color and Spectral_Class are only label-encoded in a later cell, so they are
# presumably still text here. Restrict the correlation to numeric columns
# explicitly: recent pandas versions raise on non-numeric columns instead of
# silently dropping them.
numeric_features = data.select_dtypes(include="number")
sns.heatmap(numeric_features.corr(), annot=True)

# Preprocessing

In [None]:
# Boolean mask marking rows that duplicate an earlier row.
duplicado = data.duplicated()
# True if at least one duplicated row exists.
print(duplicado.any())
# Show the duplicated rows themselves (empty frame when there are none).
print(data[duplicado])

In [None]:
# Box plot of Temperature to inspect its spread and extreme values.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["Temperature"],color='red')

In [None]:
# Box plot of R (relative radius) to inspect its spread and extreme values.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["R"],color='red')

In [None]:
# Box plot of L (relative luminosity) to inspect its spread and extreme values.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["L"],color='red')

In [None]:
# Box plot of A_M (absolute magnitude) to inspect its spread and extreme values.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["A_M"],color='red')

In [None]:
# Trim each numeric feature to its central 95%: rows above the 97.5th or below
# the 2.5th percentile are dropped in place. Features are processed one after
# another, so each feature's percentiles are computed on the rows that survived
# the previous feature's trimming.
# NOTE: this mutates `data`, so re-running the cell trims further.
for feature in ('Temperature', 'R', 'L', 'A_M'):
    upper_bound = data[feature].quantile(0.975)
    lower_bound = data[feature].quantile(0.025)
    out_of_range = (data[feature] > upper_bound) | (data[feature] < lower_bound)
    data.drop(data[out_of_range].index, inplace=True)

In [None]:
# Box plot of Temperature again, after the quantile-based trimming.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["Temperature"],color='red')

In [None]:
# Box plot of R again, after the quantile-based trimming.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["R"],color='red')

In [None]:
# Box plot of L again, after the quantile-based trimming.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["L"],color='red')

In [None]:
# Box plot of A_M again, after the quantile-based trimming.
plt.figure(figsize=(8,4))
sns.boxplot(data=data,x=data["A_M"],color='red')

In [None]:
# Integer-encode the two categorical columns, each with its own encoder.
selected_col = ['Color','Spectral_Class']

# One LabelEncoder per column, kept in a dict so every column's
# label <-> integer mapping stays available (e.g. for inverse_transform).
# The earlier fit on the flattened values of both columns had no effect:
# fit_transform refits the encoder on each column, so it was overwritten.
encoders = {}
for col in selected_col:
    encoders[col] = preprocessing.LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# `le` stays bound to the encoder of the last column, matching the state the
# original cell left behind.
le = encoders[selected_col[-1]]

In [None]:
# Correlation heatmap again, now that Color and Spectral_Class are integer-encoded.
plt.figure(figsize=(16,8))
sns.heatmap(data.corr(),annot=True)

In [None]:
# Pairwise relationships between all columns, colored by star type,
# with histograms on the diagonal.
sns.pairplot(data, hue='Type', diag_kind='hist',palette="Set1")
plt.show()

In [None]:
# Candidate feature scalers compared in the following cells.
robust_sc = preprocessing.RobustScaler()
standard_sc = preprocessing.StandardScaler() 
minmax_sc = preprocessing.MinMaxScaler() 

In [None]:
# Feature matrix: every column except the target.
X = data.drop('Type',axis=1)
# Target vector: the star type label.
y = data["Type"]

In [None]:
# Compare the three scalers by the weighted F1 of a decision tree.
# Each scaler is fitted on a training part only and the tree is scored on
# held-out rows; scoring on the rows used for fitting reports an optimistic
# score that says little about the scaler.
# NOTE: decision trees split on per-feature thresholds, so monotonic rescaling
# is not expected to change these scores much.
resultado = []

# Local hold-out split for this comparison only; the names are distinct from
# the X_train/X_test split created later in the notebook.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

for x in [robust_sc,standard_sc,minmax_sc]:
    # `scaler` is left bound to the last fitted scaler, as in the original
    # loop, in case later cells read it.
    scaler = x.fit(X_fit)
    tree = DecisionTreeClassifier(max_depth=10,random_state=42)
    tree.fit(scaler.transform(X_fit), y_fit)
    y_pred = tree.predict(scaler.transform(X_eval))
    f1sc = f1_score(y_eval, y_pred, average='weighted')
    resultado.append(f1sc)
    print("El escalado Utilizado--->",x)
    print("f1 segun el tipo de estrategia:",f1sc)
    print("----------------------------------------")

In [None]:
# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both splits so no test-set statistics leak
# into training.
standard_sc.fit(X_train)
X_train = standard_sc.transform(X_train)
X_test = standard_sc.transform(X_test)

# Classifiers

In [None]:
# Base XGBoost classifier and the hyper-parameter grid to search.
gbm = XGBClassifier(verbosity=1)
# Only n_estimators and learning_rate vary (3 x 2 = 6 candidates); the other
# entries are single-valued and pin training to the GPU implementation.
# NOTE(review): these GPU settings require a GPU-enabled runtime — confirm the
# accelerator is switched on before running the search.
params_xgb = {
        "n_estimators":[150,250,500],
        "learning_rate":[0.01,0.1],
        'gpu_id': [0],
        "predictor":["gpu_predictor"],
        'tree_method': ['gpu_hist'],
        "updater":["grow_gpu_hist"],
        "sampling_method":["gradient_based"]
}

In [None]:
# Exhaustive 5-fold grid search over params_xgb, fitted on the training split.
# NOTE(review): n_jobs=-1 runs fits in parallel while the grid requests GPU
# training — confirm this combination is intended.
model_xgb = GridSearchCV(gbm,param_grid=params_xgb, cv=5,n_jobs=-1)
model_xgb.fit(X_train,y_train)

In [None]:
# Report the winning XGBoost parameter combination and its mean CV score.
best_xgb_params = model_xgb.best_params_
best_xgb_score = model_xgb.best_score_
print(f"Best params: {best_xgb_params}")
print(f"Best Score: {best_xgb_score}\n")

In [None]:
# Full cross-validation results of the XGBoost search, best-ranked candidates first.
scores = pd.DataFrame(model_xgb.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the tuned XGBoost search object on both splits.
y_train_pred_xgb = model_xgb.predict(X_train)
y_test_pred_xgb = model_xgb.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the XGBoost model on the test split.
print(classification_report(y_test,y_test_pred_xgb))

In [None]:
# Confusion matrix of the XGBoost model on the test split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favor of ConfusionMatrixDisplay.from_estimator — confirm the
# installed version still provides it.
plot_confusion_matrix(model_xgb, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier

In [None]:
# Baseline classifiers with default hyper-parameters; seeds are fixed where
# the estimator takes one.
model_nb = GaussianNB()
model_rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# NOTE: this rebinds model_xgb, which previously held the fitted GridSearchCV
# object; cells above that use the tuned model must run before this one.
model_xgb = XGBClassifier(random_state=42)
model_lgbm = LGBMClassifier(random_state=42, n_jobs=-1)
model_knn = KNeighborsClassifier(n_jobs=-1)

In [None]:
# (short name, estimator) pairs, in the order they are evaluated.
models = [
    ('NB', model_nb),
    ('RF', model_rf),
    ('XGB', model_xgb),
    ('LGBM', model_lgbm),
    ('KNN', model_knn),
]

In [None]:
# 5-fold stratified cross-validation of every baseline model on the training
# split, scored with weighted F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []   # per-model arrays of fold scores
names = []     # model names, aligned with `results`

for name, model in models:
    scores = cross_val_score(model,X_train, y_train, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The label names the metric actually computed (weighted F1, not ROC).
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

In [None]:
# Held-out evaluation of every baseline model: fit on the training split and
# score weighted F1 on the untouched test split. Cross-validating inside the
# test split would train each model on test rows and never use the training
# data, so it does not measure generalization.
results = []   # per-model test scores
names = []     # model names, aligned with `results`

for name, model in models:
    model.fit(X_train, y_train)
    test_f1 = f1_score(y_test, model.predict(X_test), average='weighted')
    print(f"{name} : Test F1 (weighted) {test_f1}")
    results.append(test_f1)
    names.append(name)

In [None]:
# Seeded multi-layer perceptron and the hyper-parameter grid searched for it.
clf = MLPClassifier(random_state=42)
# Each entry maps an MLPClassifier parameter name to its candidate values.
params_MLP = dict(
    hidden_layer_sizes=[64, 128, 256],
    activation=["identity", "logistic", "tanh", "relu"],
    solver=["lbfgs", "sgd", "adam"],
    learning_rate=["constant", "invscaling", "adaptive"],
    max_iter=[100, 200, 300],
    warm_start=[True],
)

In [None]:
# Exhaustive 3-fold grid search over params_MLP on the training split
# (324 parameter combinations x 3 folds), using all CPU cores.
model_MLP = GridSearchCV(clf,param_grid=params_MLP, cv=3,n_jobs=-1)
model_MLP.fit(X_train,y_train)

In [None]:
# Report the winning MLP parameter combination and its mean CV score.
best_mlp_params = model_MLP.best_params_
best_mlp_score = model_MLP.best_score_
print(f"Best params: {best_mlp_params}")
print(f"Best Score: {best_mlp_score}\n")

In [None]:
# Full cross-validation results of the MLP search, best-ranked candidates first.
scores = pd.DataFrame(model_MLP.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the tuned MLP search object on both splits.
y_train_pred_MLP = model_MLP.predict(X_train)
y_test_pred_MLP = model_MLP.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the MLP model on the test split
# (uses the MLP predictions, not the XGBoost ones).
print(classification_report(y_test,y_test_pred_MLP))

In [None]:
# Confusion matrix of the MLP model on the test split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favor of ConfusionMatrixDisplay.from_estimator — confirm the
# installed version still provides it.
plot_confusion_matrix(model_MLP, X_test, y_test)