In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
)
from sklearn.linear_model import LogisticRegression


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Descrição

Classificação de estrelas (anãs ou gigantes) de acordo com as seguintes características:

* Visual Apparent Magnitude of the Star (Vmag)
* Parallax of the star, used as a measure of the Distance Between the Star and the Earth (Plx)
* Standard Error of Plx (e_Plx)
* B-V color index (B-V)
    * A hot star has a B-V color index close to 0 or negative, while a cool star has a B-V color index close to 2.0. Other stars are somewhere in between.
* Spectral type (SpType)
* Absolute Magnitude of the Star (Amag)


## Classes
* Dwarf (0)
* Giant (1)

## Arquivos
* Treinamento: Star3642_balanced.csv
    * 3642 Estrelas
    * 1821 Dwarves
    * 1821 Giants
* Classificação: Star39552_balanced.csv
    * 39552 Estrelas
    * 19776 Dwarves
    * 19776 Giants

# Treinamento
## Importando dados


In [None]:
# Read the Star3642 balanced training set from the Kaggle input directory.
training_csv_path = "../input/star-categorization-giants-and-dwarfs/Star3642_balanced.csv"
training_data_csv = pd.read_csv(training_csv_path)


## Valores Nulos

In [None]:
# Per-column count of missing values in the training frame.
training_data_csv.isna().sum()

## Descrição das colunas

In [None]:
# Summary statistics of the training frame, using describe() with its default arguments.
training_data_csv.describe()

In [None]:
# Same summary with numeric columns excluded, so only the non-numeric columns are described.
training_data_csv.describe(exclude=[np.number])

In [None]:
# Data type of each column in the training frame.
training_data_csv.dtypes

## Tipos espectrais (SpType)

A letra inicial que descreve o tipo espectral representa um intervalo de temperatura. Usando somente essa letra podemos agrupar os 584 valores únicos em 8 categorias: as 7 classes da tabela abaixo e a categoria C, que não aparece na tabela e é tratada mais adiante.

| Class | Effective Temperature  | Chromaticity  |
| ------ |:--------------------------:|:---------------:|
| O       | ≥ 30,000 K                      | blue                 |
| B        | 10,000–30,000 K          | blue white      |
| A       | 7,500–10,000 K              | white              |
| F        | 6,000–7,500 K               |  yellow white |
| G       | 5,200–6,000 K               |  yellow            |
| K       | 3,700–5,200 K                | light orange   |
| M      | 2,400–3,700 K               | orange red     |


In [None]:
# Keep only the first character of each spectral type (the temperature class
# letter). The vectorized .str accessor replaces a Python-level loop that
# wrote one cell at a time through .loc and relied on the index being
# exactly 0..n-1.
training_data_csv['SpType'] = training_data_csv['SpType'].str[0]
print(training_data_csv['SpType'])

In [None]:
# Describe the non-numeric columns again and list the distinct spectral letters.
non_numeric_summary = training_data_csv.describe(exclude=[np.number])
print(non_numeric_summary)

sp_categories = training_data_csv['SpType'].unique()
print("Categorias:", sp_categories)

Excluindo as 2 linhas com a categoria C, que não está na tabela de tipos espectrais.

In [None]:
# Drop every row whose spectral letter is 'C', then show the letters that remain.
is_not_c = training_data_csv['SpType'].ne('C')
training_data_csv = training_data_csv.loc[is_not_c]
print("Categorias:", training_data_csv['SpType'].unique())

In [None]:
# Split features from labels by position: every column except the last is a
# feature, the last column is the class label.
# .copy() gives each result its own storage, so the column assignment made to
# training_data['SpType'] further down no longer raises SettingWithCopyWarning.
training_data = training_data_csv.iloc[:, :-1].copy()
training_target = training_data_csv.iloc[:, -1].copy()

print(training_data)
print('\n')
print(training_target)

## Visualização

Gerando uma visualização gráfica da distribuição de cada coluna.


In [None]:
# Kernel density estimate of the Vmag column.
# The Axes is kept and labelled instead of being passed to print(), which only
# emitted the object's text repr alongside the figure.
ax = training_data["Vmag"].plot.kde()
ax.set(title="Vmag Distribution (KDE)", xlabel="Vmag")
plt.show()


In [None]:
# Kernel density estimate of the Plx column.
# The Axes is kept and labelled instead of being passed to print(), which only
# emitted the object's text repr alongside the figure.
ax = training_data["Plx"].plot.kde()
ax.set(title="Plx Distribution (KDE)", xlabel="Plx")
plt.show()


In [None]:
# Kernel density estimate of the e_Plx column.
# The Axes is kept and labelled instead of being passed to print(), which only
# emitted the object's text repr alongside the figure.
ax = training_data["e_Plx"].plot.kde()
ax.set(title="e_Plx Distribution (KDE)", xlabel="e_Plx")
plt.show()


In [None]:
# Kernel density estimate of the B-V column.
# The Axes is kept and labelled instead of being passed to print(), which only
# emitted the object's text repr alongside the figure.
ax = training_data["B-V"].plot.kde()
ax.set(title="B-V Distribution (KDE)", xlabel="B-V")
plt.show()


In [None]:
# Kernel density estimate of the Amag column.
# The Axes is kept and labelled instead of being passed to print(), which only
# emitted the object's text repr alongside the figure.
ax = training_data["Amag"].plot.kde()
ax.set(title="Amag Distribution (KDE)", xlabel="Amag")
plt.show()

### Matriz de correlação

In [None]:
# Correlation matrix of the numeric feature columns, drawn as an annotated heatmap.
# SpType still holds letters at this point, so the frame is restricted to
# numeric dtypes first: pandas >= 2.0 raises on non-numeric columns in corr()
# instead of silently skipping them.
corr = training_data.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, cmap=sns.color_palette("light:#5A9", as_cmap=True), ax=ax)
ax.set_title("Data Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by TargetClass.
# Each entry is (x column, y column, x-axis label); entries fill the 3x3 grid
# row by row. One loop replaces nine copy-pasted call pairs, so a styling
# change only has to be made in one place.
scatter_panels = [
    ('Plx', 'B-V', 'Distance Between the Star and the Earth'),
    ('e_Plx', 'B-V', 'Standard error of Plx'),
    ('Vmag', 'Amag', 'Apparent Magnitude'),
    ('Vmag', 'B-V', 'Apparent Magnitude'),
    ('Vmag', 'Plx', 'Apparent Magnitude'),
    ('Vmag', 'e_Plx', 'Apparent Magnitude'),
    ('Amag', 'Plx', 'Absolute Magnitude'),
    ('Amag', 'e_Plx', 'Absolute Magnitude'),
    ('Amag', 'B-V', 'Absolute Magnitude'),
]

fig, axes = plt.subplots(3, 3, figsize=(28,16))

# axes.flat walks the grid in row-major order, matching the order of scatter_panels.
for ax, (x_col, y_col, x_label) in zip(axes.flat, scatter_panels):
    sns.scatterplot(data=training_data_csv, x=x_col, y=y_col, hue='TargetClass', ax=ax)
    ax.set_xlabel(x_label)

# Ending on plt.show() keeps the last set_xlabel() return value out of the cell output.
plt.show()

In [None]:
# Ordinal encoding of the spectral class letter, in the order of the spectral
# table shown earlier in the notebook (O first, M last).
cp_map = {
    "O": 0,
    "B": 1,
    "A": 2,
    "F": 3,
    "G": 4,
    "K": 5,
    "M": 6,
}
# Map from the letters kept in training_data_csv rather than from
# training_data itself: re-running this cell then yields the same integers
# instead of turning already-encoded values into NaN. training_data was sliced
# from training_data_csv, so both share an index and the assignment aligns
# row by row.
training_data['SpType'] = training_data_csv['SpType'].map(cp_map)
# A letter missing from cp_map becomes NaN and would only fail later, inside
# the classifiers; stop here with a clear message instead.
assert training_data['SpType'].notna().all(), "SpType contains letters missing from cp_map"
print(training_data['SpType'])

In [None]:
# Inverse of the SpType encoding, used to label the bars with letters again.
cp_map = {
    0: "O",
    1: "B",
    2: "A",
    3: "F",
    4: "G",
    5: "K",
    6: "M",
}
fig, ax = plt.subplots(figsize=(10,8))
# Pass the series as x= explicitly: in seaborn 0.12+ the first positional
# parameter of countplot is `data`, so a bare positional series is no longer
# read as the x variable.
sns.countplot(x=training_data['SpType'].map(cp_map), ax=ax)
ax.set_xlabel("SpType", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_title("SpTypes Distribution", fontsize=15)
plt.show()

## Importando base de testes

In [None]:
# Read the Star39552 balanced set, used here as the test data, from the Kaggle input directory.
test_csv_path = "../input/star-categorization-giants-and-dwarfs/Star39552_balanced.csv"
test_data_csv = pd.read_csv(test_csv_path)


In [None]:
# Total number of missing values across the whole test frame.
# The trailing call parentheses were missing, so the cell displayed the bound
# method object instead of the count.
test_data_csv.isnull().sum().sum()

In [None]:
# True if at least one cell of the test frame is missing.
test_data_csv.isna().to_numpy().any()

In [None]:
# Keep only the first character of each spectral type (the temperature class
# letter). The vectorized .str accessor replaces a Python-level loop that
# wrote one cell at a time through .loc and relied on the index being
# exactly 0..n-1.
test_data_csv['SpType'] = test_data_csv['SpType'].str[0]
print(test_data_csv['SpType'])

In [None]:
# Remove the rows whose spectral letter is C, W or N, then list the letters that are left.
excluded_letters = ['C', 'W', 'N']
keep_row = ~test_data_csv['SpType'].isin(excluded_letters)
test_data_csv = test_data_csv.loc[keep_row]
print("Categorias:", test_data_csv['SpType'].unique())

In [None]:
# Split features from labels by position: every column except the last is a
# feature, the last column is the class label.
# .copy() gives each result its own storage, so the column assignment made to
# test_data['SpType'] further down no longer raises SettingWithCopyWarning.
test_data = test_data_csv.iloc[:, :-1].copy()
test_target = test_data_csv.iloc[:, -1].copy()

print(test_data)
print('\n')
print(test_target)

In [None]:
# Ordinal encoding of the spectral class letter (O first, M last), identical
# to the one applied to the training features.
cp_map = {
    "O": 0,
    "B": 1,
    "A": 2,
    "F": 3,
    "G": 4,
    "K": 5,
    "M": 6,
}
# Map from the letters kept in test_data_csv rather than from test_data
# itself: re-running this cell then yields the same integers instead of
# turning already-encoded values into NaN. test_data was sliced from
# test_data_csv, so both share an index and the assignment aligns row by row.
test_data['SpType'] = test_data_csv['SpType'].map(cp_map)
# A letter missing from cp_map becomes NaN and would only fail later, inside
# the classifiers; stop here with a clear message instead.
assert test_data['SpType'].notna().all(), "SpType contains letters missing from cp_map"
print(test_data['SpType'])

In [None]:
# training_data.drop(['SpType'], inplace=True, axis=1)
# test_data.drop(['SpType'], inplace=True, axis=1)

## Classificação Bayesiana


In [None]:
# Fit Gaussian Naive Bayes on the training split, predict the test split and
# print both the predictions and their accuracy.
gnb = GaussianNB()
model = gnb.fit(X=training_data, y=training_target)

preds = gnb.predict(X=test_data)
print(preds)

nb_accuracy = accuracy_score(y_true=test_target, y_pred=preds)
print(nb_accuracy)

In [None]:
# Confusion matrix of the Naive Bayes predictions for classes 0 and 1.
matrix = confusion_matrix(test_target, preds, labels=[0,1])
# Draw the matrix computed above. plot_confusion_matrix was deprecated in
# scikit-learn 1.0 and removed in 1.2, and it re-ran the prediction only to
# rebuild the same counts while `matrix` went unused.
ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0,1]).plot(cmap='Blues')
plt.show()

In [None]:
# Text report of the classification metrics for the Naive Bayes predictions.
nb_report = classification_report(test_target, preds)
print(nb_report)

## Regressão Logística

In [None]:
# Train a logistic-regression classifier with default settings on the training
# split (fit returns the estimator itself), then predict the test split.
logreg = LogisticRegression().fit(X=training_data, y=training_target)
log_pred = logreg.predict(X=test_data)

In [None]:
# Confusion matrix of the logistic-regression predictions. labels=[0,1] is
# passed here as well so the matrix and the axis labels below always agree,
# as in the Naive Bayes cell.
matrix = confusion_matrix(test_target, log_pred, labels=[0,1])
# Draw the matrix computed above. plot_confusion_matrix was deprecated in
# scikit-learn 1.0 and removed in 1.2, and it re-ran the prediction only to
# rebuild the same counts while `matrix` went unused.
ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0,1]).plot(cmap='Blues')
plt.show()

In [None]:
# Text report of the classification metrics for the logistic-regression predictions.
logreg_report = classification_report(test_target, log_pred)
print(logreg_report)