# Star Type Classification

Going to take the following approach:

1. Problem definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Model Evaluation
7. Experimentation / Improvements

# 1. Problem Definition

How can we use various Python-based machine learning models and the given parameters to predict the star type classification?

# 2. Data

Data from: https://www.kaggle.com/brsdincer/star-type-classification

# 3. Evaluation

As this is a classification problem, we will use classification metrics for evaluating the model.

# 4. Features

## inputs / features
    1. Temperature -- K
    2. L -- L/Lo
    3. R -- R/Ro
    4. AM -- Mv
    5. Color -- General Color of Spectrum
    6. Spectral_Class -- O,B,A,F,G,K,M / SMASS - https://en.wikipedia.org/wiki/Asteroid_spectral_types
    7. Type -- Red Dwarf, Brown Dwarf, White Dwarf, Main Sequence , Super Giants, Hyper Giants

## Output / label
    8. Type    
        Red Dwarf - 0
        Brown Dwarf - 1
        White Dwarf - 2
        Main Sequence - 3
        Super Giants - 4
        Hyper Giants - 5

### Math

Lo = 3.828 x 10^26 Watts
(Avg Luminosity of Sun)

Ro = 6.9551 x 10^8 m
(Avg Radius of Sun)

## Standard Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## Reading the Dataset

In [None]:
# Local
# df = pd.read_csv('Data/Stars.csv')

# Kaggle
df = pd.read_csv('/kaggle/input/star-type-classification/Stars.csv')
df.head()

## Data Exploration

In [None]:
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
plt.figure(figsize=(20,10))
plt.title('Value Count of Type')
sns.countplot(data=df, x='Type');

Label is very well balanced.

In [None]:
plt.figure(figsize=(20,10))
plt.title('Value Count of Spectral_Class')
sns.countplot(data=df, x='Spectral_Class');

In [None]:
plt.figure(figsize=(20,10))
plt.title('Value Count of Color')
plt.xticks(rotation=90)
sns.countplot(data=df, x='Color');

In [None]:
sns.pairplot(data=df, hue='Type');

In [None]:
plt.figure(figsize=(20,10))
plt.title('Temperature vs L vs Color')
sns.scatterplot(data=df, x='Temperature', y='L', hue='Color', s=150);

In [None]:
# One-hot encode the categorical columns, correlate every resulting column,
# and round to two decimals so the annotations stay readable.
corr_matrix = pd.get_dummies(df).corr().round(2)

plt.figure(figsize=(20,20))
plt.title('Heatmap of Pearson corrlation')
sns.heatmap(data=corr_matrix, annot=True);

# 5. Modelling

In [None]:
# Target is the star `Type`; every other column is a feature, with the
# categorical ones one-hot encoded (first level of each dropped).
y = df['Type']
X = pd.get_dummies(df.drop(columns='Type'), drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Imports

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

## Baseline Model Scores

In [None]:
from warnings import filterwarnings

In [None]:
filterwarnings('ignore')

In [None]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and tabulate its test-split score.

    Args:
        models: Mapping of display name -> estimator exposing
            ``fit(X, y)`` and ``score(X, y)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A DataFrame indexed by model name with a single ``Score`` column,
        sorted from lowest to highest score.
    """
    # Seed NumPy's global RNG before any estimator is fitted.
    np.random.seed(42)

    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)

    # One row per model, one 'Score' column.
    table = pd.DataFrame(results, index=['Score']).T
    return table.sort_values('Score')

In [None]:
# Candidate estimators keyed by their class name. All use library defaults
# except LogisticRegression, which gets a higher iteration cap.
models = {
    type(estimator).__name__: estimator
    for estimator in (
        LogisticRegression(max_iter=10000),
        KNeighborsClassifier(),
        SVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(),
        XGBRFClassifier(),
        LGBMClassifier(),
    )
}

In [None]:
baseline_model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

In [None]:
baseline_model_scores

In [None]:
# `baseline_model_scores` holds each estimator's `.score()` result, which for
# scikit-learn style classifiers is mean accuracy (not precision), so the
# chart is titled accordingly.
plt.figure(figsize=(20,10))
sns.barplot(data=baseline_model_scores.sort_values('Score').T)
plt.title('Baseline Model Accuracy Score')
plt.xticks(rotation=90);

Since a lot of the models are performing so well, we will continue with LogisticRegression, as it is a simpler model to work with.

# 6. Model Evaluation

## LogisticRegression

In [None]:
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve
from sklearn.model_selection import cross_val_score

In [None]:
# Re-fit the selected model with the same settings used in the baseline
# comparison (max_iter=10000), so the evaluation below describes the model
# that was actually compared and chosen.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

### Classification Report

In [None]:
print(classification_report(y_test,y_preds))

### Confusion Matrix

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `ConfusionMatrixDisplay.from_estimator` is the supported replacement.
# NOTE(review): the `sklearn.metrics` import cell above still names the removed
# `plot_confusion_matrix` / `plot_roc_curve` helpers and needs the same cleanup
# on scikit-learn >= 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

### Evaluation using cross-validation

In [None]:
def get_cv_score(model, X, y, cv=5):
    """Cross-validate `model` on accuracy, print the results, return the mean.

    Args:
        model: Estimator passed straight to ``cross_val_score``.
        X: Feature matrix.
        y: Labels.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5).

    Returns:
        A one-row DataFrame (index ``0``) with a single ``Accuracy`` column
        holding the mean of the per-fold accuracy scores.
    """
    # Seed NumPy's global RNG before cross-validating.
    np.random.seed(42)

    fold_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_score = fold_scores.mean()

    print(f'Cross Validaion accuracy Scores: {fold_scores}')
    print(f'Cross Validation accuracy Mean Score: {mean_score}')

    return pd.DataFrame({'Accuracy': mean_score}, index=[0])

In [None]:
cv_merics = get_cv_score(model, X_train, y_train, cv=10)

In [None]:
cv_merics

### Feature Importances

In [None]:
# For a multiclass fit, `coef_` has one row per class, so `coef_[1]` would only
# describe class 1. Average the absolute coefficients over all classes to get
# one importance value per feature for the whole model. The features were
# standardized before fitting, so coefficient magnitudes are comparable.
# The frame keeps a single column named 0, indexed by feature name.
feat_importances = pd.DataFrame(np.abs(model.coef_).mean(axis=0), index=X.columns)

In [None]:
feat_importances

In [None]:
plt.figure(figsize=(20,10))
plt.xticks(rotation=90)
plt.title('Feature Importances')
sns.barplot(data= feat_importances.sort_values(0).T);

With a Logistic Regression we have a score of 100%

with a Cross Validation accuracy Mean Score: 0.99375