In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Star Type Classification

### Features

1. Temperature (K)
2. Relative Luminosity (L/Lo)
3. Relative Radius (R/Ro)
4. Absolute Magnitude, AM (Mv)
5. Color => General Color of Spectrum
6. Spectral_Class => O,B,A,F,G,K,M

### Target

* Red Dwarf - 0
* Brown Dwarf - 1
* White Dwarf - 2
* Main Sequence - 3
* Super Giants - 4
* Hyper Giants - 5

##### MATH:

* Lo = 3.828 x 10^26 Watts (Avg Luminosity of Sun)
* Ro = 6.9551 x 10^8 m (Avg Radius of Sun)

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn

In [None]:
from warnings import filterwarnings
# Silence every warning category for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation and convergence warnings raised by the libraries used below.
filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier

## Reading data and exploration

In [None]:
# Load the star dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/star-type-classification/Stars.csv')
# Bare last expression so the notebook renders the frame.
df

### Checking for null values

The dataset doesn't have any null values.

In [None]:
# Visual check for missing values using missingno's matrix plot.
mn.matrix(df)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame's columns.
df.describe()

### Data cleaning
* Using a logarithmic scale (`log1p`) for L and R, as their values are spread over a very wide range
* Cleaning the 'Color' column
* Assigning numerical values to the 'Color' and 'Spectral_Class' columns

In [None]:
# Add log(1 + x) versions of luminosity ('L') and radius ('R') as 'logL' and 'logR'.
for raw_col in ('L', 'R'):
    df[f'log{raw_col}'] = np.log1p(df[raw_col])

In [None]:
# Canonical colour label -> raw spellings of it that appear in the 'Color' column.
color_aliases = {
    'white': ['White', 'Whitish'],
    'blue_white': ['Blue White', 'Blue-White', 'Blue white', 'Blue-white'],
    'red': ['Red'],
    'yellow_white': ['Yellowish White', 'yellow-white', 'White-Yellow'],
    'yellow': ['yellowish', 'Yellowish'],
    'blue': ['Blue'],
    'orange': ['Orange'],
    'pale_yellow_orange': ['Pale yellow orange'],
    'orange_red': ['Orange-Red'],
}

# Invert to raw spelling -> canonical label and apply every substitution in one pass.
# Values that match no raw spelling are left untouched.
color_lookup = {
    raw: canonical
    for canonical, raw_spellings in color_aliases.items()
    for raw in raw_spellings
}
df['Color'] = df['Color'].replace(color_lookup)

In [None]:
# Integer code for each cleaned colour label.
# NOTE(review): 'pale_yellow_orange' shares code 5 with 'yellow', and 'orange_red' shares
# code 6 with 'orange' — confirm that merging these colours is intentional.
color_dict = {'red':0, 'blue':1, 'blue_white':2, 'white':3, 'yellow_white':4, 'yellow':5, 'orange':6,
              'pale_yellow_orange':5, 'orange_red':6}
color_codes = df['Color'].map(color_dict)

# Series.map turns any label that is missing from the dict into NaN. Fail loudly here
# instead of handing silent NaNs to the models; this also catches re-running the cell
# on a column that has already been encoded.
unmapped_colors = df.loc[color_codes.isna() & df['Color'].notna(), 'Color'].unique()
if len(unmapped_colors) > 0:
    raise ValueError(f"Unmapped 'Color' values: {list(unmapped_colors)}")

df['Color'] = color_codes

In [None]:
# Integer code for each spectral class letter (the numbering is an arbitrary label
# encoding; it does not follow the O,B,A,F,G,K,M sequence).
spec_dict = {'M':1, 'B': 2, 'O':3, 'A':4, 'F':5, 'K':6, 'G':7}
spectral_codes = df['Spectral_Class'].map(spec_dict)

# Series.map turns any class that is missing from the dict into NaN. Fail loudly here
# instead of handing silent NaNs to the models; this also catches re-running the cell
# on a column that has already been encoded.
unmapped_classes = df.loc[spectral_codes.isna() & df['Spectral_Class'].notna(), 'Spectral_Class'].unique()
if len(unmapped_classes) > 0:
    raise ValueError(f"Unmapped 'Spectral_Class' values: {list(unmapped_classes)}")

df['Spectral_Class'] = spectral_codes

# The raw 'L' and 'R' columns are superseded by 'logL' and 'logR'; rebind instead of
# mutating in place.
df = df.drop(columns=['L','R'])
df

## Data Visualization

In [None]:
# One line plot per feature (value against row index), laid out row by row on a 3x2 grid.
fig, ax = plt.subplots(3, 2, figsize=(10, 10))

# (column to plot, matplotlib format/colour string) in panel order.
panel_specs = [
    ('Temperature', 'r'),
    ('logL', 'g'),
    ('logR', 'y'),
    ('A_M', 'b'),
    ('Color', 'grey'),
    ('Spectral_Class', 'black'),
]

for panel, (column, line_fmt) in zip(ax.flat, panel_specs):
    panel.plot(df[column], line_fmt)
    panel.set_title(column)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the columns, coloured by the 'Type' label.
p = sns.pairplot(data=df,hue='Type')

In [None]:
# Heatmap of the column-to-column correlation matrix, with each coefficient annotated.
h = sns.heatmap(df.corr(), annot=True)

## Model Training

In [None]:
# Separate the 'Type' target from the feature columns, then hold out 33% of the rows
# for testing (fixed random_state so the split is repeatable).
Y = df['Type']
X = df.drop(columns='Type')
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
# Candidate classifiers to compare; all use default hyper-parameters except the
# raised iteration cap on logistic regression.
candidate_estimators = [
    LogisticRegression(max_iter=10000),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LGBMClassifier(),
]

# Key each estimator by its class name, e.g. 'SVC' -> SVC().
models = {type(estimator).__name__: estimator for estimator in candidate_estimators}

In [None]:
def fit_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and tabulate its held-out score.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels passed to ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model name with a single ``'Score'`` column holding the
        value returned by ``model.score`` (mean accuracy for scikit-learn
        classifiers), sorted in ascending order so the best model is last.
    """
    # Seed NumPy's global generator before any estimator is fitted.
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Fit first, then evaluate, one estimator at a time.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    scores = {name: _fit_then_score(estimator) for name, estimator in models.items()}

    # A one-row frame (row label 'Score') transposed gives one row per model.
    return pd.DataFrame(scores, index=['Score']).T.sort_values(by='Score')

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
model_scores = fit_score(models, X_train, X_test, Y_train, Y_test)
# Rows are sorted by ascending score, so the highest-scoring models are at the bottom.
model_scores

### Logistic Regression
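* From the scores above, Logistic Regression has the best score (97.5%) among the models that do not reach a perfect test score. The models scoring 1.0 are treated here as overfitting; note that a perfect score on a small test set is not by itself proof of overfitting, which is why cross-validation is used below.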

In [None]:
# Fit a fresh logistic regression on the training split, then predict the held-out rows.
model = LogisticRegression(max_iter=10000).fit(X_train, Y_train)
y_preds = model.predict(X_test)

### Cross-validation on Logistic Regression

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def cv_score(model, X, Y, cv=5):
    """Return the mean cross-validated accuracy of ``model`` on ``(X, Y)``.

    Parameters
    ----------
    model
        Estimator passed to ``cross_val_score``.
    X, Y
        Features and labels to cross-validate on.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    dict
        ``{'cross_validation_mean': <mean of the per-fold accuracies>}``.
    """
    # Seed NumPy's global generator before running the folds.
    np.random.seed(42)
    fold_accuracies = cross_val_score(model, X, Y, cv=cv, scoring='accuracy')
    return {'cross_validation_mean': fold_accuracies.mean()}

In [None]:
# 10-fold cross-validation of the logistic regression model, using the training split only.
cv_mean = cv_score(model, X_train, Y_train, cv=10)
cv_mean

## The Logistic Regression model gives a mean 10-fold cross-validation accuracy of about 98% on the training split