In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading in the dataset

## Link to the dataset used: [Star Type Classification](https://www.kaggle.com/brsdincer/star-type-classification)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf 

In [None]:
# Location of the star dataset inside the Kaggle input directory.
STARS_CSV_PATH = '/kaggle/input/star-type-classification/Stars.csv'

# Load the whole CSV into a single DataFrame.
df = pd.read_csv(STARS_CSV_PATH)

In [None]:
# Summary statistics of the dataset (pandas' default `describe()` output).
df.describe()

In [None]:
# Preview the first rows of the dataset.
df.head()

# Preprocessing and Data Analysis

In [None]:
# Checking for null values

# Column wise: number of missing values in each column
print(df.isna().sum())

# Row wise: number of missing values in each row.
# axis=1 sums across the columns of a row; axis=0 (the default) would only
# repeat the column-wise totals printed above.
print(df.isna().sum(axis=1))

In [None]:
# Helper functions to plot columns of the dataframe against each other

def linear_plot(x_axis, y_axis, x_label = "X axis", y_label = "Y axis", title = "Linear Plot"):
    """Draw a line plot of ``y_axis`` against ``x_axis`` and display it.

    Parameters
    ----------
    x_axis, y_axis
        Values for the horizontal and vertical axes; passed straight to
        ``plt.plot``.
    x_label, y_label : str
        Axis labels.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # No `%matplotlib inline` here: it is IPython-only syntax and a
    # notebook-wide setting, so it does not belong in a function body.
    plt.plot(x_axis, y_axis)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
    
    
def scatter_plot(x_axis, y_axis, x_label = "X axis", y_label = "Y axis", title = "Scatter Plot", color_provided = "#ff0000"):
    """Draw a scatter plot of ``y_axis`` against ``x_axis`` and display it.

    Parameters
    ----------
    x_axis, y_axis
        Values for the horizontal and vertical axes; passed straight to
        ``plt.scatter``.
    x_label, y_label : str
        Axis labels.
    title : str
        Figure title.
    color_provided : str
        Marker colour forwarded as ``color=`` to ``plt.scatter``
        (red, ``"#ff0000"``, by default).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # No `%matplotlib inline` here: it is IPython-only syntax and a
    # notebook-wide setting, so it does not belong in a function body.
    plt.scatter(x_axis, y_axis, color=color_provided)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
    
    

In [None]:
# Correlation Matrix

# Correlate the numeric columns only: text columns (e.g. Color,
# Spectral_Class) have no Pearson correlation, and pandas >= 2.0 raises on
# them instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()
corr.style.background_gradient(cmap='coolwarm')


In [None]:
# Keep only the A_M and Type columns for a focused look at their relationship.
temp_cols_list = ["A_M", "Type"]
temp_df_1 = df.loc[:, temp_cols_list]

In [None]:
# Plot Type against A_M using the scatter helper.
scatter_plot(
    x_axis=temp_df_1["A_M"],
    y_axis=temp_df_1["Type"],
    x_label="A_M",
    y_label="Type",
)

It would seem that we can just check the A_M values and more or less call it a day.

Or night.

Mostly night.

This was expected after seeing the -0.95 correlation score between the two.

# Logistic Regression

# Steps involved:
* Preprocessing
* Train-test split
* Training the model
* Performance Analysis

In [None]:
# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=42)

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Taking the relevant columns for training and testing

# Every column except the label column is used as a model input.
target_column = "Type"
train_df_columns = df.columns.drop(target_column)

print(train_df_columns)

In [None]:
# Feature matrix. `.copy()` makes it an independent frame, so the encoded
# columns assigned into `train_df` later do not trigger pandas'
# SettingWithCopyWarning (and can never write back into `df`).
train_df = df[train_df_columns].copy()
# Target labels (the "Type" column). Despite the name, this is the label
# vector for all rows, not a held-out test set.
test_df = df["Type"]

In [None]:
# Encoding text data into numeric values - as Logistic Regression needs numeric values

def encode_dict(list_of_vals) -> dict :
    """Assign consecutive integer codes to the distinct values of an iterable.

    Codes start at 0 and follow first-appearance order; repeated values keep
    the code they were given the first time they were seen.

    Parameters
    ----------
    list_of_vals
        Any iterable of hashable values.

    Returns
    -------
    dict
        Mapping ``value -> integer code``.
    """
    codes = {}
    for value in list_of_vals:
        # setdefault inserts only when the value is new; len(codes) is
        # evaluated before the insert, so it is the next unused code.
        codes.setdefault(value, len(codes))
    return codes

In [None]:
# Report the dtype of every feature column to spot the ones that need encoding.
for column_name, column_dtype in train_df[train_df_columns].dtypes.items():
    print(f"Data type of {column_name} is {column_dtype} \n")

In [None]:
# Map each distinct Color value to an integer code (codes follow
# first-appearance order, as assigned by encode_dict).
colour_dict = encode_dict(train_df['Color'])

In [None]:
# Replace every Color value by its integer code. All values are keys of
# colour_dict because the dictionary was built from this same column.
train_df['colour_encoded'] = train_df['Color'].map(colour_dict)

In [None]:
# Map each distinct Spectral_Class value to an integer code (codes follow
# first-appearance order, as assigned by encode_dict).
spectral_class_dict = encode_dict(train_df['Spectral_Class']) 

In [None]:
# Replace every Spectral_Class value by its integer code. All values are keys
# of spectral_class_dict because the dictionary was built from this same column.
train_df['spectral_class_encoded'] = train_df['Spectral_Class'].map(spectral_class_dict)

In [None]:
# Drop the original text columns now that their encoded versions exist.
# Re-binding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run after the columns are already gone.
train_df = train_df.drop(columns=["Color", "Spectral_Class"], errors="ignore")

# Fixed seed so the preview shows the same five rows on every run.
train_df.sample(5, random_state=42)

## Splitting the data and performing Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split

# a_* are feature frames and b_* the matching labels; 20% of the rows are held
# out for evaluation, and the fixed seed makes the split repeatable.
a_train, a_test, b_train, b_test = train_test_split(
    train_df,
    test_df,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Cross-validated logistic regression estimator used for the model fitting below.
from sklearn.linear_model import LogisticRegressionCV


In [None]:
# Getting the best-fitting logistic regression model

def getBestClassifier(cv_param = 5, num_class = 6, step_size = 500):
    """Fit LogisticRegressionCV with increasing ``max_iter`` and return the best fit.

    For each ``i`` in ``range(num_class)`` a model is fitted on the
    module-level ``a_train`` / ``b_train`` with
    ``max_iter = 1000 + i * step_size`` and scored on ``a_test`` / ``b_test``.
    The accuracy of every run is printed.

    Parameters
    ----------
    cv_param : int
        Value passed as ``cv`` to ``LogisticRegressionCV``.
    num_class : int
        Number of ``max_iter`` settings to try (despite the name, this is not
        the number of target classes).
    step_size : int
        Increment of ``max_iter`` between consecutive runs.

    Returns
    -------
    LogisticRegressionCV or None
        The fitted classifier with the highest accuracy on the test split;
        ties keep the earliest (fewest-iterations) model. ``None`` when
        ``num_class`` is not positive, because no model is fitted.

    Notes
    -----
    The winner is chosen by its score on the same test split that is
    reported, so that score is likely optimistic as an estimate for unseen data.
    """
    best_clf = None
    best_accr = float("-inf")
    for i in range(num_class):
        num_iter = 1000 + i*step_size
        clf = LogisticRegressionCV(cv = cv_param, random_state=42, max_iter= num_iter, verbose=False).fit(a_train,b_train)
        accr = clf.score(a_test,b_test)
        print(f"With {num_iter} iterations, we have an accuracy of {accr} \n")
        # Strict comparison: on equal accuracy keep the cheaper, earlier model.
        if accr > best_accr:
            best_clf, best_accr = clf, accr
    return best_clf

In [None]:
# Run the sweep with the default settings: 5-fold cross-validation and six
# max_iter values from 1000 to 3500 in steps of 500.
getBestClassifier()

We can see that we have an accuracy of about $0.96$.


This can be improved by using another classifier, more rigorous feature extraction, or hyperparameter tuning.