# Task for Today  

***

## Star Type Prediction

Given *data about stars*, let's try to predict the **type** of a given star.

We will use a logistic regression model to make our predictions, and we will evaluate its performance in two ways: on a held-out test set and with K-fold cross-validation.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

import warnings
# Suppress every Python warning for the rest of the run to keep cell output clean.
# NOTE(review): with no category given this also hides any warnings emitted by
# the model fits further down -- consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the star dataset into a DataFrame; the path is relative to the
# directory the notebook runs from.
data = pd.read_csv('../input/star-dataset/6 class csv.csv')

In [None]:
# Preview the raw DataFrame (shown as the cell's last expression).
data

In [None]:
# Print column names, dtypes and non-null counts to check for missing values
# and to see which columns are non-numeric.
data.info()

# Preprocessing


In [None]:
# List the distinct raw 'Star color' labels; the inconsistent spellings seen
# here are what the color_mapping in preprocess_inputs() normalizes.
data['Star color'].unique()

In [None]:
def onehot_encode(df, column, prefix):
    """Return a new DataFrame with `column` replaced by indicator columns.

    One indicator column is created per distinct value in `df[column]`,
    named `<prefix>_<value>`. The indicators are appended after the
    existing columns and the source column is then removed. The input
    frame is left unmodified.

    Args:
        df: Source DataFrame.
        column: Name of the categorical column to expand.
        prefix: Prefix used for the generated indicator column names.

    Returns:
        A new DataFrame without `column` and with the indicator columns
        appended on the right.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    # concat builds a fresh frame, so the caller's df is never touched.
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=column)

In [None]:
def preprocess_inputs(df):
    """Clean, encode, split and scale the star data for modelling.

    Steps, in order:
      1. Normalize inconsistent spellings in the 'Star color' column.
      2. One-hot encode 'Star color' and 'Spectral Class'.
      3. Separate the 'Star type' target from the features.
      4. Make a shuffled 70/30 train/test split (random_state=1).
      5. Standardize the features with a scaler fit on the training split
         only, then apply it to both splits.

    Args:
        df: Raw DataFrame containing 'Star color', 'Spectral Class' and
            'Star type' columns. It is not modified.

    Returns:
        Tuple (X_train, X_test, y_train, y_test). The X parts are scaled
        DataFrames that keep their original index and column labels; the
        y parts are the matching 'Star type' values.
    """
    # Variant spellings -> canonical colour label
    canonical_colors = {
        'white': 'White',
        'Blue ': 'Blue',
        'Blue white': 'Blue White',
        'Blue-white': 'Blue White',
        'Blue white ': 'Blue White',
        'Blue-White': 'Blue White',
        'yellow-white':'Yellowish White',
        'White-Yellow':'Yellowish White',
        'yellowish': 'Yellowish'
    }

    frame = df.copy()
    frame['Star color'] = frame['Star color'].replace(canonical_colors)

    # Expand both categorical columns into indicator columns
    for source_col, name_prefix in (('Star color', "Color"), ('Spectral Class', "Class")):
        frame = onehot_encode(frame, column=source_col, prefix=name_prefix)

    # Target vs. features
    target = frame['Star type']
    features = frame.drop('Star type', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.7, shuffle=True, random_state=1
    )

    # Fit on the training split only so test rows do not influence the scaling
    scaler = StandardScaler().fit(X_train)

    def _standardize(part):
        # transform() returns a bare array; restore the index/column labels
        return pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)

    return _standardize(X_train), _standardize(X_test), y_train, y_test

In [None]:
# Run the full preprocessing pipeline; the four splits become notebook-level
# variables used by both evaluation sections below.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the scaled, one-hot-encoded training features.
X_train

In [None]:
# Inspect the training targets ('Star type' values).
y_train

# Test Set Evaluation

In [None]:
# Fit a default-configured logistic regression on the training split and
# report its accuracy on the held-out test split.
model = LogisticRegression().fit(X_train, y_train)

print("Test Set Accuracy: {:.2f}%".format(100 * model.score(X_test, y_test)))

# K-Fold Evaluation

In [None]:
# 5-fold splitter; `kf` is reused by the evaluation cell that follows.
kf = KFold(n_splits=5)

print("Split Indices")

# kf.split yields (train, test) arrays of row positions into X_train;
# print them so the fold layout can be inspected before any model is fit.
for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    print(f"\nSplit {i + 1}:\n--------")
    print("\nTrain:\n" + str(train_idx))
    print("\nTest:\n" + str(test_idx) + "\n")

In [None]:
# Manual K-fold cross-validation over the training split: fit a fresh
# logistic regression on each fold's training rows and record its accuracy
# on that fold's held-out rows.
# NOTE(review): X_train was already standardized with a scaler fit on all of
# X_train, so each fold's held-out rows contributed to the scaling statistics.
results = []

for train_idx, test_idx in kf.split(X_train):
    # kf.split returns row positions, hence .iloc rather than .loc
    X_fold_train, y_fold_train = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
    X_fold_val, y_fold_val = X_train.iloc[test_idx, :], y_train.iloc[test_idx]

    # Use a dedicated name so the `model` fitted on the full training split
    # (the one scored on the test set) is not overwritten by the last fold.
    fold_model = LogisticRegression()
    fold_model.fit(X_fold_train, y_fold_train)
    results.append(fold_model.score(X_fold_val, y_fold_val))

print("K-Fold Accuracies:")
for i, result in enumerate(results):
    print("Model {}: {:.2f}%".format(i + 1, result * 100))

In [None]:
# Summarize the per-fold accuracies as a single mean score.
print("Average K-Fold Accuracy: {:.2f}%".format(np.mean(results) * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/75pwmIAzxKs