In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

sns.set()

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    for path in paths:
        print(path)

In [None]:
# Load the star dataset, then show its dimensions and first rows.
csv_path = '/kaggle/input/star-type-classification/Stars.csv'
df = pd.read_csv(csv_path)

print(df.shape)
df.head()

In [None]:
# Summary statistics for the columns of df.
df.describe()

# EDA

## Target

In [None]:
# Name of the label column; every later cell refers to it through this variable.
target = 'Type'

### Distribution

In [None]:
# Count the rows per target class (non-null 'L' entries) and draw them as bars.
class_counts = df.groupby(target)['L'].count()
class_counts.plot.bar()

plt.ylabel('Count')
plt.show()

Dataset is balanced

## Missing Values

In [None]:
# Number of missing entries per column.
missing_mask = df.isnull()
missing_mask.sum()

## Numerical Features

In [None]:
# Collect every non-object column except the target as a numerical feature.
num_features = []
for column in df.columns:
    if column == target:
        continue
    if df[column].dtype != 'O':
        num_features.append(column)

df[num_features].head()

### Distribution

In [None]:
def freedman_diaconis_bins(values, fallback = 10):
    """Return a histogram bin count from the Freedman-Diaconis rule.

    The bin width is 2 * IQR / n^(1/3) and the bin count is the value range
    divided by that width. When the IQR or the range is zero or not finite
    the rule is undefined (the width would be zero), so `fallback` bins are
    returned instead. The result is always at least one bin.
    """
    iqr = stats.iqr(values, interpolation = 'midpoint')
    spread = max(values) - min(values)
    if not (np.isfinite(iqr) and np.isfinite(spread)) or iqr <= 0 or spread <= 0:
        return fallback
    h = (2 * iqr) / (len(values) ** (1/3))
    return max(1, int(round(spread / h)))


# One histogram per numerical feature, binned with the rule above.
for feature in num_features:
    bins = freedman_diaconis_bins(df[feature])

    df[feature].hist(bins = bins)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()

### Outliers

In [None]:
# One box plot per numerical feature to expose outliers.
for column_name in num_features:
    df.boxplot(column = column_name)
    plt.ylabel('Value')
    plt.show()

In [None]:
# Upper cap for 'R': the median plus half a standard deviation.
r_values = df['R']
extreme = r_values.median() + 0.5 * r_values.std()

# Share of rows whose 'R' lies above the cap.
rows_above = len(df.loc[r_values > extreme])
print('values to replace: {}%'.format(rows_above * 100 / len(df)))

In [None]:
# Replace every 'R' value above the cap with the cap itself.
df.loc[df['R'] > extreme, 'R'] = extreme

### vs Target

In [None]:
# Scatter each numerical feature against the target class.
target_values = df[target]
for column_name in num_features:
    plt.scatter(df[column_name], target_values)
    plt.xlabel(column_name)
    plt.ylabel(target)
    plt.show()

### Inferences

1. If temperature is less than 5000 then star will definitely belong to Type 0 or Type 1
  
2. If L value = 0 then star will definitely belong to Type 0, Type 1 or Type 2
        
3.  - If R value = 0 then star will definitely belong to Type 0, Type 1 or Type 2
    - If R value > 250 then star will definitely belong to Type 5
  
4.  - If A_M value > 15 then star will definitely belong to Type 0
    - If 10 <= A_M value <= 15 then star will definitely belong to Type 1 or Type 2
    - If -5 < A_M value < 10 then star will definitely belong to Type 3

### Transformation

In [None]:
# Plot the log-transformed distribution of every numerical feature except the
# last one in num_features.
# The transform is applied to a temporary Series rather than written back to
# df: overwriting df here made the cell non-idempotent (a second run would take
# the log of already-logged values and produce NaN for entries <= 1).
for feature in num_features[:-1]:
    log_values = np.log(df[feature])

    # Freedman-Diaconis bin width; fall back to 10 bins when the width is zero
    # or undefined, and never ask for fewer than one bin.
    iqr = stats.iqr(log_values, interpolation = 'midpoint')
    h = (2 * iqr) / (len(log_values) ** (1/3))
    spread = max(log_values) - min(log_values)
    if np.isfinite(h) and np.isfinite(spread) and h > 0:
        bins = max(1, int(round(spread / h)))
    else:
        bins = 10

    log_values.hist(bins = bins)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()

## Categorical Features

In [None]:
# Everything that is neither a numerical feature nor the target is categorical.
excluded = list(num_features) + [target]
cat_features = [column for column in df.columns if column not in excluded]

df[cat_features].head()

### Distribution

In [None]:
# Show how many distinct values each categorical feature has, then list them.
for column_name in cat_features:
    categories = df[column_name].unique()
    print('{}: {} categories'.format(column_name, len(categories)))
    print(categories)

There are several repetitions in the Color column which we can fix

In [None]:
# Map each spelling variant of a colour onto one canonical label.
# Groups are applied in the listed order; no canonical label is itself a
# variant of a later group, so the order does not affect the result.
color_aliases = [
    (('Blue white', 'Blue-white', 'Blue-White'), 'Blue White'),
    (('white',), 'White'),
    (('yellowish', 'Yellowish'), 'Yellow'),
    (('Yellowish White',), 'White-Yellow'),
]

for variants, canonical in color_aliases:
    matches = df['Color'] == variants[0]
    for variant in variants[1:]:
        matches = matches | (df['Color'] == variant)
    idx = df.loc[matches].index
    df.loc[idx, 'Color'] = canonical

In [None]:
# Bar chart of the number of rows in each category of every categorical feature.
for column_name in cat_features:
    category_counts = df.groupby(column_name)[target].count()
    category_counts.plot.bar()
    plt.ylabel('count')
    plt.show()

### vs Target Variable

In [None]:
# Row counts for every (category, target class) pair, one chart per feature.
for column_name in cat_features:
    pair_counts = df.groupby([column_name, target])['L'].count()
    pair_counts.plot(kind = 'bar')
    plt.ylabel('count')
    plt.show()

Several Colors correspond to a single category which can be grouped together as they denote a fixed type of star

In [None]:
# Merge colours into one bucket per group; the bucket names refer to the
# single star type each group was observed with in the plots above.
color_buckets = {
    'cat_3_only': ['yellow-white', 'Yellow', 'Whitish', 'Orange-Red'],
    'cat_2_only': ['Pale yellow orange', 'White-Yellow'],
}

for bucket, members in color_buckets.items():
    matches = df['Color'] == members[0]
    for member in members[1:]:
        matches = matches | (df['Color'] == member)
    idx = df.loc[matches].index
    df.loc[idx, 'Color'] = bucket

In [None]:
# Re-list the categories to confirm the effect of the colour grouping.
for name in cat_features:
    distinct = df[name].unique()
    summary = '{}: {} categories'.format(name, len(distinct))
    print(summary)
    print(distinct)

In [None]:
# Same (category, target) count charts, now with the grouped colours.
for name in cat_features:
    grouped = df.groupby([name, target])
    grouped['L'].count().plot.bar()
    plt.ylabel('count')
    plt.show()

### Inferences

1. If a star has the color Pale yellow orange or White-Yellow it will definitely belong to Type 2
2. If a star has the color yellow-white, Yellow, Whitish or Orange-Red it will definitely belong to Type 3
3. If a star has the color Orange it will definitely belong to Type 5
4. If a star has the spectral type 'G' then it will definitely belong to Type 5

# Feature Engineering

In [None]:
# Start feature engineering from a fresh copy of the raw data, discarding the
# changes made to df during EDA.
raw_csv = '/kaggle/input/star-type-classification/Stars.csv'
df = pd.read_csv(raw_csv)

print(df.shape)
df.head()

## Numerical Features

In [None]:
# Cap 'R' at the median plus half a standard deviation.
radius = df['R']
extreme = radius.median() + 0.5 * radius.std()

above_cap = radius > extreme
df['R'] = np.where(above_cap, extreme, radius)

In [None]:
# Log-transform every numerical feature except the last one in num_features.
log_columns = num_features[:-1]
for column_name in log_columns:
    column_values = df[column_name]
    df[column_name] = np.log(column_values)

## Categorical Features

In [None]:
# Collapse spelling variants of the same colour into one label each.
# Each (variant, canonical) pair is applied in order; no canonical label is
# itself a variant, so the order does not affect the result.
spelling_fixes = [
    ('Blue white', 'Blue White'),
    ('Blue-white', 'Blue White'),
    ('Blue-White', 'Blue White'),
    ('white', 'White'),
    ('yellowish', 'Yellow'),
    ('Yellowish', 'Yellow'),
    ('Yellowish White', 'White-Yellow'),
]

for variant, canonical in spelling_fixes:
    idx = df.loc[df['Color'] == variant].index
    df.loc[idx, 'Color'] = canonical

In [None]:
# Group colours into the same two buckets that the EDA section used.
bucket_of = [
    ('yellow-white', 'cat_3_only'),
    ('Yellow', 'cat_3_only'),
    ('Whitish', 'cat_3_only'),
    ('Orange-Red', 'cat_3_only'),
    ('Pale yellow orange', 'cat_2_only'),
    ('White-Yellow', 'cat_2_only'),
]

for color, bucket in bucket_of:
    idx = df.loc[df['Color'] == color].index
    df.loc[idx, 'Color'] = bucket

In [None]:
# One-hot encode df; drop_first = True omits the first level of each encoded
# column.
dummy_df = pd.get_dummies(df, drop_first = True)
dummy_df.head()

# Feature Selection

In [None]:
# Separate the label column from the encoded feature matrix.
y = dummy_df[target]
X = dummy_df.drop(columns = [target])

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap.
numeric_part = X[num_features]
cor = numeric_part.corr()

sns.heatmap(data = cor, cmap = plt.cm.CMRmap_r, annot = True)
plt.show()

In [None]:
# Remove 'R' from the feature matrix.
# X is rebuilt from dummy_df instead of dropped in place so the cell can be
# re-run safely: the in-place drop raised KeyError on a second execution
# because 'R' was already gone.
X = dummy_df.drop(columns = [target, 'R'])

# Models

In [None]:
# Test accuracy of each model, keyed by a display name.
scores = dict()

In [None]:
# Hold out 20% of the rows for testing.
# stratify = y keeps the class proportions the same in both splits; with a
# small multi-class dataset an unstratified split can leave some classes
# under-represented in the test set and skew the per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 0, stratify = y
)

## RandomForest Classifier

In [None]:
# Fix the seed so the forest, and therefore the reported accuracy, is the same
# on every Restart & Run All. The value matches the seed used for the split.
model = RandomForestClassifier(random_state = 0)

model.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows, then print the confusion matrix and the
# per-class report.
y_pred = model.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(matrix)
print(report)

In [None]:
# Record the random forest's test accuracy.
rf_accuracy = model.score(X_test, y_test)
scores['RandomForest Classifier'] = rf_accuracy

## XGBoost

In [None]:
# Gradient-boosted trees on the unscaled features.
# NOTE(review): use_label_encoder is presumably deprecated in newer xgboost
# releases — confirm against the installed version.
model = XGBClassifier(use_label_encoder = False)

model.fit(X_train, y_train)

In [None]:
# Evaluate the boosted model on the held-out rows.
y_pred = model.predict(X_test)

for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

In [None]:
# Record the XGBoost test accuracy.
xgb_accuracy = model.score(X_test, y_test)
scores['XGBoost Classifier'] = xgb_accuracy

## Scaling

In [None]:
# Fit the scaler on the training split only; the test split is transformed
# with these training-derived parameters in the next cell.
scaler = MinMaxScaler()

scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

## Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the scaled features.
model = LogisticRegression()

model.fit(X = X_train_scaled, y = y_train)

In [None]:
# Evaluate logistic regression on the scaled test split.
y_pred = model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep = '\n')

In [None]:
# Record the logistic regression test accuracy.
logreg_accuracy = model.score(X_test_scaled, y_test)
scores['Logistic Regression'] = logreg_accuracy

## KNN

In [None]:
# Choose the number of neighbours by 5-fold cross-validation on the training
# split only. Scoring the candidates on the test split, as before, lets the
# test data pick the hyperparameter and makes the final test accuracy
# optimistically biased.
n_score = []
n_neighbors = [3, 4, 5, 6, 7, 8, 9, 10]

for n in n_neighbors:
    model = KNeighborsClassifier(n_neighbors = n)
    fold_accuracy = cross_val_score(model, X_train_scaled, y_train, cv = 5)
    n_score.append(fold_accuracy.mean())

# index() returns the first maximum, so ties resolve to the smallest k.
best_neighbors = n_neighbors[n_score.index(max(n_score))]
print('Best Neighbors = {}'.format(best_neighbors))

In [None]:
# Fit the final KNN model with the selected number of neighbours.
knn_params = {'n_neighbors': best_neighbors}
model = KNeighborsClassifier(**knn_params)

model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate KNN on the scaled test split.
y_pred = model.predict(X_test_scaled)

knn_matrix = confusion_matrix(y_test, y_pred)
print(knn_matrix)
knn_report = classification_report(y_test, y_pred)
print(knn_report)

In [None]:
# Record the KNN test accuracy.
knn_accuracy = model.score(X_test_scaled, y_test)
scores['KNN'] = knn_accuracy

## SVM

In [None]:
# Support vector classifier with a polynomial kernel on the scaled features.
svm_params = {'kernel': 'poly'}
model = SVC(**svm_params)

model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the SVM on the scaled test split.
y_pred = model.predict(X_test_scaled)

for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

In [None]:
# Record the SVM test accuracy.
svm_accuracy = model.score(X_test_scaled, y_test)
scores['SVM'] = svm_accuracy

## Final Scores

In [None]:
# Print every recorded accuracy as a percentage.
# The loop variable is deliberately not called `model`: reusing that name
# overwrote the fitted estimator with a string key, so any later use of
# `model` would fail.
for model_name, accuracy in scores.items():
    print('{}: {}% accuracy'.format(model_name, accuracy * 100))