In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Import the Libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, classification_report, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost import XGBClassifier
import statsmodels

%matplotlib inline

# Import Data

In [None]:
# Load the star dataset from the Kaggle input directory into a DataFrame.
STARS_CSV = '../input/star-type-classification/Stars.csv'
df = pd.read_csv(STARS_CSV)

# Explore the data

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the raw frame.
df.describe()

In [None]:
# Pairwise correlations between the star type and the listed measurement columns,
# drawn as an annotated heatmap.
corr_columns = ['Type', 'Temperature', 'L', 'R', 'A_M']
corr_matrix = df[corr_columns].corr()
g = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

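L, R and Temperature are highly correlated with the type of the star. The correlation of A_M with the type of the star is very low; note that if this coefficient is strongly negative (close to -1) rather than close to 0, it indicates a strong inverse relationship, not a weak one.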

In [None]:
# Target labels: the star type column.
y = df['Type']

# Hold out 20% of the rows as a test set (fixed seed so the split is repeatable).
# Both resulting frames still contain the 'Type' column at this point.
dftrain, dftest, ytrain, ytest = train_test_split(
    df,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
def comparing(dftrain, variable1, variable2):
    """Print the mean of ``variable2`` per ``variable1`` value and plot ``variable1`` per ``variable2``.

    Parameters
    ----------
    dftrain : pandas.DataFrame
        Frame containing both columns.
    variable1 : str
        Column that is grouped on for the printed table and plotted on the x axis.
    variable2 : str
        Column that is averaged in the printed table and used to split the plot
        into one facet per distinct value.

    Returns
    -------
    None
        The table is printed and the facet grid is drawn as a side effect.
    """
    # Only rows with a known value of `variable2` take part in the printed table.
    has_value = dftrain[variable2].notnull()
    summary = (
        dftrain.loc[has_value, [variable1, variable2]]
        .groupby([variable1], as_index=False)
        .mean()
        .sort_values(by=variable2, ascending=False)
    )
    print(summary)

    # sns.distplot is deprecated; a density-scaled histplot with a KDE overlay is its
    # documented replacement and matches the histplot calls used elsewhere in this notebook.
    sns.FacetGrid(dftrain, col=variable2).map(
        sns.histplot, variable1, kde=True, stat='density'
    )

In [None]:
def counting_values(dftrain, variable1, variable2):
    """Return the mean of ``variable2`` for each value of ``variable1``, largest mean first.

    Rows where ``variable2`` is missing are excluded before averaging. The result is a
    DataFrame with the two columns ``variable1`` and ``variable2``.
    """
    pair = dftrain[[variable1, variable2]]
    observed = pair[dftrain[variable2].isnull() == False]
    group_means = observed.groupby([variable1], as_index=False).mean()
    return group_means.sort_values(by=variable2, ascending=False)

### Temperature vs Type

In [None]:
# comparing() draws through sns.FacetGrid, which creates its own figure, so opening a
# figure with plt.figure(...) beforehand would only add an empty figure to the output.
comparing(dftrain,'Temperature','Type')

In [None]:
# Histogram of Temperature over the training split on a wide single-axes figure.
fig, axs = plt.subplots(figsize=(30, 5))
title_style = {'fontsize': 24, 'fontweight': 'bold'}
hist_ax = sns.histplot(data=dftrain, x='Temperature', ax=axs)
hist_ax.set_title('Temperature Distribution', fontdict=title_style);
sns.despine(fig=fig)

### Luminosity Ratio (L) vs Type

In [None]:
# Row count, minimum and maximum of L for each star type.
# String aggregation names are used because recent pandas emits a FutureWarning when the
# Python builtins min/max are passed as aggregation callables; the keyword names keep the
# original column labels ('len', 'min', 'max').
LRange = dftrain.groupby(['Type'])['L'].agg(len='size', min='min', max='max')
LRange

There is no clear pattern or well-separated range of L across the star types.

In [None]:
# comparing() draws through sns.FacetGrid, which creates its own figure, so opening a
# figure with plt.figure(...) beforehand would only add an empty figure to the output.
comparing(dftrain, 'L', 'Type')

In [None]:
# Histogram of L over the training split on a wide single-axes figure.
fig, axs = plt.subplots(figsize=(30, 5))
title_style = {'fontsize': 24, 'fontweight': 'bold'}
hist_ax = sns.histplot(data=dftrain, x='L', ax=axs)
hist_ax.set_title('L Distribution', fontdict=title_style);
sns.despine(fig=fig)

### R (Solar Radius) vs Type

In [None]:
# Print the mean Type per R value and draw the distribution of R in one facet per Type.
comparing(dftrain, 'R', 'Type')

In [None]:
# Histogram of R over the training split on a wide single-axes figure.
fig, axs = plt.subplots(figsize=(30, 5))
title_style = {'fontsize': 24, 'fontweight': 'bold'}
hist_ax = sns.histplot(data=dftrain, x='R', ax=axs)
hist_ax.set_title('R Distribution', fontdict=title_style);
sns.despine(fig=fig)

### A_M (Absolute Magnitude) vs Type

In [None]:
# Print the mean Type per A_M value and draw the distribution of A_M in one facet per Type.
comparing(dftrain, 'A_M', 'Type')

In [None]:
# Histogram of A_M over the training split on a wide single-axes figure.
fig, axs = plt.subplots(figsize=(30, 5))
title_style = {'fontsize': 24, 'fontweight': 'bold'}
hist_ax = sns.histplot(data=dftrain, x='A_M', ax=axs)
hist_ax.set_title('A_M Distribution', fontdict=title_style);
sns.despine(fig=fig)

### Color vs Type

In [None]:
# Frequency of each colour label in the training split, before any label clean-up.
dftrain.Color.value_counts()

In [None]:
# Mean Type per colour label, highest mean first.
# NOTE(review): Type looks like a class label, so its mean is only a rough indicator — confirm this is intended.
counting_values(dftrain, 'Color', 'Type')

In [None]:
# Collapse spelling/casing variants of the same colour into one canonical label.
# The mapping is applied to the held-out split as well: the test features are later taken
# from dftest, and labels the encoder did not see during fit would be ignored by
# OneHotEncoder(handle_unknown='ignore'), i.e. encoded as all zeros.
color_aliases = {
    'Blue White': ['Blue white', 'Blue-white', 'Blue-White'],
    'White': ['Whitish', 'white', 'Yellowish White', 'White-Yellow'],
    'Yellowish': ['yellow-white', 'yellowish', 'Pale yellow orange', 'Orange-Red'],
}
for split in (dftrain, dftest):
    for canonical, variants in color_aliases.items():
        split['Color'] = split['Color'].replace(variants, canonical)
dftrain.Color.unique()

In [None]:
# Frequency of each colour label in the training split as it currently stands.
dftrain.Color.value_counts()

In [None]:
# Count the stars per colour label and plot the counts as horizontal bars.
# The counts are placed in explicitly named columns because the column name produced by
# value_counts() differs between pandas versions, and x/y are passed by keyword because
# newer seaborn releases no longer accept them positionally.
g = dftrain['Color'].value_counts().rename_axis('Color').reset_index(name='Count')
plt.figure(figsize=(10,7))
sns.barplot(data=g, x='Count', y='Color', palette='Set2')
plt.title('Star Color Analysis');

### Spectral Class vs Type

In [None]:
# Count the stars per spectral class and plot the counts as horizontal bars.
# The counts are placed in explicitly named columns because the column name produced by
# value_counts() differs between pandas versions, and x/y are passed by keyword because
# newer seaborn releases no longer accept them positionally.
g1 = (
    dftrain['Spectral_Class']
    .value_counts()
    .rename_axis('Spectral_Class')
    .reset_index(name='Count')
)
plt.figure(figsize=(10,7))
sns.barplot(data=g1, x='Count', y='Spectral_Class', palette='Spectral')
plt.title('Spectral Class Analysis');

In [None]:
# Display the training frame indexed by spectral class.
# set_index returns a new frame and the result is not assigned, so dftrain itself is unchanged.
dftrain.set_index('Spectral_Class')

### Percent Distribution of Star Type

In [None]:
# Share of each star type in the training split, drawn as a pie chart.
# The counts are placed in explicitly named columns because the column name produced by
# value_counts() differs between pandas versions.
g2 = dftrain['Type'].value_counts().rename_axis('Type').reset_index(name='Count')
plt.figure(figsize=(10,7))
plt.pie(g2['Count'], labels=g2['Type'], autopct='%1.1f%%')
plt.title('Percent Distribution of the Star type');

# Model Selection

In [None]:
# Columns used as model inputs; the target column 'Type' is not part of this list.
my_features = ['Temperature', 'L','R', 'Color', 'A_M', 'Spectral_Class']

In [None]:
# Independent copies of both splits, restricted to the selected feature columns.
df_train, df_test = (split[my_features].copy() for split in (dftrain, dftest))

In [None]:
# Remove the target column from the raw training frame. Reassigning with errors='ignore'
# (instead of an in-place drop) lets this cell be re-run without raising KeyError once
# 'Type' is already gone.
dftrain = dftrain.drop(columns=['Type'], errors='ignore')

# Select categorical columns: object-dtype columns with fewer than 10 distinct values
categorical_cols = [
    cname for cname in df_train.columns
    if df_train[cname].dtype == 'object' and df_train[cname].nunique() < 10
]

# Select numerical columns
numerical_cols = [
    cname for cname in df_train.columns
    if df_train[cname].dtype in ['int64', 'float64']
]

# A column matching neither rule ends up in no list and would silently be left out of
# any transformer built from these lists, so report it instead of dropping it quietly.
unused_cols = [
    cname for cname in df_train.columns
    if cname not in categorical_cols and cname not in numerical_cols
]
if unused_cols:
    print('Columns not selected as categorical or numerical:', unused_cols)

### Preprocessing

In [None]:
# Preprocessing for numerical data: fill missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: fill missing values with the most frequent label,
# then one-hot encode; labels not seen during fit are ignored by the encoder.
# The encoder's dense/sparse switch was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), so it is left at its default here and a dense result is
# requested from the ColumnTransformer below instead.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Feature scaling
# NOTE: the model pipelines in this notebook create their own StandardScaler() step;
# `sc` is kept so that any existing reference to it keeps working.
sc = StandardScaler()

# sparse_threshold=0 makes the combined output always a dense array, which the
# StandardScaler step of the model pipelines can centre.
preprocessor = ColumnTransformer(
      transformers=[
          ('num', numerical_transformer, numerical_cols),
          ('cat', categorical_transformer, categorical_cols)
      ],
      sparse_threshold=0)


In [None]:
# Empty results table; each model section below adds one row with its test accuracy.
Results = pd.DataFrame({'Model': [], 'Accuracy Score': []})

## Logistic Regression

In [None]:
# Define a model
lrc = LogisticRegression()

# Bundle preprocessing, scaling and the model in a pipeline
lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sc', StandardScaler()),
    ('model', lrc),
])

# Fit the preprocessing steps and the model on the training data
lr.fit(df_train, ytrain)

# Get predictions for the held-out test set
predsLR = lr.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accLR = accuracy_score(ytest, predsLR)
print('Accuracy:', accLR)
print('CR:', classification_report(ytest, predsLR))
print('CM:', confusion_matrix(ytest, predsLR))

rest = pd.DataFrame({'Model': ['LogisticRegression'],
                    'Accuracy Score': [accLR]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'LogisticRegression'], rest],
                    ignore_index=True)

## DecisionTree

In [None]:
# Define a model; the seed is fixed so repeated runs build the same tree
dtc = DecisionTreeClassifier(random_state=0)

# Bundle preprocessing, scaling and the model in a pipeline
dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sc', StandardScaler()),
    ('model', dtc),
])

# Fit the preprocessing steps and the model on the training data
dt.fit(df_train, ytrain)

# Get predictions for the held-out test set
predsDT = dt.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accDT = accuracy_score(ytest, predsDT)
print('Accuracy:', accDT)
print('CR:', classification_report(ytest, predsDT))
print('CM:', confusion_matrix(ytest, predsDT))

rest = pd.DataFrame({'Model': ['DecisionTree'],
                    'Accuracy Score': [accDT]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'DecisionTree'], rest],
                    ignore_index=True)

## RandomForest

In [None]:
# Define a model
rfc = RandomForestClassifier(n_estimators=100, random_state=0, criterion='entropy')

# Bundle preprocessing, scaling and the model in a pipeline
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sc', StandardScaler()),
    ('model', rfc),
])

# Fit the preprocessing steps and the model on the training data
rf.fit(df_train, ytrain)

# Get predictions for the held-out test set
predsRF = rf.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accRF = accuracy_score(ytest, predsRF)
print('Accuracy:', accRF)
print('CR:', classification_report(ytest, predsRF))
print('CM:', confusion_matrix(ytest, predsRF))

rest = pd.DataFrame({'Model': ['RandomForest'],
                    'Accuracy Score': [accRF]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'RandomForest'], rest],
                    ignore_index=True)

## KNN

In [None]:
# Define a model
knnc = KNeighborsClassifier()

# Bundle preprocessing, scaling and the model in a pipeline
knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sc', StandardScaler()),
    ('model', knnc),
])

# Fit the preprocessing steps and the model on the training data
knn.fit(df_train, ytrain)

# Get predictions for the held-out test set
predsKNN = knn.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accKNN = accuracy_score(ytest, predsKNN)
print('Accuracy:', accKNN)
print('CR:', classification_report(ytest, predsKNN))
print('CM:', confusion_matrix(ytest, predsKNN))

rest = pd.DataFrame({'Model': ['KNeighborsClassifier'],
                    'Accuracy Score': [accKNN]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'KNeighborsClassifier'], rest],
                    ignore_index=True)

## SVM

In [None]:
# Define a model
sv = SVC()

# Bundle preprocessing, scaling and the model in a pipeline
svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sc', StandardScaler()),
    ('model', sv),
])

# Fit the preprocessing steps and the model on the training data
svc.fit(df_train, ytrain)

# Get predictions for the held-out test set
predssvc = svc.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accSVC = accuracy_score(ytest, predssvc)
print('Accuracy:', accSVC)
print('CR:', classification_report(ytest, predssvc))
print('CM:', confusion_matrix(ytest, predssvc))

rest = pd.DataFrame({'Model': ['SVM'],
                    'Accuracy Score': [accSVC]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'SVM'], rest],
                    ignore_index=True)

## XGB

In [None]:
# Define a model
# NOTE(review): both seed=27 and random_state=0 are passed here — confirm which of the two
# XGBoost actually honours before relying on run-to-run reproducibility.
xgbc = XGBClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 2, min_child_weight = 1, nthread=4, seed=27, subsample=0.8, colsample_bytree=0.9, max_delta_step=0,
                    objective='multi:softmax', gamma =0, reg_alpha=0.001, reg_lambda =0.5, eval_metric ='auc', random_state=0, num_class =6)


# Bundle preprocessing and the model in a pipeline (no scaling step for this model)
# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias of the xgboost
# module; the name is kept so existing references to the pipeline keep working.
xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgbc),
])

# Fit the preprocessing steps and the model on the training data
xgb.fit(df_train, ytrain)

# Get predictions for the held-out test set
predsxgb = xgb.predict(df_test)

# Compute the accuracy once and reuse it for the printout and the results table
accXGB = accuracy_score(ytest, predsxgb)
print('Accuracy:', accXGB)
print('CR:', classification_report(ytest, predsxgb))
print('CM:', confusion_matrix(ytest, predsxgb))

rest = pd.DataFrame({'Model': ['XGB'],
                    'Accuracy Score': [accXGB]})

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any earlier row for this model is dropped first so re-running the cell does not duplicate it.
Results = pd.concat([Results[Results['Model'] != 'XGB'], rest],
                    ignore_index=True)

# Results

In [None]:
# Display the accumulated table of model names and test accuracy scores.
Results

**The best models are RandomForest and XGB**

In [None]:
# Feature importances of the random forest that was fitted as the last step of the `rf` pipeline.
# NOTE(review): the array presumably follows the column order produced by the preprocessing
# steps (numerical columns, then one-hot columns) — confirm before interpreting.
rfc.feature_importances_


In [None]:
# Feature importances of the decision tree that was fitted as the last step of the `dt` pipeline.
# NOTE(review): the array presumably follows the column order produced by the preprocessing
# steps (numerical columns, then one-hot columns) — confirm before interpreting.
dtc.feature_importances_

In [None]:
# Feature importances of the XGBoost classifier that was fitted as the last step of the `xgb` pipeline.
# NOTE(review): the array presumably follows the column order produced by the preprocessor
# (numerical columns, then one-hot columns) — confirm before interpreting.
xgbc.feature_importances_