### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read Data

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the exact dataset location can be copied into the read step.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the star dataset from the Kaggle input directory into the working frame.
stars_csv_path = '/kaggle/input/star-type-classification/Stars.csv'
df = pd.read_csv(stars_csv_path)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# (rows, columns) of the freshly loaded dataset.
df.shape

In [None]:
# Column dtypes and non-null counts; used to check for missing values.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

There are no null values, so let's look at how the data is distributed.

In [None]:
# Numerical columns: every column whose dtype is not 'object'.
# The result is a Series indexed by column name holding each column's dtype.
column_dtypes = df.dtypes
numerical_columns = column_dtypes[column_dtypes != 'object']
numerical_columns

In [None]:
# Categorical columns: every column stored with the 'object' dtype.
# The result is a Series indexed by column name holding each column's dtype.
column_dtypes = df.dtypes
categorical_columns = column_dtypes[column_dtypes == 'object']
categorical_columns

In [None]:
# Frequency of each Color label as it appears in the data.
df['Color'].value_counts()

#### We can group all values that occur fewer than 4 times into an UNKNOWN category; this would also cover any new color that appears in the future. Some colors are also the same color under different spellings, such as "yellow-white" and "Yellowish White".

In [None]:
# Distinct Color labels; these are the keys the remapping dictionary must cover.
df['Color'].value_counts().index

In [None]:
# Canonical colour name -> every spelling that should collapse onto it.
# Each canonical name is listed among its own variants (including 'Yellow',
# which the raw data only contains as 'yellowish'/'Yellowish'), so the mapping
# is closed over its outputs: applying it to already-remapped values is a no-op
# instead of producing NaN.
_color_variants = {
    'Red': ['Red'],
    'Blue': ['Blue'],
    'Blue-white': ['Blue-white', 'Blue White', 'Blue white', 'Blue-White'],
    'White-Yellow': ['White-Yellow', 'yellow-white', 'Yellowish White'],
    'White': ['White', 'white', 'Whitish'],
    'Yellow': ['Yellow', 'yellowish', 'Yellowish'],
    'Orange': ['Orange', 'Pale yellow orange', 'Orange-Red'],
}

# Flat lookup (raw label -> canonical label) used to remap the Color column.
remap_cat_dict = {
    variant: canonical
    for canonical, variants in _color_variants.items()
    for variant in variants
}

In [None]:
# Collapse the spelling variants of each colour onto one canonical label.
# Labels the mapping does not know (a new colour, a typo) go to an explicit
# 'UNKNOWN' bucket instead of silently becoming NaN, and labels that are
# already canonical pass through unchanged, so re-running this cell is safe.
_canonical_colors = set(remap_cat_dict.values())


def _to_canonical_color(raw_label):
    """Return the canonical colour for ``raw_label``, or 'UNKNOWN' if unrecognised."""
    # Ignore stray leading/trailing whitespace; non-string values (e.g. NaN) are kept as-is.
    label = raw_label.strip() if isinstance(raw_label, str) else raw_label
    if label in remap_cat_dict:
        return remap_cat_dict[label]
    return label if label in _canonical_colors else 'UNKNOWN'


df['Color'] = df['Color'].map(_to_canonical_color).astype('category')

In [None]:
# Color frequencies after the remapping step.
df['Color'].value_counts()

In [None]:
# Frequency of each Spectral_Class label.
df['Spectral_Class'].value_counts()

#### One hot encoding

In [None]:
# Shape before one-hot encoding (the encoding cell that follows is commented out).
df.shape

In [None]:
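# Disabled here: one-hot encoding is applied later, after the EDA section,
# whose plots still use the original `Color` and `Spectral_Class` columns.
# df = pd.get_dummies(df, drop_first = True)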

In [None]:
# Shape is unchanged at this point because the encoding cell above is commented out.
df.shape

#### Numerical Columns

In [None]:
# Recap of the numerical columns and their dtypes.
numerical_columns

In [None]:
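# Disabled helper: histogram next to a normal Q-Q plot for a single feature.
# NOTE(review): re-enabling it requires `stats` (presumably scipy.stats) and
# `pylab`, neither of which is imported in the cells shown in this notebook.
# def plot_qq(df,feature):
#     plt.figure(figsize = (10,6))
#     plt.subplot(1,2,1)
#     df[feature].hist()
#     plt.subplot(1,2,2)
#     stats.probplot(df[feature], dist= 'norm', plot=pylab)
#     plt.show()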

In [None]:
# plot_qq(df,'Temperature')

## Correlation

In [None]:
# Numerical feature columns used for the correlation analysis.
features = ["Temperature","L","R","A_M"]

In [None]:
# Pairwise correlation matrices of the selected numerical features,
# once with Pearson and once with Spearman correlation.
feature_frame = df[features]
corrPearson = feature_frame.corr(method="pearson")
corrSpearman = feature_frame.corr(method="spearman")

In [None]:
# Annotated heatmap of the Pearson correlations, colour scale fixed to [-1, 1].
figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrPearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set_title("PEARSON")
ax.set_xlabel("COLUMNS")
ax.set_ylabel("COLUMNS")
plt.show()

In [None]:
# Annotated heatmap of the Spearman correlations, colour scale fixed to [-1, 1].
figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrSpearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set_title("Spearman")
ax.set_xlabel("COLUMNS")
ax.set_ylabel("COLUMNS")
plt.show()

## EDA

In [None]:
# Temperature as a function of star Type.
figure, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(data=df, x="Type", y="Temperature", ax=ax)
plt.show()

In [None]:
# L as a function of star Type.
figure, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(data=df, x="Type", y="L", ax=ax)
plt.show()

In [None]:
# R as a function of star Type.
figure, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(data=df, x="Type", y="R", ax=ax)
plt.show()

In [None]:
# A_M as a function of star Type.
figure, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(data=df, x="Type", y="A_M", ax=ax)
plt.show()

In [None]:
# Bar height is seaborn's aggregate of Type within each Spectral_Class.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=df, x="Spectral_Class", y="Type", ax=ax)
plt.show()

In [None]:
# Bar height is seaborn's aggregate of Type within each Color.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=df, x="Color", y="Type", ax=ax)
plt.show()

### Observations
#### Features are not normally distributed

#### One hot encoding

In [None]:
# One-hot encode the categorical columns. The first level of each column is
# dropped so the indicator columns are not redundant with one another.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
### A_M is strongly correlated with Temperature, L and R (see the correlation heatmaps above)
### The correlation is negative: A_M decreases as each of these features increases

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the `Type` target.
X = df.drop(columns='Type')

In [None]:
# Check the feature matrix after encoding and removal of the target column.
X.head()

In [None]:
# Target vector: the `Type` column that the classifiers predict.
y = df['Type']

In [None]:
# Hold out 33% of the rows for testing. The split is stratified on the target so
# that every star type keeps the same proportion in both parts; with a small
# multi-class dataset an unstratified split can leave a class under-represented.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# (rows, features) of the training split.
X_train.shape

In [None]:
# (rows, features) of the test split.
X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the features before model fitting.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then transform that split.
X_train_scaled = sc.fit_transform(X_train)

In [None]:
# Transform the test split with the scaler fitted on the training split
# (no re-fitting, so no test-set statistics enter the scaling).
X_test_scaled = sc.transform(X_test)

## Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fit one baseline classifier per model family on the standardised training split.
# Every estimator that accepts a seed gets the same fixed one, so a fresh
# "Restart & Run All" reproduces the same fitted models and scores.
# NOTE(review): liblinear handles multi-class targets one-vs-rest, and newer
# scikit-learn releases deprecate it for multi-class problems — confirm against
# the installed version before upgrading.
lj = LogisticRegression(solver="liblinear", random_state=42).fit(X_train_scaled, y_train)
knnc = KNeighborsClassifier().fit(X_train_scaled, y_train)
cartc = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
rfc = RandomForestClassifier(random_state=42, verbose=False).fit(X_train_scaled, y_train)
gbmc = GradientBoostingClassifier(random_state=42, verbose=False).fit(X_train_scaled, y_train)
lgbmc = LGBMClassifier(random_state=42).fit(X_train_scaled, y_train)

In [None]:
# Fitted classifiers, in the order in which they are evaluated and plotted.
modelsc = [lj,knnc,cartc,rfc,gbmc,lgbmc]

In [None]:
# 3-fold cross-validation of every classifier on the *scaled training* split.
#
# cross_val_score clones and refits the estimator on each fold, so it must be
# given the data the models are meant to be trained on. Feeding it the raw test
# split (as before) evaluated scale-sensitive models on unscaled features and
# used the hold-out set for model comparison.
#
# For classifiers the default score is accuracy (not R^2). The second number is
# the RMSE between predicted and true class labels, which treats the integer
# `Type` labels as ordered values.
# NOTE(review): the scaler was fitted on the whole training split before the CV
# folds are formed; wrap scaler + model in a Pipeline for strictly leak-free folds.
for model in modelsc:
    name = model.__class__.__name__
    accuracy_cv = cross_val_score(model, X_train_scaled, y_train, cv=3).mean()
    mse_cv = -cross_val_score(
        model, X_train_scaled, y_train, cv=3, scoring="neg_mean_squared_error"
    ).mean()
    print(name + ": ")
    print("-" * 10)
    print(accuracy_cv)
    print(np.sqrt(mse_cv))
    print("-" * 30)

In [None]:
# 10-fold cross-validated accuracy (in percent) of every classifier, computed on
# the scaled training split so that the test split stays untouched and the
# scale-sensitive models see standardised features.
cv_records = []
for model in modelsc:
    name = model.__class__.__name__
    accuracy_cv = cross_val_score(model, X_train_scaled, y_train, cv=10).mean()
    # The "R2CV" column name is kept for compatibility; the value is accuracy, not R^2.
    cv_records.append({"MODELS": name, "R2CV": accuracy_cv * 100})

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
r = pd.DataFrame(cv_records, columns=["MODELS", "R2CV"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="R2CV",y="MODELS",data=r,color="k")
plt.xlabel("CV accuracy (%)")
plt.ylabel("MODELS")
plt.xlim(30,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()

In [None]:
# 10-fold cross-validated RMSE of every classifier, computed on the scaled
# training split so that the test split stays untouched and the scale-sensitive
# models see standardised features. The RMSE is taken between predicted and
# true class labels, i.e. it treats the integer `Type` labels as ordered values.
error_records = []
for model in modelsc:
    name = model.__class__.__name__
    mse_cv = -cross_val_score(
        model, X_train_scaled, y_train, cv=10, scoring="neg_mean_squared_error"
    ).mean()
    error_records.append({"MODELS": name, "error": np.sqrt(mse_cv)})

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
r = pd.DataFrame(error_records, columns=["MODELS", "error"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="error",y="MODELS",data=r,color="r")
plt.xlabel("Error")
plt.ylabel("MODELS")
plt.xlim(0,2)
plt.title("MODEL ERROR COMPARISON")
plt.show()

## Conclusion 

Random Forest is the best-performing model.

We could improve model accuracy with more feature engineering and by transforming the features so that they are closer to normally distributed.