# **If you like the notebook please upvote it :D**

In [None]:
import pandas as pd
import numpy as np

# **Loading the dataset**

In [None]:
# Load the star-type CSV from the Kaggle input directory and display it.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/star-type-classification/Stars.csv"
)
data

# Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# One histogram per column, to inspect the distribution of every variable.
for col in data.columns:
    # histplot returns the Axes it drew on, so the title is set on that object.
    ax = sns.histplot(data[col])
    ax.set_title(f'Histplot of {col}')
    plt.show()

In [None]:
# Plot every non-target column against "Type", one large figure per column.
feature_columns = data.columns.drop("Type")
for col in feature_columns:
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=data, x="Type", y=col)
    ax.set_title(f'Lineplot of Type VS {col}')
    plt.show()

# Saving a copy of the dataset

In [None]:
# Keep an independent (deep) copy of the raw frame before it is modified below.
data_copy = data.copy(deep=True)

# Data Processing

**Are there any null values?**

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

# Encoding categorical data to numeric data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode every object-dtype column in place.
# The same encoder is refit for each column, so once the loop finishes it only
# holds the class mapping of the last column processed.
le = LabelEncoder()

cat_cols = data.select_dtypes(include=object)

for col in cat_cols.columns:
    encoded = le.fit_transform(data[col])
    data[col] = encoded

# Binning continuous features

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Continuous columns that receive an additional discretised "<name>_Binned" column.
cols_to_bin = [
    'Temperature',
    'L',
    'R',
    'A_M',
]

**First, let's find the optimal number of bins using the formula $k = \operatorname{round}(1 + \log_2 n)$, where $n$ is the number of rows.**

In [None]:
# Number of bins: 1 + log2(number of rows), rounded to the nearest integer.
n_rows = len(data)
optimal_bins = int(np.round(1 + np.log2(n_rows)))
optimal_bins

In [None]:
# Discretise each continuous column into `optimal_bins` ordinal bins and store
# the integer bin index in a new "<column>_Binned" column.
binner = KBinsDiscretizer(n_bins=optimal_bins, encode='ordinal')

for col in cols_to_bin:
    # The discretiser is refit per column on a single-column 2-D array.
    column_2d = data[[col]].to_numpy()
    bin_codes = binner.fit_transform(column_2d)
    data[col+'_Binned'] = bin_codes.astype('int64')

# Feature correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(10, 10))
corr = data.corr()
sns.heatmap(data=corr, annot=True)
plt.show()

# Creating new features

In [None]:
# For each continuous column, add "Mean_<column>": the mean of that column
# within the row's 'Type' group.
# NOTE(review): 'Type' is the column later used as the prediction target, so
# these features are computed from the label itself. This looks like target
# leakage and would explain near-perfect scores; confirm it is intended.
for col in ['Temperature', 'L', 'R', 'A_M']:
    # One-column frame indexed by 'Type' holding the group mean.
    temp = data.groupby('Type')[col].mean().to_frame(name='Mean_'+col)

    # Left join keeps every row of `data` in its original order.
    data = data.merge(temp, on='Type', how='left')

# Feature scaling

In [None]:
# Standardise every non-target column to zero mean and unit (sample) standard
# deviation, using statistics computed over the whole frame.
columns_to_scale = data.columns.drop('Type')
for col in columns_to_scale:
    centered = data[col] - data[col].mean()
    data[col] = centered / data[col].std()

# Target imbalance?

In [None]:
# Bar chart of how many rows fall in each 'Type' class.
# The vector is passed as `x=` explicitly: since seaborn 0.12 the only
# positional argument is `data`, so a bare Series is no longer read as x.
sns.countplot(x=data['Type'])

# Splitting data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns='Type')
y = data['Type']

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=56,
    test_size=0.2,
)

# Splitting training data into training and validation sets

In [None]:
# Carve a validation set (20% of the training rows) out of the training data,
# using the same fixed seed.
x_train_, x_val, y_train_, y_val = train_test_split(
    x_train,
    y_train,
    random_state=56,
    test_size=0.2,
)

# Model Selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score

In [None]:
def model_selection(x_train_, x_val, y_train_, y_val, model):
    """Fit a default-configured model and print train/validation metrics.

    Parameters
    ----------
    x_train_, y_train_
        Features and labels the model is fitted on.
    x_val, y_val
        Features and labels the fitted model is evaluated on.
    model
        A class (or zero-argument factory) that returns an object exposing
        ``fit``, ``predict`` and ``score``. It is called with no arguments,
        so the model's default hyperparameters are used.

    Returns
    -------
    The fitted model instance, so callers that assign the result get a
    usable estimator instead of ``None``.
    """
    model = model()
    model.fit(x_train_, y_train_)

    pred = model.predict(x_val)

    acc = accuracy_score(y_val, pred)
    # Root of the mean squared difference between predicted and true labels.
    error = np.sqrt(mean_squared_error(y_val, pred))
    report = classification_report(y_val, pred)
    train_score = model.score(x_train_, y_train_)
    val_score = model.score(x_val, y_val)

    print('Acc :', acc*100)
    print('\n')
    print('Error:', error)
    print('\n')
    print('Classification report:', report)
    print('\n')
    print('Train Score:', train_score*100)
    print('\n')
    print('Val Score:', val_score*100)
    print('\n')
    # Flagged as overfitting whenever the training score exceeds the validation score.
    print('Is overfitting:', bool(train_score > val_score))
    print('\n')
    print('Overfitting by:', train_score*100-val_score*100)

    return model

In [None]:
# Fit a default ExtraTreesClassifier and print its train/validation metrics.
extratrees = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=ExtraTreesClassifier,
)
extratrees

In [None]:
# Fit a default GradientBoostingClassifier and print its train/validation metrics.
gradient = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=GradientBoostingClassifier,
)
gradient

In [None]:
# Fit a default RandomForestClassifier and print its train/validation metrics.
randomforest = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=RandomForestClassifier,
)
randomforest

In [None]:
# Fit a default AdaBoostClassifier and print its train/validation metrics.
ada = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=AdaBoostClassifier,
)
ada

In [None]:
# Fit a default XGBClassifier and print its train/validation metrics.
xgb = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=XGBClassifier,
)
xgb

In [None]:
# Fit a default LGBMClassifier and print its train/validation metrics.
lgbm = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=LGBMClassifier,
)
lgbm

In [None]:
# Fit a default DecisionTreeClassifier and print its train/validation metrics.
tree = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=DecisionTreeClassifier,
)
tree

In [None]:
# Fit a default ExtraTreeClassifier (single tree) and print its train/validation metrics.
extratree = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=ExtraTreeClassifier,
)
extratree

In [None]:
# Fit a default CatBoostClassifier and print its train/validation metrics.
catboost = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=CatBoostClassifier,
)
catboost

In [None]:
# Fit a default SGDClassifier and print its train/validation metrics.
sgd = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=SGDClassifier,
)
sgd

In [None]:
# Fit a default GaussianNB and print its train/validation metrics.
nb = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=GaussianNB,
)
nb

In [None]:
# Fit a default SVC and print its train/validation metrics.
svc = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=SVC,
)
svc

**Most of the models reached 100% accuracy, so I will pick one at random using Python's `random` module.**

In [None]:
import random

In [None]:
# Names of the candidate models to draw from at random.
model_ = [
    'ExtraTrees',
    'GradientBoosting',
    'RandomForest',
    'XGB',
    'LGBM',
    'DecisionTree',
    'ExtraTree',
    'CatBoost',
    'NB',
]

In [None]:
# Draw one candidate name uniformly at random. No seed is set in this
# notebook, so the result can differ between runs.
random.choice(model_)

**I will choose GradientBoostingClassifier**

# Model Building and Training

In [None]:
# Train the chosen model with default hyperparameters on the full training
# split (training + validation rows).
model = GradientBoostingClassifier()
model.fit(X=x_train, y=y_train)

# Predictions

In [None]:
# Predict the class of every row in the held-out test set.
pred = model.predict(X=x_test)
pred

In [None]:
# Bar chart of how many test rows were predicted for each class.
# The vector is passed as `x=` explicitly: since seaborn 0.12 the only
# positional argument is `data`, so a bare array is no longer read as x.
sns.countplot(x=pred)

# Metrics Check

In [None]:
# Test-set accuracy, displayed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=pred)
acc * 100

In [None]:
# Root of the mean squared difference between predicted and true labels.
mse = mean_squared_error(y_true=y_test, y_pred=pred)
error = np.sqrt(mse)
error

In [None]:
# Gap between training and test score, in percentage points.
train_pct = model.score(x_train, y_train) * 100
test_pct = model.score(x_test, y_test) * 100
overfitting_by = train_pct - test_pct
overfitting_by