In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pandas_profiling as pro
import missingno as msno

In [None]:
# Load the star dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/star-type-classification/Stars.csv')
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Structural overview of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Bar chart of non-missing values per column. The plot renders on its own;
# wrapping the call in print() only added the text repr of the returned
# plot object, so it is called directly and the repr is suppressed with `;`.
msno.bar(df);

In [None]:
# Build the profiling report for the whole frame; as the last expression of the cell it is displayed inline.
pro.ProfileReport(df)

In [None]:
###########################
# Data is well balanced
###########################
# Number of rows per target class. The column is passed by keyword: recent
# seaborn releases accept only `data` positionally, so a bare Series as the
# first argument is no longer interpreted as the variable to count.
sns.countplot(x='Type', data=df)

In [None]:
#######################
# Let's Split the data:
# every column except the last one is a feature; the last column is the
# target, kept as a one-column DataFrame (note the `-1:` slice).
x, y = df.iloc[:, :-1], df.iloc[:, -1:]

In [None]:
# Preview the target column instead of dumping the whole frame.
y.head()

In [None]:
import category_encoders as ce
import pandas as pd
  
# Define the CatBoost target encoder with its default settings.
cbe_encoder = ce.cat_boost.CatBoostEncoder()

# Fit the encoder once, then encode the features. The previous
# `fit_transform(x, y)` call threw its result away and the data was
# transformed a second time, so a plain `fit` is all that is needed here.
# NOTE(review): the encoder is fitted on the full dataset (targets included)
# before the train/test split below, so target information from the test
# rows leaks into the encoded features -- consider fitting on the training
# rows only.
cbe_encoder.fit(x, y)
train_cbe = cbe_encoder.transform(x)

In [None]:
# Preview the encoded features instead of dumping the whole frame.
train_cbe.head()

In [None]:
from sklearn.model_selection import train_test_split as tts

# Hold out half of the encoded rows for testing; the fixed seed keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = tts(
    train_cbe,
    y,
    test_size=0.5,
    random_state=13,
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

def evaluator(y_test, y_pred):
    """Print classification metrics and plot the confusion matrix.

    Prints the accuracy, the classification report and a macro-averaged
    AUC, then draws the confusion matrix (with normalised values shown).

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels for the same samples.

    Notes
    -----
    The AUC is computed from label-binarized hard predictions, not from
    class probabilities. The binarizer is fitted on ``y_test`` only.
    Calling this function switches the global matplotlib style to "ggplot".
    """
    # Overall accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is: ', accuracy)
    print('')

    # Per-class precision / recall / F1.
    report = classification_report(y_test, y_pred)
    print('Classification Report: \n', report)

    # Macro AUC on binarized labels (binarizer learned from the true labels).
    binarizer = LabelBinarizer()
    truth_binarized = binarizer.fit_transform(y_test)
    pred_binarized = binarizer.transform(y_pred)
    auc = roc_auc_score(truth_binarized, pred_binarized, average='macro')
    print('AUC_ROC Score: ', auc, '\n\n')

    # Confusion matrix plot.
    print('Confusion Matrix: \n\n')
    plt.style.use("ggplot")
    plot_confusion_matrix(
        conf_mat=confusion_matrix(y_test, y_pred),
        figsize=(8, 6),
        show_normed=True,
    )

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [None]:
# Seed the forest so the fitted model (and every metric and feature
# importance derived from it) is reproducible between runs.
rf_classifier = rfc(random_state=13)
# `y_train` is a one-column DataFrame; flatten it to 1-D so scikit-learn does
# not raise a DataConversionWarning about a column-vector target.
rf_classifier.fit(x_train, np.ravel(y_train))

In [None]:
# Predict labels for the held-out rows with the fitted random forest.
pred_rf = rf_classifier.predict(x_test)

# Print the metrics and draw the confusion matrix for those predictions.
evaluator(y_test, pred_rf)

In [None]:
# Pair each feature name with the importance reported by the fitted forest,
# ordered from most to least important.
important_features = (
    pd.DataFrame({
        'Features': x.columns,
        'Importance': rf_classifier.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)

# One horizontal bar per feature, its length being the importance.
sns.barplot(data=important_features, x='Importance', y='Features')

# Title and axis labels, all at font size 15.
plt.title('Feature Importance', fontsize=15)
plt.xlabel('Importance', fontsize=15)
plt.ylabel('Features', fontsize=15)

# Render the figure.
plt.show()

### Checking for overfitting:

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot
# Accuracy on the train and test splits, one entry per evaluated depth.
train_scores, test_scores = [], []
# Tree depths to evaluate: 1 through 20.
values = list(range(1, 21))
# Fit one decision tree per depth and score it on both splits; a growing gap
# between the two curves indicates overfitting.
for i in values:
    # A fixed seed keeps the depth-vs-accuracy curve identical between runs.
    model = DecisionTreeClassifier(max_depth=i, random_state=13)
    # fit model on the training dataset
    model.fit(x_train, y_train)
    # evaluate on the train dataset
    train_yhat = model.predict(x_train)
    train_acc = accuracy_score(y_train, train_yhat)
    train_scores.append(train_acc)
    # evaluate on the test dataset
    test_yhat = model.predict(x_test)
    test_acc = accuracy_score(y_test, test_yhat)
    test_scores.append(test_acc)
    # summarize progress
    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))
# plot of train and test scores vs tree depth
pyplot.plot(values, train_scores, '-o', label='Train')
pyplot.plot(values, test_scores, '-o', label='Test')
# Label the figure so it can be read on its own.
pyplot.xlabel('Tree depth (max_depth)')
pyplot.ylabel('Accuracy')
pyplot.title('Decision tree accuracy vs. depth')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

# 10-fold cross-validation of a random forest on the encoded features.
# Seeds on both the shuffle and the forest make the fold scores reproducible.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=13)
r_classifier = rfc(random_state=13)

acc_score = []

# NOTE(review): `train_cbe` was target-encoded on the full dataset, so each
# fold's held-out rows have already influenced the encoded features; the
# scores below are therefore likely optimistic.
for train_index, test_index in kf.split(train_cbe):
    # Use the indices produced by KFold (previously they were ignored and a
    # fresh random split was drawn on every iteration). Fold-local names keep
    # the notebook-level x_train / x_test / y_train / y_test untouched.
    x_fold_train = train_cbe.iloc[train_index]
    x_fold_test = train_cbe.iloc[test_index]
    # Flatten the one-column target frame to 1-D for scikit-learn.
    y_fold_train = np.ravel(y.iloc[train_index])
    y_fold_test = np.ravel(y.iloc[test_index])

    r_classifier.fit(x_fold_train, y_fold_train)
    # Score the forest fitted on this fold, not the leftover decision tree
    # `model` from the depth sweep.
    pred_values = r_classifier.predict(x_fold_test)

    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

##  Author: Avinash Bagul

### Note: if you spot any mistakes, please point them out in the comments.

#