In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the star-type classification CSV, relative to the working directory.
url = '../input/star-type-classification/Stars.csv'
# Load the whole file into a DataFrame; the cells below read from `dataset`.
dataset = pd.read_csv(url)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
dataset.shape

In [None]:
# Data type pandas inferred for each column.
dataset.dtypes

In [None]:
# Count missing values per column. Displaying the full True/False frame is
# truncated by the notebook, so gaps in the hidden rows would go unnoticed.
dataset.isna().sum()


In [None]:
# pd.isnull is an alias of pd.isna. Rather than displaying a full True/False
# frame, list the rows that contain at least one missing value; an empty
# result means the table is complete.
dataset[dataset.isnull().any(axis=1)]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Short aliases for the columns used by the plots and correlations below.
starTypes, temps, colors, rel_radius, abs_mag, rel_lumin = (
    dataset[column]
    for column in ('Type', 'Temperature', 'Color', 'R', 'A_M', 'L')
)

In [None]:
# Column abbreviations used in the dataset:
#   R   = relative radius
#   A_M = absolute magnitude
#   L   = relative luminosity

In [None]:
# Passing one bar per row draws every star of a type at the same x position,
# so the bars overlap and only the tallest one per type is visible.
# Aggregate to the mean temperature per type so each bar has a defined meaning.
mean_temps = temps.groupby(starTypes).mean()
plt.bar(mean_temps.index, mean_temps.values, color = 'green')
plt.xlabel('Star type')
plt.ylabel('Mean temperature')
plt.title('mean temps in star types')
plt.show()

In [None]:
# Colour label of every star against its type.
# Axes are labelled so the figure can be read on its own.
plt.scatter(starTypes, colors, color = 'green')
plt.xlabel('Star type')
plt.ylabel('Color')
plt.title('colors in star types')
plt.show()

In [None]:
# Relative radius (column R) of every star against its type.
# Axes are labelled so the figure can be read on its own.
plt.scatter(starTypes, rel_radius, color = 'green')
plt.xlabel('Star type')
plt.ylabel('Relative radius')
plt.title('radii in star types')
plt.show()

In [None]:
# Relative luminosity (column L) of every star against its type.
# Axes are labelled so the figure can be read on its own.
plt.scatter(starTypes, rel_lumin, color = 'green')
plt.xlabel('Star type')
plt.ylabel('Relative luminosity')
plt.title('luminosity in star types')
plt.show()

In [None]:
# Absolute magnitude (column A_M) of every star against its type.
# Axes are labelled so the figure can be read on its own.
plt.scatter(starTypes, abs_mag, color = 'green')
plt.xlabel('Star type')
plt.ylabel('Absolute magnitude')
plt.title('magnitude in star types')
plt.show()

In [None]:
from numpy.random import seed
from scipy.stats import pearsonr

In [None]:
# Pearson correlation between temperature and relative radius.
# pearsonr returns the correlation coefficient together with a p-value.
corr1 = pearsonr(temps, rel_radius)
corr1

In [None]:
# Pearson correlation between temperature and absolute magnitude.
# pearsonr returns the correlation coefficient together with a p-value.
corr2 = pearsonr(temps, abs_mag)
corr2

In [None]:
# Pearson correlation between temperature and relative luminosity.
# pearsonr returns the correlation coefficient together with a p-value.
corr3 = pearsonr(temps, rel_lumin)
corr3

In [None]:
# Convert each text column to a pandas categorical and store its integer
# category codes in a companion column, which the model cells use as a feature.
for text_col, code_col in (('Color', 'Color_Code'), ('Spectral_Class', 'SC_Code')):
    dataset[text_col] = dataset[text_col].astype('category')
    dataset[code_col] = dataset[text_col].cat.codes


In [None]:
from sklearn.model_selection import train_test_split
import random

In [None]:
# random.seed() only seeds Python's standard-library generator. scikit-learn
# does not draw from it, so the split has to be pinned through random_state
# to get the same train/test rows on every run.
random.seed(12)
# Features: every column except the target and the raw text columns (their
# integer codes are kept). Target: the star type. 30% of rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop(['Type', 'Color', 'Spectral_Class'], axis = 'columns'),
    dataset['Type'],
    test_size = 0.3,
    random_state = 12,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Single decision tree; random_state is fixed so the fitted tree is repeatable
# for a given training set.
dtree1 = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the decision tree on the training split.
dtree1.fit(X_train, y_train)

In [None]:
# Decision-tree prediction of the star type for each held-out test row.
prediction1 = dtree1.predict(X_test)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of as an escaped one-line repr.
print(classification_report(y_test, prediction1))

In [None]:
# Fraction of test rows where the decision tree's prediction equals the true type.
accuracy_score(y_test, prediction1)

In [None]:
# Confusion matrix for the decision tree: rows are true types, columns are predicted types.
confusion_matrix(y_test, prediction1)

In [None]:
# Side-by-side table of the true type and the decision tree's prediction,
# indexed like the test split.
comparison = y_test.to_frame().assign(predicted=prediction1)
comparison

In [None]:
# Random forest of 100 trees with a fixed seed. oob_score=True makes the fit
# also compute an out-of-bag accuracy estimate (exposed as rf1.oob_score_).
# NOTE(review): oob_score_ is not read in any of the cells visible here.
rf1 = RandomForestClassifier(random_state=0, oob_score=True, n_estimators = 100)

In [None]:
# Fit the random forest on the same training split as the decision tree.
rf1.fit(X_train, y_train)

In [None]:
# Random-forest prediction of the star type for each held-out test row.
prediction2 = rf1.predict(X_test)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of as an escaped one-line repr.
print(classification_report(y_test, prediction2))

In [None]:
# Fraction of test rows where the random forest's prediction equals the true type.
accuracy_score(y_test, prediction2)

In [None]:
# Confusion matrix for the random forest: rows are true types, columns are predicted types.
confusion_matrix(y_test, prediction2)

In [None]:
# Actual vs. predicted star type on the test split for the random forest.
plt.scatter(y_test, prediction2)
plt.gca().set(
    xlabel='Actual',
    ylabel='Predicted',
    title='Random Forest Classifier Result',
)
plt.show()

In [None]:
# Actual vs. predicted star type on the test split for the decision tree.
plt.scatter(y_test, prediction1)
plt.gca().set(
    xlabel='Actual',
    ylabel='Predicted',
    title='Decision Tree Classifier Result',
)
plt.show()