In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report

In [None]:
# Location of the star catalogue CSV inside the Kaggle input directory.
STAR_CSV_PATH = '../input/star-dataset/6 class csv.csv'
df_ori = pd.read_csv(STAR_CSV_PATH)

# Leave the freshly loaded frame untouched; every later change goes to this duplicate.
df = df_ori.copy()
df.head()

In [None]:
# Summary statistics of the working copy, to get a feel for the value ranges.
df.describe()

In [None]:
# Column overview of the working copy (names, dtypes, non-null counts).
df.info()

In [None]:
# Visualizing the data.
# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and the old
# name was later removed, so use whichever name this matplotlib installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Let's see the count of stars of each type.
fig, ax = plt.subplots(figsize=(10, 4))
# Name the column through x= with data=df: newer seaborn treats a lone positional
# argument as `data`, so passing the Series positionally no longer counts its values.
sns.countplot(x='Star type', data=df, ax=ax)

In [None]:
# Let's see the countplot of star colors.
fig, ax = plt.subplots(figsize=(16, 8))
# Name the column through x= with data=df: newer seaborn treats a lone positional
# argument as `data`, so passing the Series positionally no longer counts its values.
sns.countplot(x='Star color', data=df, ax=ax)
# There are many color labels; rotate them so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('count')

In [None]:
# Style-sheet names this matplotlib installation knows about
# (these are the values accepted by plt.style.use).
plt.style.available

In [None]:
# Let's plot distribution plots.

# 1) Distribution of Temperature (first column of df):
plt.style.use('grayscale')
fig, ax = plt.subplots(figsize=(16, 8))
# sns.distplot is deprecated; with hist=False it only drew the kernel density
# estimate, which kdeplot draws directly.
sns.kdeplot(x=df.iloc[:, 0], ax=ax)
ax.set_title('Distribution of Temperature')
# Arrow from the label (xytext) to the point on the curve (xy) marking the Sun.
ax.annotate('Temperature of Surface of sun', xy=(5500, .000065), size=10, xytext=(6000, .00008),
            arrowprops=dict(arrowstyle='->,head_width=.5', lw=1, facecolor='k'), ha='center', va='top')
plt.show()


This shows that the surface temperatures of most stars in the dataset cluster around the surface temperature of the Sun.

In [None]:
# 2) Distribution of Luminosity (second column of df):
fig, ax = plt.subplots(figsize=(16, 8))
# sns.distplot is deprecated; with hist=False it only drew the kernel density
# estimate, which kdeplot draws directly.
sns.kdeplot(x=df.iloc[:, 1], ax=ax)
ax.set_title('Distribution of Luminosity')
# Arrow from the label (xytext) to the point on the curve (xy) marking the Sun.
ax.annotate('Luminosity of sun', xy=(1, .0000049), size=10, xytext=(10000, .0000055),
            arrowprops=dict(arrowstyle='-> ,head_width=.5', lw=1, facecolor='black'), ha='center', va='top')
plt.show()


The relative luminosity also peaks near the luminosity of the Sun, although the range is very broad: the dataset contains stars almost a million times more luminous than the Sun.

In [None]:
# 3) Distribution of Radius (third column of df):
fig, ax = plt.subplots(figsize=(16, 8))
# sns.distplot is deprecated; with hist=False it only drew the kernel density
# estimate, which kdeplot draws directly. ax is passed explicitly so the curve
# lands on the axes created above rather than on whatever axes is current.
sns.kdeplot(x=df.iloc[:, 2], ax=ax)
ax.set_title('Distribution of radius')
# Arrow from the label (xytext) to the point on the curve (xy) marking the Sun.
ax.annotate('Radius of Sun = 1 R', xy=(1, .025), size=10, xytext=(250, .02),
            arrowprops=dict(arrowstyle='-> ,head_width=.5', lw=1, facecolor='black'), ha='center', va='top')
plt.show()


In [None]:
# For easier visualization, fold the spelling / case / whitespace variants of the
# colour labels into a small set of canonical names.
color_dict = {
    # variants of "Blue White"
    'Blue white': 'Blue White',
    'Blue white ': 'Blue White',
    'Blue-White': 'Blue White',
    'Blue-white': 'Blue White',
    # variants of "White"
    'Whitish': 'White',
    'white': 'White',
    # variants of "Yellowish White"
    'yellow-white': 'Yellowish White',
    'White-Yellow': 'Yellowish White',
    # folded into "Orange"
    'Orange-Red': 'Orange',
    'Pale yellow orange': 'Orange',
    # remaining one-off fixes
    'Blue ': 'Blue',
    'yellowish': 'Yellowish',
}
# Only the 'Star color' column is remapped; labels without an entry are left as they are.
df['Star color'] = df['Star color'].replace(color_dict)

# Unique colour values after the clean-up:
df['Star color'].unique()

The *Hertzsprung–Russell* diagram, abbreviated as H–R diagram, HR diagram or HRD, is a scatter plot of stars showing the relationship between the stars' absolute magnitudes or luminosities versus their stellar classifications or effective temperatures. The diagram was created independently in around 1910 by Ejnar Hertzsprung and Henry Norris Russell, and represented a major step towards an understanding of stellar evolution. 

In [None]:
# HR diagram: temperature against absolute magnitude, coloured by star colour and
# sized by radius.
plt.style.use('dark_background')
plt.rcParams['font.size'] = 15

# One matplotlib colour per (cleaned) star-colour label used as hue.
color = {'Red': 'r', 'Blue White': 'skyblue', 'White': 'w', 'Yellowish White': 'lightyellow',
         'Orange': 'orange', 'Blue': 'blue', 'Yellowish': 'yellow'}

# Size the plot through relplot's own height/aspect (8 in high, aspect 2 -> 16 in wide)
# instead of resizing afterwards through FacetGrid.fig, which seaborn has deprecated.
hr = sns.relplot(x='Temperature (K)', y='Absolute magnitude(Mv)', hue='Star color', size='Radius(R/Ro)',
                 data=df, palette=color, height=8, aspect=2)

# relplot draws a single facet here, so work on its axes directly.
ax = hr.ax
ax.set_xlabel('Temperature')
ax.set_ylabel('Absolute Magnitude')
# HR-diagram convention: hot stars on the left, bright stars (smaller magnitude) at the top.
ax.invert_xaxis()
ax.invert_yaxis()
ax.grid(False)
ax.set_title('HR Diagram', fontsize=30)

In [None]:
# Splitting the data: the target is the spectral class.
y = df['Spectral Class']

# Dropping 'Star color' because the column 'Temperature' already carries that info.
# 'Star type' is dropped as well, and 'Spectral Class' is the target itself.
X = df.drop(['Star color', 'Spectral Class', 'Star type'], axis=1)

# A fixed random_state makes the shuffled 90/10 split - and therefore the score
# reported at the end of the notebook - identical on every run.
# NOTE(review): the split is not stratified; stratify=y would need at least two rows
# in every spectral class - check the class counts before enabling it.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.1, shuffle=True, random_state=42)

In [None]:
# Before predicting with the model, let's look at its learning curve.
from sklearn.model_selection import learning_curve
import math  # no longer used in this cell; left in place for any later cell that relies on it

# A fixed random_state keeps the forest (and every score derived from it) reproducible.
model = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

# No list of absolute training-set sizes is prepared here: learning_curve() returns the
# sizes it actually used, and sizes derived from len(X) (up to the full dataset) can
# never be reached, because cross-validation always holds part of the rows out.



In [None]:
# Drawing learning curves.

# Evaluate at 1/4, 1/2, 3/4 and all of the rows available for training in each CV
# split. Fractions are used because absolute sizes may not exceed that training part.
# shuffle=True draws each training subset at random instead of taking the first n rows
# of the fold; random_state keeps that draw reproducible.
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=model,
    X=X, y=y,
    train_sizes=[0.25, 0.5, 0.75, 1.0],
    cv=3,
    shuffle=True,
    random_state=42,
)

# Average the per-fold scores for every training-set size.
mean_training_scores = train_scores.mean(axis=1)
mean_valid_scores = valid_scores.mean(axis=1)

# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and the old
# name was later removed, so use whichever name this matplotlib installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(train_sizes, mean_training_scores, label='training_scores')
ax.plot(train_sizes, mean_valid_scores, label='Validation scores')
ax.set_xlabel('training set size')
ax.set_ylabel('Scores')
ax.legend()

ax.set_title('Learning curve for Random Forest Classifier ')



In [None]:
# Train the forest on the training split.
model.fit(x_train, y_train)

# Score the fitted model on the held-out test split and report it.
test_score = model.score(x_test, y_test)
print(f'score with {model} is {test_score}')