In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found there.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

My aim is to create new categorical variables from the existing features and use them to build a good model.

# ***Reading Data***

In [None]:
# Load the star dataset from the Kaggle input directory into the working DataFrame `df`.
df=pd.read_csv("../input/star-type-classification/Stars.csv") #Import data

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

It is good that we don't have any null value.

In [None]:
df.describe()

As we can see, some variables have a very wide spread.

In [None]:
# Print the distinct values of each listed column so inconsistent labels are easy to spot.
for col in ("Color", "Spectral_Class", "Type"):
    print("{}:{}\n".format(col, df[col].unique()))
    

We need to clean up the Color variable, because the same color appears under several different spellings.

In [None]:
# Fold each of the spellings below into the single label "White".
white_aliases = ("Yellowish White", "Blue White", "Blue white", "Blue-white",
                 "Whitish", "yellow-white", "white", "Blue-White")
for alias in white_aliases:
    matches_alias = df["Color"] == alias
    df.loc[matches_alias, "Color"] = "White"

In [None]:
df["Color"].unique()

In [None]:
# Relabel 'Pale yellow orange' as "Orange".
is_pale_yellow_orange = df["Color"] == 'Pale yellow orange'
df.loc[is_pale_yellow_orange, "Color"] = "Orange"

In [None]:
df["Color"].unique()

In [None]:
# Fold each of the spellings below into the single label "Yellow".
yellow_aliases = ('White-Yellow', 'yellowish', 'Yellowish')
for alias in yellow_aliases:
    matches_alias = df["Color"] == alias
    df.loc[matches_alias, "Color"] = "Yellow"

In [None]:
df["Color"].unique()

In [None]:
# Relabel 'Orange-Red' as "Red".
is_orange_red = df["Color"] == 'Orange-Red'
df.loc[is_orange_red, "Color"] = "Red"

In [None]:
df["Color"].unique()

Color adjustment is done.

In [None]:
df.duplicated().sum()

# **Visualization**

First, let's look at the table again so that it is easier to decide what to visualize.

In [None]:
df.head(10)

In [None]:
# Per-type mean of every numeric feature.
# `numeric_only=True` keeps the text columns (Color, Spectral_Class) out of the
# aggregation; on pandas >= 2.0 averaging them raises a TypeError instead of
# silently dropping them.
data=df.groupby("Type").mean(numeric_only=True)

In [None]:
# Mean temperature for each star type.
ax = sns.barplot(data=data, x=data.index, y="Temperature")
ax.set_title("Temperature vs Type")

I can create a new variable from this information: a temperature variable with two levels,
low-temp and high-temp. The low-temp stars are type 1 and type 2, while the others are high temperature.

In [None]:
# Mean L for each star type.
ax = sns.barplot(data=data, x=data.index, y="L")
ax.set_title("L vs Type")

For types 0, 1 and 2 the L variable is extremely low, for type 3 it is moderate, and for types 4 and 5 it is high. I can create a new categorical variable from this information.

In [None]:
# Mean R for each star type.
ax = sns.barplot(data=data, x=data.index, y="R")
ax.set_title("R vs Type")

I can still separate types 0-1-2-3-4 from type 5 by using this information.

In [None]:
# Mean A_M for each star type.
ax = sns.barplot(data=data, x=data.index, y="A_M")
ax.set_title("A_M vs Type")

This information also leads us to a new categorical variable.

In [None]:
# Count how many stars carry each colour label, then preview the counts.
data2=df["Color"].value_counts()
data2.head()

In [None]:
# Bar chart of the number of stars per colour label; `data2` already holds
# df["Color"].value_counts(), so it supplies both the labels and the bar heights.
ax = sns.barplot(x=data2.index, y=data2)
ax.set_title("Numbers of stars by their colors")

In [None]:
# Correlation heatmap of the numeric columns.
# Text columns (Color, Spectral_Class) are excluded explicitly: pandas >= 2.0 raises
# in corr() when it meets non-numeric data instead of silently dropping those columns.
plt.figure(figsize=(10,10))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True)

We can also read the correlations from this correlation map. The A_M variable is strongly negatively correlated with Type, so it is important for us.

Now we need to look for outliers.

In [None]:
def outlier_graph(data,column):
    """Draw a small box plot of one column so its outliers are visible.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that holds the column to plot.
    column : str
        Name of the column in ``data`` whose values are drawn along the x-axis.

    A new 5x3 figure is opened for every call; nothing is returned.
    """
    plt.figure(figsize=(5,3))
    # Pass the values as `x=` explicitly: recent seaborn releases interpret the first
    # positional argument as `data`, which may change how the box is oriented.
    sns.boxplot(x=data[column])
    plt.title("{} distribution".format(column))

In [None]:
# One box plot per numeric feature.
for feature in ("Temperature", "L", "R", "A_M"):
    outlier_graph(df, feature)

There are many outliers in R and L, but removing them could lead us to a wrong model because these variables are already distributed in a non-uniform way. Removing these outliers would cause a loss of information, so I leave them untouched.

# **Creating New Variables**

In [None]:
df.head()

In [None]:
# Two temperature bands: strictly above 6000 is "Temp_High", everything else "Temp_Low".
Temperature_Cat = [
    "Temp_High" if temp > 6000 else "Temp_Low"
    for temp in df["Temperature"]
]
df["Temperature_Cat"] = Temperature_Cat

In [None]:
# Three L bands: >= 100000 is high, 25000 up to (but excluding) 100000 is moderate,
# and anything else is low.
L_Cat = [
    "L_High" if lum >= 100000
    else "L_Moderate" if lum >= 25000
    else "L_Low"
    for lum in df["L"]
]
df["L_Cat"] = L_Cat

In [None]:
# Two R bands: strictly above 400 is "R_High", everything else "R_Low".
R_Cat = [
    "R_High" if radius > 400 else "R_Low"
    for radius in df["R"]
]
df["R_Cat"] = R_Cat

In [None]:
# Two A_M bands: strictly positive is "A_M_High", zero or negative is "A_M_Low".
A_M_Cat = [
    "A_M_High" if magnitude > 0 else "A_M_Low"
    for magnitude in df["A_M"]
]
df["A_M_Cat"] = A_M_Cat

In [None]:
df.head()

In [None]:
df.isnull().sum()

Now that we have created all of our variables, we can create dummies and start modelling.

In [None]:
# One-hot encode the categorical columns (the two original ones plus the four engineered
# bands). drop_first=True omits the first level of each variable from the output.
# Note: this rebinds `df`, so the cell is not safe to run twice on the encoded frame.
df=pd.get_dummies(data=df,columns=["Color","Spectral_Class","Temperature_Cat","L_Cat","R_Cat","A_M_Cat"],drop_first=True)

I deliberately dropped the first level of each variable: if all the other dummy columns are 0, the model already knows that the dropped level is 1, so including every level is unnecessary and would only inflate our model.

In [None]:
df.head()

We are ready for modelling.

# **Modelling**

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler,QuantileTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score,StratifiedKFold,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Separate the target column `Type` from the predictor columns.
y = df["Type"]
x = df.drop(columns=["Type"])

In [None]:
# Hold out 30% of the rows for testing. `stratify=y` keeps the proportions of each
# `Type` class the same in the training and test parts, so no class ends up under-
# represented in the small test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
# Report the size of every part of the split.
for label, part in (("x_train", x_train), ("x_test", x_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(label, len(part))

# **Cross Validation & Modelling**

I will try KNN, SVC, Random Forest and Decision Tree. I won't use hyperparameter tuning or grid search since our variables are pretty clear and I don't want to push my model too hard; I will keep it basic.

In [None]:
# Parallel lists filled by the model cells: a label in Classifiers and, at the same
# position, its score in Scores.
Classifiers, Scores = [], []

Start with KNN

In [None]:
# Evaluate KNN for several neighbourhood sizes.
# NOTE(review): the features are not scaled anywhere before this point although KNN is
# distance-based — consider adding a scaler.
for i in [5,6,7,8,9,10]:
    knn=KNeighborsClassifier(n_neighbors=i)
    # cross_val_score re-fits a fresh copy of the estimator on every fold, so it must be
    # given the training split; running it on the test split would tune and score on the
    # held-out data and ignore the training data entirely.
    Scores.append(cross_val_score(knn, x_train, y_train, cv=5).mean())
    Classifiers.append("Knn{}".format(str(i)))
    # Fit on the full training split and inspect the errors on the untouched test split.
    knn.fit(x_train,y_train)
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, knn.predict(x_test)),annot=True,ax=ax)
    ax.set_title("Knn{}".format(str(i)))

SVC

In [None]:
# Support vector classifier with default hyperparameters.
# NOTE(review): the features are not scaled anywhere before this point — consider
# adding a scaler, since SVC is sensitive to feature scale.
svc=SVC(random_state = 5)
# Cross-validate on the training split only: cross_val_score re-fits the estimator on
# every fold, so passing the test split would score the model on data it was tuned on.
Scores.append(cross_val_score(svc, x_train, y_train, cv=5).mean())
Classifiers.append("Svc")
# Fit on the full training split and inspect the errors on the untouched test split.
svc.fit(x_train,y_train)
sns.heatmap(confusion_matrix(y_test,svc.predict(x_test)),annot=True)

Random Forest

In [None]:
# Evaluate random forests with different numbers of trees.
for i in [30,60,80,100]:
    rf=RandomForestClassifier(n_estimators=i,random_state = 5)
    # Cross-validate on the training split only: cross_val_score re-fits the estimator on
    # every fold, so passing the test split would score the model on data it was tuned on.
    Scores.append(cross_val_score(rf, x_train, y_train, cv=5).mean())
    Classifiers.append("Rf{}".format(str(i)))
    # Fit on the full training split and inspect the errors on the untouched test split.
    rf.fit(x_train,y_train)
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, rf.predict(x_test)),annot=True,ax=ax)
    ax.set_title("Rf{}".format(str(i)))

Decision Tree

In [None]:
# Single decision tree with default hyperparameters.
dtc=DecisionTreeClassifier(random_state = 5)
# Cross-validate on the training split only: cross_val_score re-fits the estimator on
# every fold, so passing the test split would score the model on data it was tuned on.
Scores.append(cross_val_score(dtc, x_train, y_train, cv=5).mean())
Classifiers.append("Dtc")
# Fit on the full training split and inspect the errors on the untouched test split.
dtc.fit(x_train,y_train)
sns.heatmap(confusion_matrix(y_test, dtc.predict(x_test)),annot=True)

Visualize Scores.

In [None]:
# Pair every classifier label with its mean cross-validation score, rank them from
# best to worst, and draw the ranking as a bar chart.
score_rows = list(zip(Classifiers, Scores))
graph_data = (
    pd.DataFrame(score_rows, columns=['Classifiers', 'Scores'])
    .sort_values("Scores", ascending=False)
)
plt.figure(figsize=(16,8))
sns.barplot(data=graph_data, x="Classifiers", y="Scores")

In [None]:
graph_data.head(20)

We can see that Random Forest and Decision Tree are the best models. We can combine them.

# **Ensemble Model**

In [None]:
# Hard-voting ensemble of one decision tree and two random forests (60 and 30 trees),
# trained on the training split and scored on the test split.
voters = [
    ('dtc', DecisionTreeClassifier(random_state=5)),
    ('Rf60', RandomForestClassifier(n_estimators=60, random_state=5)),
    ('Rf30', RandomForestClassifier(n_estimators=30, random_state=5)),
]
Last_Model = VotingClassifier(estimators=voters, voting="hard", n_jobs=-1).fit(x_train, y_train)
test_predictions = Last_Model.predict(x_test)
print(accuracy_score(y_test, test_predictions))

Our accuracy seems to be 100%, but that is because we only have 240 data points; if we had more, it would likely decrease.

In [None]:
# Confusion matrix of the ensemble's predictions on the test split.
ensemble_cm = confusion_matrix(y_test, Last_Model.predict(x_test))
sns.heatmap(ensemble_cm, annot=True)

We have no problems here, since all the predictions are correct, but we still need a bigger dataset to create a better model. 100% accuracy can be misleading.