In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Dataset

In [None]:
# Read the star-type classification CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/star-type-classification/Stars.csv')

In [None]:
# Show the summary produced by DataFrame.info() for the raw frame.
data.info()

In [None]:
# Display the column labels of the raw frame.
data.columns

In [None]:
# Work on a copy so the frame loaded from CSV (`data`) stays untouched.
df = data.copy()

# Inferences

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
features_num = ['Temperature', 'L', 'R', 'A_M']

# Correlation matrices of the numeric features, once per method.
corr_pearson = df[features_num].corr(method='pearson')
corr_spearman = df[features_num].corr(method='spearman')

# Draw both matrices side by side on explicitly created axes; the right-hand
# panel shares its x axis with the left-hand one.
fig = plt.figure(figsize=(15, 5))

ax1 = fig.add_subplot(1, 2, 1)
sns.heatmap(corr_pearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax1)
ax1.set_title('Pearson Correlation')

ax2 = fig.add_subplot(1, 2, 2, sharex=ax1)
sns.heatmap(corr_spearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax2)
ax2.set_title('Spearman Correlation')

plt.show()

In [None]:
# Count how often each distinct Color label occurs (before any clean-up).
df['Color'].value_counts()

In [None]:
# Collapse spelling / capitalisation variants of the same colour into one
# canonical label. Every variant maps straight to its final label (e.g.
# 'yellowish' goes directly to 'White-Yellow'), so no value in this dict is
# also a key and the result does not depend on replacement order.
color_fixes = {
    'Blue-white': 'Blue-White',
    'Blue White': 'Blue-White',
    'Blue white': 'Blue-White',
    'yellow-white': 'White-Yellow',
    'Yellowish White': 'White-Yellow',
    'yellowish': 'White-Yellow',
    'Yellowish': 'White-Yellow',
    'white': 'White',
    'Whitish': 'White',
    'Pale yellow orange': 'Orange',
    'Orange-Red': 'Orange',
}

# Assign the whole column back instead of writing through `df.Color.loc[...]`:
# that chained form triggers pandas' chained-assignment warnings and does not
# update `df` at all when Copy-on-Write is enabled.
df['Color'] = df['Color'].replace(color_fixes)

In [None]:
# Re-count the Color labels after the relabelling above.
df.Color.value_counts()

In [None]:
# Bar chart of label frequencies for each categorical feature, one figure per feature.
for column in ('Color', 'Spectral_Class'):
    counts = df[column].value_counts()
    fig, ax = plt.subplots(figsize=(10, 4))
    counts.plot(kind='bar', ax=ax)
    ax.set_title(column)
    ax.grid()
    plt.show()

In [None]:
# Contingency table of Color against Spectral_Class, shown as an annotated heatmap.
color_by_class = pd.crosstab(df.Color, df.Spectral_Class)
sns.heatmap(color_by_class, cmap='RdYlGn', annot=True, fmt='.0f')
plt.show()

# Cleaning

In [None]:
def preprocessInputs(data):
    """Clean, encode, split and scale the star data set.

    Steps, in order:
      1. Copy ``data`` so the caller's frame is not modified.
      2. Merge spelling / capitalisation variants in the ``Color`` column.
      3. One-hot encode ``Color`` and ``Spectral_Class`` (the dummy columns are
         appended and the original columns dropped).
      4. Take ``Type`` as the target and one-hot encode it.
      5. Split 80/20 into train and test sets (shuffled, ``random_state=0``).
      6. Standardise the features with a ``StandardScaler`` fitted on the
         training rows only, so no test-set statistics leak into training.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Color``, ``Spectral_Class`` and ``Type``.
        All remaining columns are passed to the scaler as features
        (assumed numeric -- TODO confirm against the CSV).

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, keeping the column names and row index of the split.
    y_train, y_test : pandas.DataFrame
        One-hot encoded ``Type`` for the matching rows.
    """
    df = data.copy()

    # Colour fixing. Every variant maps straight to its final label, so no
    # value in this dict is also a key and replacement order does not matter.
    color_fixes = {
        'Blue-white': 'Blue-White',
        'Blue White': 'Blue-White',
        'Blue white': 'Blue-White',
        'yellow-white': 'White-Yellow',
        'Yellowish White': 'White-Yellow',
        'yellowish': 'White-Yellow',
        'Yellowish': 'White-Yellow',
        'white': 'White',
        'Whitish': 'White',
        'Pale yellow orange': 'Orange',
        'Orange-Red': 'Orange',
    }
    # Assign the column back rather than writing through `df.Color.loc[...]`:
    # the chained form triggers pandas' chained-assignment warnings and is a
    # no-op when Copy-on-Write is enabled.
    df['Color'] = df['Color'].replace(color_fixes)

    # One-hot encoding: swap each categorical column for its dummy columns.
    for column in ['Color', 'Spectral_Class']:
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df.drop(column, axis=1), dummies], axis=1)

    # Splitting into X and y; the target is one-hot encoded as well.
    y = pd.get_dummies(df['Type'])
    X = df.drop('Type', axis=1)

    # Splitting into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=0
    )

    # Scale X using statistics learned from the training rows only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature frames and one-hot targets from the raw frame.
X_train, X_test, y_train, y_test = preprocessInputs(data)

In [None]:
# Display the training targets returned by preprocessInputs.
y_train

# Training Models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Candidate regressors, keyed by a left-padded display name so the printed
# messages line up. The estimators that use randomness (tree, MLP, forest) get
# a fixed random_state -- the same value used for the train/test split -- so
# that a fresh "Run All" reproduces the same fitted models and scores.
models = {
    "     Linear Regression": LinearRegression(),
    "Linear Regression (L2)": Ridge(),
    "Linear Regression (L1)": Lasso(),
    "         Decision Tree": DecisionTreeRegressor(random_state=0),
    "        Neural Network": MLPRegressor(random_state=0),
    "         Random Forest": RandomForestRegressor(random_state=0),
}
# NOTE(review): GradientBoostingRegressor was disabled in the original list,
# presumably because it only accepts a 1-D target while y_train is a one-hot
# frame -- confirm before re-adding (e.g. wrapped in MultiOutputRegressor).

# Fit every candidate on the same training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

# Results

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix as cm

In [None]:
# Score every fitted model on the held-out test set.
for name, model in models.items():
    y_pred = model.predict(X_test)
    # r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
    # so passing the predictions first reports a different (wrong) number.
    r2 = r2_score(y_test, y_pred)
    print(name + " R^2 Score: {:.5f}".format(r2))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier on the one-hot targets. random_state is fixed
# so the confusion matrix below is the same on every run.
m = RandomForestClassifier(random_state=0)
m.fit(X_train, y_train)
pred = m.predict(X_test)

# argmax turns each indicator row back into a class position for the confusion
# matrix. NOTE(review): a predicted row with no positive entry would also map to
# position 0 -- confirm this cannot happen or is acceptable here.
cm(y_test.values.argmax(axis=1), pred.argmax(axis=1))

In [None]:
from sklearn.metrics import classification_report as clr
# Print scikit-learn's text classification report comparing the test targets
# with the random-forest predictions (`pred`) computed in the previous cell.
print(clr(y_test, pred))