In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the SkyServer CSV export into a DataFrame and preview its first rows.
sky_data = pd.read_csv(
    "../input/sloan-digital-sky-survey/Skyserver_SQL2_27_2018 6_51_39 PM.csv"
)
sky_data.head()

In [None]:
# Summarise the raw table: column names, dtypes and non-null counts.
sky_data.info()

In [None]:
import seaborn as sns

# Work on a copy of the raw table without the columns listed below.
sub = sky_data.drop(columns=["objid", "specobjid", "run", "rerun", "camcol", "field", "plate", "mjd", "fiberid"])
print(sub["class"].unique())
cleanup_nums = {"class": {"STAR": 0, "GALAXY": 1, "QSO": 2}}
# Encode the label with Series.map rather than DataFrame.replace: map always
# produces a numeric column, whereas replace depends on implicit object-dtype
# downcasting, which recent pandas versions deprecate. A label that is missing
# from the mapping becomes NaN instead of silently remaining a string.
sub["class"] = sub["class"].map(cleanup_nums["class"])
sub.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Separate the feature columns from the label before standardising them.
temp = sub.drop(columns="class")
print(temp.columns)

# Standardise every feature column and rebuild a DataFrame that carries the
# original column names (fit_transform returns a bare array).
scaler = StandardScaler()
standardized_values = scaler.fit_transform(temp.values)
scaled = pd.DataFrame(standardized_values, columns=temp.columns)
scaled.head()

In [None]:
# Put the class label next to the standardised features. The original call
# discarded its result and used the default axis=0, which stacks the label
# underneath the features instead of adding it as a column.
scaled_sub = pd.concat((sub['class'], scaled), axis=1)
# Pairwise correlations of the label and the standardised features.
scaled_sub.corr()

In [None]:
# Visualise the pairwise correlation matrix of `sub` as a heatmap.
sns.heatmap(sub.corr())

In [None]:
# Pairwise plots of the columns in `sub`, coloured by the encoded class label.
sns.pairplot(sub, hue="class")

In [None]:
# Box plot of redshift for each class, drawn from the raw table.
sns.boxplot(data=sky_data, y='redshift', x='class')

In [None]:
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np

# Redshift values of each object class, selected from the raw table.
star, galaxy, quasar = (
    sky_data.loc[sky_data['class'] == label, 'redshift']
    for label in ('STAR', 'GALAXY', 'QSO')
)

f, axs = plt.subplots(3, 3, figsize=(22.5, 15))

# One row per class; columns show the raw, log1p and square-root values, each
# with a fitted normal curve overlaid.
# NOTE: np.sqrt yields NaN for any negative input value.
transforms = (lambda values: values, np.log1p, np.sqrt)
for row, redshift in enumerate((star, galaxy, quasar)):
    for col, transform in enumerate(transforms):
        sns.distplot(transform(redshift), fit=norm, ax=axs[row, col])
plt.show()

In [None]:
# Build a copy of `sub` whose redshift column holds log(1 + redshift);
# `sub` itself is left untouched.
temp = sub.assign(redshift=np.log1p(sub['redshift']))
temp.info()

In [None]:
f, axs = plt.subplots(2, 3, figsize=(22.5, 10))
# The first five panels (row-major order) show one column each from the raw table.
for ax, band in zip(axs.flat, ('u', 'g', 'r', 'i', 'z')):
    sns.boxplot(data=sky_data, y=band, x='class', ax=ax)
# The last panel shows the log1p-transformed redshift from `temp`.
sns.boxplot(data=temp, y='redshift', x='class', ax=axs[1, 2])

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb # regression model

def validation_predictions(model, name):
    """Fit ``model``, then print hold-out metrics and shuffle-split CV scores.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_valid``,
    ``y_valid``, ``X`` and ``y``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` that ``cross_val_score``
        accepts.
    name : str
        Label printed in the report header.

    Returns
    -------
    None
        All results are printed. On return, ``model`` has been fitted on the
        full ``X`` / ``y``.
    """
    new_model = model.fit(X_train, y_train)
    # Score on the target's own scale. np.exp is only a valid inverse transform
    # for a log-transformed target; applying it to an untransformed target
    # distorts every metric below.
    pred = new_model.predict(X_valid)
    print("============= %s and Shuffle Split =============" %name)
    # r2_score is the coefficient of determination, so it is labelled as such
    # rather than as an accuracy.
    print("R2: %f" %(r2_score(y_valid, pred)))
    print("RMSE: %f" %(np.sqrt(mean_squared_error(y_valid, pred))))
    print("MAE: %f" %(mean_absolute_error(y_valid, pred)))

    # Refit on all data so the estimator's final state matches what callers got
    # before. cross_val_score clones the estimator for each split, so this fit
    # does not influence the scores below.
    new_model = model.fit(X, y)
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    cvs = cross_val_score(new_model, X, y, cv=cv)
    print('Shuffle and cross validate: %s \nAverage: %.4f' %(cvs, cvs.mean()))

In [None]:
# Features are every column of `temp` except the label; the label is `class`.
X = temp.drop(columns=['class'])
y = temp['class']
# Hold out a validation split with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

In [None]:
# Evaluate a linear regression on the integer-encoded class label.
lr_model = LinearRegression()
validation_predictions(lr_model, 'Linear Regression')

In [None]:
# Evaluate a decision tree regressor (fixed seed) with the same report.
dt_model = DecisionTreeRegressor(random_state=0)
validation_predictions(dt_model, 'Decision Tree Regressor')

In [None]:
# Evaluate a random forest regressor (fixed seed) with the same report.
rf_model = RandomForestRegressor(random_state=0)
validation_predictions(rf_model, 'Random Forest Regressor')

In [None]:
# Evaluate an XGBoost regressor with default settings using the same report.
xgb_model = xgb.XGBRegressor()
validation_predictions(xgb_model, 'XGBoost Regressor')