In [None]:
import numpy as np
import pandas as pd
import math
import plotly.graph_objs as go
import matplotlib.pyplot as plt 
import plotly.offline as py
from plotly import tools
import plotly.figure_factory as ff
import seaborn as sns
import warnings
py.init_notebook_mode(connected=True)
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Read the competition tables from the Kaggle input directory.
# For a local run, drop the '../input/' prefix and read the same file names
# from the working directory instead.
meta_training_set = pd.read_csv("../input/training_set_metadata.csv")
test_set_meta = pd.read_csv("../input/test_set_metadata.csv")
training_set = pd.read_csv('../input/training_set.csv')

In [None]:
# Stack the training and test metadata into one table. The original row indexes
# are kept (so index values repeat), and sort=True orders the columns when the
# two tables do not share exactly the same columns.
full_meta_data = pd.concat(objs=[meta_training_set, test_set_meta], sort=True)


## Meta_training data

In [None]:
# Show five randomly chosen metadata rows; no random_state is passed, so the
# displayed rows differ from run to run.
print("sample of meta_training data")
meta_training_set.sample(5)

In [None]:
# Summarise the training metadata table (columns, non-null counts, dtypes).
print("training_set_metadata info")
meta_training_set.info()

The columns in the training_set metadata represent:

- **object_id:** unique object identifier.
- **ra:** right ascension, sky coordinate: co-longitude in degrees. 
- **decl:** declination, sky coordinate: co-latitude in degrees. 
- **gal_l:** galactic longitude in degrees. 
- **gal_b:** galactic latitude in degrees. 
- **ddf:** A Boolean flag identifying whether the object comes from the DDF survey area (DDF = 1 for the DDF, DDF = 0 for the WFD survey).
- **hostgal_specz:** the spectroscopic redshift of the source. This is an extremely accurate measure of redshift, available for the training set and a small fraction of the test set. 
- **hostgal_photoz:** The photometric redshift of the host galaxy of the astronomical source. While this is meant to be a proxy for hostgal_specz, there can be large differences between the two and should be regarded as a far less accurate version of hostgal_specz. 
- **hostgal_photoz_err:** The uncertainty on the hostgal_photoz based on LSST survey projections. 
- **distmod:** The distance to the source calculated from hostgal_photoz by using General Relativity and assumed values of the dark energy and dark matter content of the Universe.
- **MWEBV:** MW E(B-V). This ‘extinction’ of light is a property of the Milky Way (MW) dust along the line of sight to the astronomical source, and is thus a function of the sky coordinates of the source ra, decl. This is used to determine a passband-dependent dimming and reddening of light from astronomical sources as described in subsection 2.1, and based on the Schlafly et al. (2011) and Schlegel et al. (1998) dust models. 
- **target:** The class of the astronomical source. This is provided in the training data. Correctly determining the target (correctly assigning classification probabilities to the objects) is the ‘goal’ of the classification challenge for the test data. Note that there is one class in the test set that does not occur in the training set: class_99 serves as an "other" class for objects that don't belong in any of the 14 classes in the training set. 


## Training data

In [None]:
# Show five randomly chosen light-curve observations; no random_state is
# passed, so the displayed rows differ from run to run.
print ("sample of training data")
training_set.sample(5)

In [None]:
# Summarise the light-curve table (columns, non-null counts, dtypes).
print("training_set info")
training_set.info()

The columns in training_set represent:
- **object_id:** Same key as in the metadata, given as int32 numbers.
- **mjd:** the time in Modified Julian Date (MJD) of the observation. The MJD can
be converted to Unix epoch time with the formula unix time = (MJD − 40587) × 86400.
The units are days, and the numbers are given as float64 numbers.
- **passband:** The specific LSST passband integer, such that u, g, r, i, z, y = 0, 1, 2, 3, 4, 5
in which it was viewed. These are given as int8 numbers.
- **flux:** the measured flux (brightness) in the passband of observation as listed in the
passband column.
- **flux_err:** the uncertainty on the measurement of the flux listed above. 
- **detected:** If detected= 1, the object’s brightness is significantly different at the 3σ
level relative to the reference template. This is given as a Boolean flag.

## NAN count

In [None]:
# Count missing values per column once for each metadata table. 'target' is
# dropped from the training table before counting so that only the feature
# columns are plotted.
train_nan_counts = meta_training_set.drop(['target'], axis=1).isnull().sum()
test_nan_counts = test_set_meta.isnull().sum()

# Side-by-side horizontal bars: training metadata on the left, test on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].barh(train_nan_counts.index, train_nan_counts.values)
ax[0].set_xlabel("NAN count")
ax[0].set_title("NAN count per feature\n(training_meta data)")
ax[1].barh(test_nan_counts.index, test_nan_counts.values)
ax[1].set_xlabel("NAN count")
ax[1].set_title("NAN count per feature\n(test_meta data)")
# Keep the right panel's column labels from overlapping the left panel.
fig.tight_layout();

## Classes analysis

The classes are the classification of objects according to their light curves. The challenge is to analyze each set of light curves and determine the probability that each object belongs to each of these classes. So, the classes represent the target in the challenge.

In [None]:
# Distinct class labels, in order of first appearance in the training metadata.
targets_classes = meta_training_set.target.unique()
print ("There are {} unique classes.".format(len(targets_classes)))
for class_id in targets_classes:
    print("class_{}".format(class_id))

In [None]:
# One row per class with the number of objects in it, most populated class
# first, plus a display label of the form "class_<id>".
objects_per_target = (
    meta_training_set
    .groupby("target", as_index=False)["object_id"]
    .count()
    .sort_values(by="object_id", ascending=False)
)
objects_per_target['target_list'] = ["class_{}".format(target) for target in objects_per_target.target]
objects_per_target = objects_per_target.rename(columns={"object_id": "objects_count"})


In [None]:
# Donut chart of each class's share of the training objects; hovering a sector
# shows its label and percentage.
pie_plot = go.Pie(
    labels=objects_per_target['target_list'],
    values=objects_per_target['objects_count'],
    hole=.3,
    hoverinfo="label+percent",
)
layout = go.Layout(title="Classes distribution ")
fig = go.Figure(data=[pie_plot], layout=layout)
py.iplot(fig)

In [None]:
# Bar chart of the number of training objects in each class (same ordering as
# objects_per_target, i.e. most populated class first).
bar_trace = go.Bar(
    x=objects_per_target.target_list,
    y=objects_per_target.objects_count,
    name="objects count",
    hoverinfo="name + y",
    opacity=0.7,
    marker=dict(
        color='#f0000a',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
)
layout = go.Layout(
    title='Count of objects per class',
    xaxis=dict(tickangle=-45),
    yaxis=dict(title=" Number of objects"),
)
fig = go.Figure(data=[bar_trace], layout=layout)
py.iplot(fig)

### Classes distribution in space

In [None]:
# Palette indexed by class position. Every entry must be a valid CSS colour
# name; the previous last entry ('heather') was not one, so it is replaced.
colors = ['blue','gray','red','green','pink',
          'steelblue','yellow','magenta','brown',
          'orange','tan','seagreen','mintcream',
          'yellowgreen','chocolate','rosybrown',
          'dodgerblue','slategray']

# Axis and figure styling is identical for every class, so build it once.
tick_font = dict(family='Arial', size=13, color='rgb(180, 180, 180)')
xaxis_style = dict(
    showline=True,
    showgrid=True,
    showticklabels=True,
    linecolor='rgb(150, 150, 150)',
    linewidth=2,
    gridcolor='rgb(90, 90, 90)',
    ticks='outside',
    tickcolor='rgb(80, 80, 80)',
    tickwidth=2,
    ticklen=5,
    tickfont=tick_font,
)
yaxis_style = dict(
    showgrid=True,
    zeroline=True,
    showline=False,
    gridcolor='rgb(80, 80, 80)',
    showticklabels=True,
    tickcolor='rgb(150, 150, 150)',
    tickwidth=2,
    ticklen=5,
    tickfont=tick_font,
)

# One small figure per class (most populated class first) showing where its
# objects lie in galactic coordinates.
for i, (target, target_label) in enumerate(zip(objects_per_target.target.values,
                                               objects_per_target.target_list.values)):
    class_ = meta_training_set[meta_training_set.target == target]
    # Hover text must be a per-point list: a single string would be shown for
    # every marker, and indexing the class rows by the loop counter fails for
    # classes with few objects.
    hover_text = ["Longitude = {} °".format(lon) + "<br>" + "Latitude = {} °".format(lat)
                  for lon, lat in zip(class_['gal_l'].values, class_['gal_b'].values)]
    trace = go.Scatter(
        x=class_['gal_l'],
        y=class_['gal_b'],
        mode='markers',
        # Wrap around the palette instead of failing if there are more classes than colours.
        marker=dict(color=colors[i % len(colors)]),
        text=hover_text,
        hoverinfo="text",
        connectgaps=True,
        name=target_label,
        textfont=dict(family='Arial', size=12),
    )
    layout = go.Layout(
        title=target_label + " distribution in space",
        xaxis=xaxis_style,
        yaxis=yaxis_style,
        font=dict(family='Arial', size=12, color='rgb(180, 180, 180)'),
        showlegend=True,
        width=600,
        height=300,
        paper_bgcolor='rgba(0, 0, 0,.9)',
        plot_bgcolor='rgba(0, 0, 0,0)',
    )

    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)
    

In [None]:
# One scatter trace per class, all drawn on a single galactic-coordinate map.
traces = []
for i, target in enumerate(targets_classes):
    class_ = meta_training_set[meta_training_set.target == target]
    class_label = "class_{}".format(target)
    # Hover text must be a per-point list: a single string would be shown for
    # every marker, and indexing the class rows by the loop counter fails for
    # classes with few objects.
    hover_text = [class_label + "<br>" + "Longitude = {} °".format(lon) + "<br>" + "Latitude = {} °".format(lat)
                  for lon, lat in zip(class_['gal_l'].values, class_['gal_b'].values)]
    traces.append(go.Scatter(
        x=class_['gal_l'],
        y=class_['gal_b'],
        mode='markers',
        # Wrap around the palette instead of failing if there are more classes than colours.
        marker=dict(color=colors[i % len(colors)]),
        text=hover_text,
        hoverinfo="text",
        connectgaps=True,
        name=class_label,
        textfont=dict(family='Arial', size=12),
    ))

layout = go.Layout(
    title="Classes distribution in space",
    xaxis=dict(
        title="Galactical Longitude (°)",
        showline=True,
        showgrid=True,
        showticklabels=True,
        linecolor='rgb(150, 150, 150)',
        linewidth=2,
        gridcolor='rgb(90, 90, 90)',
        ticks='outside',
        tickcolor='rgb(80, 80, 80)',
        tickwidth=2,
        ticklen=5,
        tickfont=dict(color='rgb(180, 180, 180)'),
    ),
    yaxis=dict(
        title="Galactical Latitude (°)",
        showgrid=True,
        zeroline=True,
        showline=False,
        gridcolor='rgb(80, 80, 80)',
        showticklabels=True,
        tickcolor='rgb(150, 150, 150)',
        tickwidth=2,
        ticklen=5,
        tickfont=dict(color='rgb(180, 180, 180)'),
    ),
    font=dict(family='Arial', size=12, color='rgb(200, 200, 200)'),
    showlegend=True,
    width=750,
    height=550,
    paper_bgcolor='rgba(0, 0, 0,.9)',
    plot_bgcolor='rgba(0, 0, 0,0)',
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)

# Second view of the same traces with nearly transparent markers, so that
# densely populated sky regions stand out as brighter patches.
for trace in traces:
    trace.marker.opacity = 0.1
layout.title = "Classes distribution in space (low opacity) "
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)

The highlighted spots in the low-opacity figure show the most congested areas in the sky. This is inspired by this [kernel](https://www.kaggle.com/hrmello/dataset-overview-exploration-and-comments).

### Correlation between classes and features

In [None]:
# The galactic feature marks whether the object is inside the Milky Way
# (hostgal_photoz == 0 -> galactic == 1) or outside it (galactic == 0).
meta_training_set['galactic'] = (meta_training_set['hostgal_photoz'] == 0).astype(int)

def is_class (class_number):
    """Add a 0/1 column named '_<class_number>' to meta_training_set.

    The column is 1 for rows whose target equals class_number and 0 otherwise.
    Mutates the module-level meta_training_set in place and returns None.
    """
    meta_training_set['_{}'.format(class_number)] = (meta_training_set.target == class_number).astype(int)

for class_num in objects_per_target.target:
    is_class(class_num)

corr_meta = meta_training_set.drop(['object_id','target'],axis=1).corr()

# Select the class-indicator rows and the feature columns by name rather than
# by a hard-coded position, so the heatmap stays correct if the number of
# metadata columns changes.
class_columns = ['_{}'.format(class_num) for class_num in objects_per_target.target]
feature_columns = [column for column in corr_meta.columns if column not in class_columns]

plt.subplots(figsize=(12,10))
sns.heatmap(corr_meta.loc[class_columns, feature_columns],annot=True);


 ### Compute classes weights
 
According to sklearn documentation,  class weights will be given by  **n_samples / (n_classes * np.bincount(y))** . 

***y*** is array of original class labels per sample.

In [None]:
from sklearn.utils import class_weight
# Arguments are passed by keyword: recent scikit-learn releases make them
# keyword-only, and the keyword form also works on older releases.
# The weights come back in the order of `classes`, i.e. the row order of
# objects_per_target, which the zip below relies on.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=objects_per_target.target.values,
                                                 y=meta_training_set['target'].values)
print("Classes weights:"+"\n\n",list(zip(objects_per_target.target,class_weights)))

In [None]:
# Bar chart of the balanced class weight for each class; x labels follow the
# row order of objects_per_target, matching the order of class_weights.
bar_trace = go.Bar(
    x=objects_per_target.target_list,
    y=class_weights,
    name="class weight",
    hoverinfo="name + y",
    opacity=0.7,
    marker=dict(
        color='#f0000a',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
)
layout = go.Layout(
    title='Classes weights',
    xaxis=dict(tickangle=-45),
    yaxis=dict(title=" class weight "),
)
fig = go.Figure(data=[bar_trace], layout=layout)
py.iplot(fig)

## Distmod and redshift

In [None]:
# Histogram + density curve of distmod over the training metadata; rows with a
# missing distmod are dropped before plotting.
group_labels = ['distmod distribution']
dist_plot = ff.create_distplot(
    hist_data=[meta_training_set['distmod'].dropna().values],
    group_labels=group_labels,
    colors=['rgba(5,20,100,.8)'],
    bin_size=.3,
)
dist_plot['layout'].update(
    title="distmod distribution plot",
    height=450,
    width=700,
)
py.iplot(dist_plot)


## Galactic vs Extragalactic 

As described in the data note, the redshift of objects in our own Milky Way galaxy is given as zero. Accordingly, we can classify the objects as Galactic (the object is inside the Milky Way galaxy) or Extragalactic (the object is outside the Milky Way galaxy).

In [None]:
# A photometric redshift of exactly 0 marks a Milky Way object: flag those as
# galactic (1) and everything else as extragalactic (0).
full_meta_data['galactic'] = (full_meta_data['hostgal_photoz'] == 0).astype(int)
test_set_meta['galactic'] = (test_set_meta['hostgal_photoz'] == 0).astype(int)

GALACTIC_LABELS = {0: "Extragalactic", 1: "Galactic"}

def _galactic_pie(meta, domain):
    """Build a donut trace of galactic vs extragalactic counts for one table.

    value_counts() orders its result by frequency, so each label is derived
    from the flag value it counts instead of being assumed from position.

    meta   -- metadata DataFrame that has a 0/1 'galactic' column
    domain -- plotly domain dict (x/y ranges) placing the pie in the figure
    """
    counts = meta["galactic"].value_counts().sort_index()
    return go.Pie(values=counts.values,
                  labels=[GALACTIC_LABELS[flag] for flag in counts.index],
                  hoverinfo="label+percent",
                  domain=domain,
                  hole=0.4)

# Layout: test (top-left), train (top-right), combined (bottom, full width).
pie_plot_test = _galactic_pie(test_set_meta, dict(x=[0, .5], y=[0.5, 1]))
pie_plot_train = _galactic_pie(meta_training_set, dict(x=[0.5, 1], y=[0.5, 1]))
pie_plot_all = _galactic_pie(full_meta_data, dict(x=[0, 1], y=[0, 0.5]))

# Each annotation is centred in the hole of the corresponding donut.
layout = go.Layout(title = "Galactic vs Extragalactic",
                  annotations = [dict(text = "test",
                                      font = dict(size=15),
                                      x=0.225,
                                      y=0.775,
                                      showarrow= False),
                                 dict(text = "train",
                                      font = dict(size=15),
                                      x=0.775,
                                      y=0.775,
                                      showarrow= False),
                                 dict(text = "all",
                                      font = dict(size=15),
                                      x=0.50,
                                      y=0.225,
                                      showarrow= False)]
                                    )

fig = go.Figure(data=[pie_plot_test,pie_plot_train,pie_plot_all], layout=layout)
py.iplot(fig)

The "test" pie shows the test dataset, the "train" pie shows the training dataset, and the "all" pie shows the combination of the two datasets.

In [None]:
# Per-class object counts, split by the galactic flag. Boolean masks are used
# instead of groupby([...]).get_group(<scalar>), which recent pandas releases
# deprecate for a length-1 list of keys.
galactic_classes = meta_training_set.loc[meta_training_set['galactic'] == 1, 'target'].value_counts()
extragalactic_classes = meta_training_set.loc[meta_training_set['galactic'] == 0, 'target'].value_counts()
galactic_classes_list = ["class_{}".format(target) for target in galactic_classes.index]
extragalactic_classes_list = ["class_{}".format(target) for target in extragalactic_classes.index]

bar_trace_1  = go.Bar(x=galactic_classes_list,
                 y= galactic_classes.values,
                 marker=dict(color='#f0000a',
                             line=dict(color='rgb(8,48,107)',width=1.5,)),
                 name = "galactic",
                 opacity=0.7,
                 hoverinfo="name + y")
bar_trace_2  = go.Bar(x= extragalactic_classes_list,
                 y= extragalactic_classes.values,
                 marker=dict(color='#fff00a',
                             line=dict(color='rgb(8,48,107)',width=1.5,)),
                 name = "extragalactic",
                 opacity=0.7,
                 hoverinfo="name + y")

layout = go.Layout(title='Galactic vs Extragalactic per class',
                   xaxis=dict(tickangle=-45),
                   yaxis = dict(title = " Number of objects"))

fig = go.Figure(data=[bar_trace_1,bar_trace_2], layout=layout)
py.iplot(fig)

It appears that there is no overlap between galactic and extragalactic classes: the galactic classes are [65, 16, 92, 6, 53] and the extragalactic classes are [90, 42, 15, 62, 88, 67, 52, 95, 64].

## MWEBV
As described in the data note, MWEBV is an astronomical measure of how much redder an object appears compared to a Milky Way without dust. Larger MWEBV values correspond to more Milky Way dust along the line of sight to the objects, making the objects appear redder (shifted toward the red). High-frequency waves such as ultraviolet are scattered by small particles like dust, whereas low-frequency waves such as red and infrared are not affected much by dust.

In [None]:
# Histogram + density curve of mwebv over the training metadata; rows with a
# missing mwebv are dropped before plotting.
group_labels = ['MWEBV distribution']
dist_plot = ff.create_distplot(
    hist_data=[meta_training_set['mwebv'].dropna().values],
    group_labels=group_labels,
    colors=['rgba(5,20,100,.8)'],
    bin_size=.1,
)
dist_plot['layout'].update(
    title="MWEBV distribution plot",
    height=450,
    width=700,
)
py.iplot(dist_plot)

 ### Deep Drilling Fields (DDF) survey 
 
 Deep Drilling Fields (DDF) are small patches of the sky that will be sampled often to achieve great depth
(i.e. to be able to measure the flux from fainter objects). Objects in these DDF patches
will have light-curve points that are extremely well determined and therefore have small
errors in flux. The Wide-Fast-Deep (WFD) survey covers a larger part of the sky (almost
400 times the area of the deep fields) that will be observed less frequently (and so light-
curve points will have larger uncertainties), but will discover many more objects over the
larger area.

In [None]:

DDF_LABELS = {0: "Outside DDF", 1: "Inside DDF"}

def _ddf_pie(meta, domain):
    """Build a donut trace of outside-DDF vs inside-DDF counts for one table.

    value_counts() orders its result by frequency, so each label is derived
    from the ddf flag value it counts instead of being assumed from position.

    meta   -- metadata DataFrame that has a 0/1 'ddf' column
    domain -- plotly domain dict (x/y ranges) placing the pie in the figure
    """
    counts = meta["ddf"].value_counts().sort_index()
    return go.Pie(values=counts.values,
                  labels=[DDF_LABELS[flag] for flag in counts.index],
                  hoverinfo="label+percent",
                  domain=domain,
                  hole=0.4)

# Layout: test (top-left), train (top-right), combined (bottom, full width).
pie_plot_test = _ddf_pie(test_set_meta, dict(x=[0, .5], y=[0.5, 1]))
pie_plot_train = _ddf_pie(meta_training_set, dict(x=[0.5, 1], y=[0.5, 1]))
pie_plot_all = _ddf_pie(full_meta_data, dict(x=[0, 1], y=[0, 0.5]))

# Each annotation is centred in the hole of the corresponding donut.
layout = go.Layout(title = "Object distribution according to DDF survey",
                  annotations = [dict(text = "test",
                                      font = dict(size=15),
                                      x=0.225,
                                      y=0.775,
                                      showarrow= False),
                                 dict(text = "train",
                                      font = dict(size=15),
                                      x=0.775,
                                      y=0.775,
                                      showarrow= False),
                                 dict(text = "all",
                                      font = dict(size=15),
                                      x=0.50,
                                      y=0.225,
                                      showarrow= False)]
                                    )

fig = go.Figure(data=[pie_plot_test,pie_plot_train,pie_plot_all], layout=layout)
py.iplot(fig)

In [None]:
# Per-class counts of the training objects flagged ddf == 1. A boolean mask is
# used instead of groupby([...]).get_group(<scalar>), which recent pandas
# releases deprecate for a length-1 list of keys.
inside_ddf_target = meta_training_set.loc[meta_training_set['ddf'] == 1, 'target'].value_counts()
inside_ddf_target_list = ["class_{}".format(target) for target in inside_ddf_target.index]
bar_trace  = go.Bar(x= inside_ddf_target_list,
                 y= inside_ddf_target.values,
                 marker=dict(color='#f0000a',
                             line=dict(color='rgb(8,48,107)',width=1.5,)),
                 name = "objects count",
                 opacity=0.7,
                 hoverinfo="name + y")

layout = go.Layout(title='Count of objects per class (inside DDF survey area)',
                   xaxis=dict(tickangle=-45),
                   yaxis = dict(title = " Number of objects"))

fig = go.Figure(data=[bar_trace], layout=layout)

py.iplot(fig)

## Light curves visualization 

There are six passbands denoted u, g, r, i, z, y (0,1,2,3,4,5 in the dataset) that select light within different wavelength ranges: wavelengths between 300 and 400 nanometers for the u band, between 400 and 600 nm for the g passband, between 500 and 700 nm for the r band, between 650 and 850 for the i band, between 800 and 950 nm for the z band, and between 950 and 1050 nm for the y band.

In [None]:
# Split the light-curve table into one frame per passband code (0..5).
(training_passband_0,
 training_passband_1,
 training_passband_2,
 training_passband_3,
 training_passband_4,
 training_passband_5) = (training_set[training_set['passband'] == band] for band in range(6))

In [None]:
def plot_class_time_series (class_,training_passband):
    """Plot flux against mjd for up to six randomly chosen objects of one class.

    Parameters
    ----------
    class_ : value compared against meta_training_set['target'] to pick the class.
    training_passband : DataFrame of observations with 'object_id', 'mjd' and
        'flux' columns (e.g. the rows of a single passband).

    Draws a 6-row matplotlib figure titled 'class_<class_>' and returns None.
    Objects are drawn without replacement, so no object appears twice; if the
    class has fewer than six objects the remaining panels are left empty.
    No random seed is set here, so the selection differs between runs.
    """
    n_panels = 6
    class_object_ids = meta_training_set.loc[meta_training_set['target'] == class_, 'object_id'].values
    # A shuffled prefix gives distinct objects and is safe for small or empty classes.
    sampled_ids = np.random.permutation(class_object_ids)[:n_panels]

    f, ax = plt.subplots(n_panels,figsize=(8, 12))
    f.suptitle('class_{}'.format(class_))
    for panel, object_id in zip(ax, sampled_ids):
        # Filter once per object and reuse it for both the markers and the line.
        light_curve = training_passband[training_passband['object_id'] == object_id]
        panel.scatter(light_curve['mjd'], light_curve['flux'])
        panel.plot(light_curve['mjd'], light_curve['flux'])
        panel.set_xlabel('')
        panel.set_ylabel('flux')
    # Only the bottom panel carries the shared time-axis label.
    ax[-1].set_xlabel('mjd time')
    f.tight_layout()
    # Leave room for the suptitle above the first panel.
    f.subplots_adjust(top=.95)


In [None]:
# Draw sample light curves in passband 0 for the first five entries of targets_classes.
print ("sample of light curves of u passband (passband_0) ")
for class_position in range(5):
    plot_class_time_series(targets_classes[class_position], training_passband_0)