# Craters of Mars - Table of Contents
* [Basic Explorations](#1)
* [Geography (to be precise: Areography)](#2)
* [Largest Craters](#3)
* [Deepest Craters](#4)
* [Named Craters only](#5)

In [None]:
# show the pre-rendered Mars map image as the notebook's opening figure
from IPython.display import display, Image
display(Image(filename='../input/mars-maps/mars_map_02.PNG'))

#### Image created with Folium and base map from Open Planetary: https://www.openplanetary.org/opm/basemaps

In [None]:
# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# statistics
from statsmodels.distributions.empirical_distribution import ECDF

In [None]:
# read the crater catalogue from CSV and preview the first rows
csv_path = '../input/mars-crater-study-dataset/Mars Crater info.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# dimensions: (number of rows, number of columns)
df.shape

#### Wow, those are a lot of craters!

<a id='1'></a>
# Basic Explorations

In [None]:
# summary stats; include='all' also summarises the non-numeric columns
df.describe(include='all')

### Diameter

In [None]:
# histogram of the raw diameter values (100 bins)
ax = df['DIAM_CIRCLE_IMAGE'].plot.hist(bins=100)
ax.set_title('DIAM_CIRCLE_IMAGE')
ax.grid()
plt.show()

#### That doesn't really help; let's look at the diameters on a log scale instead:

In [None]:
# histogram of the diameters after a log10 transform (100 bins)
log_diam = np.log10(df['DIAM_CIRCLE_IMAGE'])
ax = log_diam.plot.hist(bins=100)
ax.set_title('log10(DIAM_CIRCLE_IMAGE)')
ax.grid()
plt.show()

In [None]:
# empirical CDF of log10(diameter); `ecdf` is kept as a notebook-level name
ecdf = ECDF(np.log10(df['DIAM_CIRCLE_IMAGE']))

fig, ax = plt.subplots()
ax.plot(ecdf.x, ecdf.y)
ax.set_title('Empirical CDF')
ax.set_xlabel('log10(diameter)')
ax.set_ylabel('Empirical CDF')
ax.grid()
plt.show()

In [None]:
# log-log plot of the excess probability 1 - ECDF against log10(diameter)
excess = 1 - ecdf.y
# The ECDF reaches 1 at its last step, so the excess probability is 0 there and
# log10 would yield -inf plus a divide-by-zero RuntimeWarning. Keep only the
# strictly positive part; matplotlib would not have drawn the -inf point anyway.
positive = excess > 0
plt.plot(ecdf.x[positive], np.log10(excess[positive]))
plt.xlabel('log10(diameter)')
plt.ylabel('log10(excess probability)')
plt.title('Excess probability (1 - ECDF), log-log scale')
plt.grid()
plt.show()

#### We can observe that, in this log-log plot, the excess probability is roughly linear in log10(diameter). A straight line here would correspond to a power law for the original diameter values.

### Depth

In [None]:
# histogram of the rim-to-floor depth column (100 bins)
ax = df['DEPTH_RIMFLOOR_TOPOG'].plot.hist(bins=100)
ax.set_title('DEPTH_RIMFLOOR_TOPOG')
ax.grid()
plt.show()

In [None]:
# zoom in on the distribution by keeping only the strictly positive depths
depth = df['DEPTH_RIMFLOOR_TOPOG']
ax = depth[depth > 0].plot.hist(bins=100)
ax.set_title('DEPTH_RIMFLOOR_TOPOG - positive values only')
ax.grid()
plt.show()

### Depth vs Diameter

In [None]:
# depth against diameter; low alpha so dense regions show up darker
fig, ax = plt.subplots()
ax.scatter(df['DIAM_CIRCLE_IMAGE'], df['DEPTH_RIMFLOOR_TOPOG'], alpha=0.1)
ax.set_title('Depth vs Diameter')
ax.set_xlabel('Diameter')
ax.set_ylabel('Depth')
ax.grid()
plt.show()

In [None]:
# depth against diameter, with the diameter on a log10 scale
log_diam = np.log10(df['DIAM_CIRCLE_IMAGE'])

fig, ax = plt.subplots()
ax.scatter(log_diam, df['DEPTH_RIMFLOOR_TOPOG'], alpha=0.1)
ax.set_title('Depth vs log10(Diameter)')
ax.set_xlabel('log10(Diameter)')
ax.set_ylabel('Depth')
ax.grid()
plt.show()

### Categorical/integer features

In [None]:
# bar chart of how often each NUMBER_LAYERS value occurs
layer_counts = df['NUMBER_LAYERS'].value_counts()
ax = layer_counts.plot.bar()
ax.set_title('NUMBER_LAYERS')
ax.grid()
plt.show()

In [None]:
# one bar chart per ejecta-morphology column, showing its 20 most frequent values
morph_list = ['MORPHOLOGY_EJECTA_1','MORPHOLOGY_EJECTA_2','MORPHOLOGY_EJECTA_3']
for m in morph_list:
    top_counts = df[m].value_counts().head(20)
    ax = top_counts.plot.bar()
    ax.set_title(m + ' - Top 20')
    ax.grid()
    plt.show()

#### Again, we do not really see much; let's filter out the blanks:

In [None]:
# plot morphologies again, ignoring the blanks
morph_list = ['MORPHOLOGY_EJECTA_1','MORPHOLOGY_EJECTA_2','MORPHOLOGY_EJECTA_3']
for m in morph_list:
    counts = df[m].value_counts()
    # Drop blank / whitespace-only labels explicitly instead of skipping the
    # first entry: that only worked while the blank was the most frequent value,
    # and it left 19 bars under a "Top 20" title.
    is_blank = counts.index.astype(str).str.strip() == ''
    counts[~is_blank].head(20).plot(kind='bar')
    plt.title(m + ' - Top 20 ignoring blanks')
    plt.grid()
    plt.show()

<a id='2'></a>
# Geography (to be precise: Areography)

#### Let's plot all craters first:

In [None]:
# static plot of all craters
plt.figure(figsize=(14,8))
plt.scatter(x=df.LONGITUDE_CIRCLE_IMAGE,
            y=df.LATITUDE_CIRCLE_IMAGE,
            s=df.DIAM_CIRCLE_IMAGE,
            color='red',
            alpha=0.1)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.title('Mars Craters')
plt.show()

In [None]:
# plot the "density" (hexbin) and the marginals distributions of lat/lon
# hexbin "density" of crater positions in the centre panel,
# with the marginal distributions of longitude and latitude on the sides
sns.jointplot(data=df, x='LONGITUDE_CIRCLE_IMAGE',
              y='LATITUDE_CIRCLE_IMAGE', kind='hex', color='red',
              height=8)
plt.show()

#### The smooth, equator-peaked shape of the latitude distribution is driven by geometry: latitude strips close to the equator have a much larger area than latitude strips close to the poles!

#### Now let's plot only the larger craters:

In [None]:
# static plot of craters
# static plot of the larger craters only
# One threshold (in km) feeds the filter, the printed count and the plot title,
# so the three cannot drift apart when the cut-off is changed.
min_diam_km = 50
df_large = df[df.DIAM_CIRCLE_IMAGE > min_diam_km]
print(df_large.shape[0], f'craters > {min_diam_km} km.')

plt.figure(figsize=(14,8))
# marker size follows the crater diameter; black edges keep overlapping markers distinguishable
plt.scatter(x=df_large.LONGITUDE_CIRCLE_IMAGE,
            y=df_large.LATITUDE_CIRCLE_IMAGE,
            s=df_large.DIAM_CIRCLE_IMAGE,
            edgecolors='black',
            linewidths=1,
            color='red',
            alpha=0.25)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.title(f'Mars Craters > {min_diam_km}km diameter')
plt.show()

<a id='3'></a>
# Largest Craters

In [None]:
# the n_top craters with the largest diameter, as an independent copy
n_top = 20
df_top = (
    df.sort_values(by='DIAM_CIRCLE_IMAGE', ascending=False)
      .iloc[:n_top]
      .copy()
)
df_top

#### Interestingly, half of the largest craters do not have a name!

In [None]:
# interactive plot of LARGEST craters - colour encodes depth, marker size encodes diameter
fig = px.scatter(
    df_top,
    x='LONGITUDE_CIRCLE_IMAGE',
    y='LATITUDE_CIRCLE_IMAGE',
    size='DIAM_CIRCLE_IMAGE',
    color='DEPTH_RIMFLOOR_TOPOG',
    hover_name='CRATER_NAME',
    opacity=0.5,
)
fig.update_layout(
    title_text='Largest Mars Craters',
    xaxis_title_text='Longitude',
    yaxis_title_text='Latitude',
)
fig.show()

<a id='4'></a>
# Deepest Craters

In [None]:
# the n_top craters with the largest rim-to-floor depth, as an independent copy
n_top = 20
df_top_d = (
    df.sort_values(by='DEPTH_RIMFLOOR_TOPOG', ascending=False)
      .iloc[:n_top]
      .copy()
)
df_top_d

In [None]:
# interactive plot of DEEPEST craters - colour encodes depth, marker size encodes diameter
fig = px.scatter(
    df_top_d,
    x='LONGITUDE_CIRCLE_IMAGE',
    y='LATITUDE_CIRCLE_IMAGE',
    size='DIAM_CIRCLE_IMAGE',
    color='DEPTH_RIMFLOOR_TOPOG',
    hover_name='CRATER_NAME',
    opacity=0.5,
)
fig.update_layout(
    title_text='Deepest Mars Craters',
    xaxis_title_text='Longitude',
    yaxis_title_text='Latitude',
)
fig.show()

<a id='5'></a>
# Named Craters only

In [None]:
# from here on, work only with craters whose name is not the single-blank placeholder
has_name = df['CRATER_NAME'] != ' '
df_named = df[has_name]
df_named.shape

#### Ok, we have almost 1000 NAMED craters.

In [None]:
# list the craters whose name consists of a single character
is_one_letter = df_named['CRATER_NAME'].str.len().eq(1)
df_named[is_one_letter]

#### "P" and "M" occur twice each; however, their parameters are different!

In [None]:
# check frequency / uniqueness: count how often each crater name occurs
# (value_counts sorts by count, so any repeated names show up at the top)
df_named.CRATER_NAME.value_counts()

#### => Luckily, "P" and "M" are the only duplicates.

In [None]:
# static map of the NAMED craters; marker size follows the crater diameter
fig, ax = plt.subplots(figsize=(14,8))
ax.scatter(
    x=df_named['LONGITUDE_CIRCLE_IMAGE'],
    y=df_named['LATITUDE_CIRCLE_IMAGE'],
    s=df_named['DIAM_CIRCLE_IMAGE'],
    color='red',
    edgecolors='black',
    linewidths=1,
    alpha=0.5,
)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid()
ax.set_title('Named Mars Craters')
plt.show()

In [None]:
# interactive plot of NAMED craters - colour encodes depth, marker size encodes diameter
fig = px.scatter(
    df_named,
    x='LONGITUDE_CIRCLE_IMAGE',
    y='LATITUDE_CIRCLE_IMAGE',
    size='DIAM_CIRCLE_IMAGE',
    color='DEPTH_RIMFLOOR_TOPOG',
    hover_name='CRATER_NAME',
    opacity=0.5,
)
fig.update_layout(
    title_text='Named Mars Craters',
    xaxis_title_text='Longitude',
    yaxis_title_text='Latitude',
)
fig.show()

### Revisit the categorical/integer features for the named craters only:

In [None]:
# NUMBER_LAYERS frequencies, restricted to the named craters
named_layer_counts = df_named['NUMBER_LAYERS'].value_counts()
ax = named_layer_counts.plot.bar()
ax.set_title('NUMBER_LAYERS - Named Craters only')
ax.grid()
plt.show()

# one wide bar chart per ejecta-morphology column, again for named craters only
morph_list = ['MORPHOLOGY_EJECTA_1','MORPHOLOGY_EJECTA_2','MORPHOLOGY_EJECTA_3']
for m in morph_list:
    fig, ax = plt.subplots(figsize=(10,4))
    df_named[m].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(m + ' - Named Craters only')
    ax.grid()
    plt.show()