# SpaceNet 7 Dataset EDA and Train/Valid split

After trying random splits with minimal test and validation parts, it became necessary to control the distribution of the validation part more carefully and to take a closer look at the change-pixel statistics.

## Import

In [None]:
import pandas as pd 
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
tqdm.pandas()
from skimage import io

In [None]:
# Annotations table produced by the pixel-counting notebook.
csv_file = Path(
    '../input',
    'calculating-number-of-pixels-for-change-in-masks',
    'annotations.csv',
)
# Directory that the `mask_path` values of the annotations are resolved against.
root_dir = Path(
    '../input',
    'spacenet-7-change-detection-chips-and-masks',
    'chip_dataset',
    'chip_dataset',
    'change_detection',
)

## Reading Annotations

In [None]:
# Load the annotations table; the cells below read its 'im_name', 'is_blank',
# 'n_change_pix', 'month_diff', 'target' and 'mask_path' columns.
df = pd.read_csv(csv_file)

In [None]:
df.head()

In [None]:
df['target'].value_counts()

## Grouping by AOIs
In order to choose which locations to hold out for the validation part, let's group our annotations by location.

In [None]:
# One group per distinct 'im_name' value; iterating yields (name, sub-frame) pairs.
groups = df.groupby('im_name')

### Extracting Location Statistics
Below we loop over all the groups, and extract some important statistics. These statistics include:

* Percentage Change
* Pixel Medians
* Pixel Means
* Pixel Standard Deviations

In [None]:
def get_stats_df(groups):
    """Summarise change statistics for every group (location).

    Args:
        groups: Iterable of ``(name, frame)`` pairs, e.g. the result of
            ``df.groupby('im_name')``. Each frame must provide the
            'is_blank' and 'n_change_pix' columns.

    Returns:
        A DataFrame with one row per group and the columns
        'location_names', 'percent_change', 'pxl_median', 'pxl_mean' and
        'pxl_std'. All statistics are truncated to ``int``; a statistic
        that is undefined for a group is reported as 0.
    """
    columns = ['location_names', 'percent_change', 'pxl_median', 'pxl_mean', 'pxl_std']

    def _as_int(value):
        # median/mean are NaN when a group has no change rows and std is NaN
        # when it has exactly one; int(NaN) raises, so report 0 instead.
        return 0 if pd.isna(value) else int(value)

    rows = []
    for name, frame in groups:
        # rows whose 'is_blank' is missing are the ones that contain change
        n_change = frame['is_blank'].isna().sum()
        percent_change = n_change / len(frame) * 100
        # number of changed pixels, restricted to rows not marked 'blank'
        change_pixels = frame.loc[frame['is_blank'] != 'blank', 'n_change_pix']

        rows.append({
            'location_names': name,
            'percent_change': int(percent_change),
            'pxl_median': _as_int(change_pixels.median()),
            'pxl_mean': _as_int(change_pixels.mean()),
            'pxl_std': _as_int(change_pixels.std()),
        })

    return pd.DataFrame(rows, columns=columns)

In [None]:
stats_df = get_stats_df(groups)

In [None]:
stats_df.head()

In [None]:
stats_df.describe()

### Analysis
As we can see, some of our locations have only up to 1 percent of their chips containing change and a median pixel change of 4. That means most of these chips barely contain any change, which may just be an artifact of the change mask preparation.

Let's create a plot and try and visualize the statistics for each location.

## Visualizations

In [None]:
def plot_mean_and_std(stats_df):
    """Show a grouped bar chart of the pixel mean and standard deviation per location.

    Args:
        stats_df: DataFrame providing the 'location_names', 'pxl_mean' and
            'pxl_std' columns.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()

    fig.add_trace(go.Bar(x=stats_df['location_names'],
                    y=stats_df['pxl_mean'],
                    name='Mean',
                    marker_color='rgb(122,81,149)'
                    ))

    fig.add_trace(go.Bar(x=stats_df['location_names'],
                    y=stats_df['pxl_std'],
                    name='Standard Deviation',
                    marker_color='rgb(239,86,117)'
                    ))

    fig.update_layout(
        title='Pixel Statistics per Satellite Image Location',
        xaxis=dict(
            # `titlefont` is a deprecated attribute that newer plotly releases
            # reject; `title.font` is the supported way to size the axis title.
            title=dict(text='Satellite Images Location ID', font=dict(size=16)),
            tickfont_size=14,
        ),
        yaxis_tickfont_size=14,

        legend=dict(
            x=0,
            y=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),

        barmode='group',
        bargap=0.2, # gap between bars of adjacent location coordinates.
        bargroupgap=0.01 # gap between bars of the same location coordinate.
    )
    fig.show()

In [None]:
plot_mean_and_std(stats_df)

We can tell from our plots that our images have a high variance when it comes to the distribution of number of pixels that change per chip. Let's have a look at our medians instead so we can get a clearer idea of what's going on.

In [None]:
# Bar chart of the median number of change pixels for every location.
fig = go.Figure()

fig.add_trace(go.Bar(x=stats_df['location_names'],
                y=stats_df['pxl_median'],
                name='Median',
                marker_color='rgb(122,81,149)'
                ))

fig.update_layout(
    title='Median Number of Change Pixels per Satellite Image Location',
    xaxis=dict(
        # `titlefont` is a deprecated attribute that newer plotly releases
        # reject; `title.font` is the supported way to size the axis title.
        title=dict(text='Satellite Images Location ID', font=dict(size=16)),
        tickfont_size=14,
    ),
    yaxis_tickfont_size=14,

    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),

    barmode='group',
    bargap=0.2, # gap between bars of adjacent location coordinates.
    bargroupgap=0.01 # gap between bars of the same location coordinate.
)
fig.show()

Let's create a helper function below to help us visualize how the number of images containing change relates to the median number of pixels that change within that location.

In [None]:
def plot_median_pixel_change_and_percentage_change(labels,percent_changes,pixel_medians,title=None):
    """Show percentage change and median change pixels side by side per location.

    Args:
        labels: x-axis values, one per location.
        percent_changes: bar heights for the 'Percentage' trace.
        pixel_medians: bar heights for the 'Median' trace.
        title: optional suffix appended (after a space) to the default
            figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    if title is None:
        title = 'Median Number of Change Pixels and Percentage Change Per Location Combined'
    else:
        title = 'Median Number of Change Pixels and Percentage Change Per Location Combined ' + title

    fig.add_trace(go.Bar(x=labels,
                    y=percent_changes,
                    name='Percentage',
                    marker_color='rgb(255,166,0)'
                    ))

    fig.add_trace(go.Bar(x=labels,
                    y=pixel_medians,
                    name='Median',
                    marker_color='rgb(122,81,149)'
                    ))

    fig.update_layout(
        title=title,
        xaxis=dict(
            # `titlefont` is a deprecated attribute that newer plotly releases
            # reject; `title.font` is the supported way to size the axis title.
            title=dict(text='Satellite Images Location ID', font=dict(size=16)),
            tickfont_size=14,
        ),
        yaxis_tickfont_size=14,

        legend=dict(
            x=0,
            y=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),

        barmode='group',
        bargap=0.2, # gap between bars of adjacent location coordinates.
        bargroupgap=0.01 # gap between bars of the same location coordinate.
    )
    fig.show()

In [None]:
plot_median_pixel_change_and_percentage_change(stats_df['location_names'],stats_df['percent_change'],stats_df['pxl_median'])

Below we create a helper function that allows us to filter our dataframe based on the median and the number of images containing the change. 

In [None]:
def filter_by_median_and_percentage_change(df,median_thresh=10,percentage_thresh=10):
    """Keep the rows whose median and percentage change both exceed their thresholds.

    Args:
        df: DataFrame with 'pxl_median' and 'percent_change' columns.
        median_thresh: rows need ``pxl_median`` strictly above this value.
        percentage_thresh: rows need ``percent_change`` strictly above this value.

    Returns:
        A new DataFrame holding the matching rows, re-indexed from 0.
    """
    above_median = df['pxl_median'].gt(median_thresh)
    above_percentage = df['percent_change'].gt(percentage_thresh)
    return df.loc[above_median & above_percentage].reset_index(drop=True)

In [None]:
filtered_df = filter_by_median_and_percentage_change(stats_df)

### Comparing Original_df and Filtered_df stats

In [None]:
stats_df.describe()

In [None]:
filtered_df.describe()

In [None]:
filtered_df.head()

In [None]:
plot_median_pixel_change_and_percentage_change(filtered_df['location_names'],filtered_df['percent_change'],filtered_df['pxl_median'])

Now we can categorize the locations of our original dataset into the following groups, based on the interquartile ranges of our pixel statistics:

* High Median, High Percentage Change
* High Median, Low Percentage Change
* Low Median, High Percentage Change
* Low Median, Low Percentage Change
* Mid Median, Mid Percentage Change

We add a 'type' column that records the category of each location.

In [None]:
def categorize_df(df,hm=28,hp=35,lm=14,lp=14):
    '''
    Label a single row with its median / percentage-change category.

    The label is written into the 'type' entry of the object that was
    passed in, and that same object is returned.

    df: row exposing scalar 'pxl_median' and 'percent_change' entries
    hm: High Median Threshold
    hp: High Change Percentage Threshold
    lm: Low Median Threshold
    lp: Low Change Percentage Threshold
    '''
    med = df['pxl_median']
    pct = df['percent_change']

    high_med = med > hm
    low_med = med < lm
    high_pct = pct > hp
    low_pct = pct < lp

    # Checked top to bottom; the first matching rule wins. Values sitting
    # exactly on a threshold match none of the rules and end up as 'other'.
    rules = (
        ('hmhp', high_med and high_pct),
        ('hmlp', high_med and low_pct),
        ('lmlp', low_med and low_pct),
        ('lmhp', low_med and high_pct),
        ('mmmp', (lm < med < hm) and (lp < pct < hp)),
    )
    df['type'] = next((label for label, matched in rules if matched), 'other')

    return df

In [None]:
stats_df.head()

In [None]:
# Label every row of stats_df (axis=1 hands each row to categorize_df) with a progress bar.
stats_df = stats_df.progress_apply(categorize_df, axis=1)

In [None]:
# One frame per category label written to the 'type' column.
# NOTE(review): no frame is built here for the 'lmhp' label.
hmhp = stats_df.loc[stats_df['type'].eq('hmhp')]
hmlp_df = stats_df.loc[stats_df['type'].eq('hmlp')]
lmlp_df = stats_df.loc[stats_df['type'].eq('lmlp')]
mmmp_df = stats_df.loc[stats_df['type'].eq('mmmp')]
other_df = stats_df.loc[stats_df['type'].eq('other')]

In [None]:
plot_median_pixel_change_and_percentage_change(other_df.index.astype(str),other_df['percent_change'],other_df['pxl_median'], title='(Other)')

In [None]:
plot_median_pixel_change_and_percentage_change(hmlp_df.index.astype(str),hmlp_df['percent_change'],hmlp_df['pxl_median'],title='(High Median, Low Percentage)')

In [None]:
plot_median_pixel_change_and_percentage_change(lmlp_df.index.astype(str),lmlp_df['percent_change'],lmlp_df['pxl_median'],title='(Low Median, Low Percentage)')

In [None]:
plot_median_pixel_change_and_percentage_change(hmhp.index.astype(str),hmhp['percent_change'],hmhp['pxl_median'],title='(High Median, High Percentage)')

In [None]:
plot_median_pixel_change_and_percentage_change(mmmp_df.index.astype(str),mmmp_df['percent_change'],mmmp_df['pxl_median'], title='(Mid Median, Mid Percentage)')

## Grouping our Dataset by Date
Now let's group our dataset by the number of months separating the two images of each pair. The number of pixels containing change in our masks varies with the number of months apart that the satellite images were captured.

Let's simplify things by starting with isolating the part of the dataframe that we are only interested in, by extracting the necessary columns.

In [None]:
# df only containing images with change in them
df_change = df[df['is_blank'] != 'blank'].reset_index(drop=True)
# extracting on necessary columns
df_pix = df_change.loc[:,['im_name','im_dates','period_1','period_2','n_change_pix','month_diff']]

In [None]:
df_pix['n_change_pix'].describe()

## Visualizing Number of Pixels Containing Change
### BoxPlot of Number of Pixels Containing Change
Let's visualize the number of pixels containing change with a box plot. This box plot will have a threshold of 180 pixels. This means that any images containing more than 180 pixels of change will not be part of the summary statistics. The reason for this is that I am considering any number greater than 180 as an outlier.

In [None]:
# Box plot of the change-pixel counts, restricted to chips below 180 change pixels.
fig = px.box(df_change[df_change['n_change_pix'] < 180], y="n_change_pix")

fig.update_layout(
    # plotly titles break lines on <br>; a '\n' character is not rendered as a line break
    title='Distribution of Number of Pixels Containing Change<br>Threshold: 180',
    xaxis_title="",
    yaxis_title="Number of Pixels Containing Change",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)

fig.show()

### Distribution of Number Of Pixels Containing Change

In [None]:
# Histogram of the change-pixel counts, restricted to chips below 180 change pixels.
fig = px.histogram(df_change[df_change['n_change_pix'] < 180], x="n_change_pix", nbins=100)

fig.update_layout(
    # plotly titles break lines on <br>; a '\n' character is not rendered as a line break
    title='Distribution of Number of Pixels Containing Change<br>Threshold: 180',
    xaxis_title="Number of Pixels Containing Change",
    yaxis_title="Count",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)

fig.show()

### Number of Images Containing Change vs Difference in Number of Months
Let's visualize the distribution of the number of images containing change to the number of months between the 2 images.

In [None]:
df_pix_months = df_pix.groupby('month_diff')

In [None]:
# Group keys (month differences) and the number of rows in each group, in group order.
month = [month_diff for month_diff, _ in df_pix_months]
month_image_counts = [len(frame) for _, frame in df_pix_months]

In [None]:
# Bar chart: number of change-containing images for each month difference.
fig = px.bar(x=month, y=month_image_counts)

fig.update_layout(
    font={
        "family": "Courier New, monospace",
        "size": 18,
        "color": "#7f7f7f",
    },
    title='Number of Images Containing Change vs Difference in Number of Months',
    xaxis_title="Difference in Number of Months",
    yaxis_title="Total Number of Images Containing Change",
)

fig.show()

### Number of Pixels Containing Change vs Difference in Number of Months
In the previous plot we visualized how the `number of images` containing change varies with the number of months between the 2 images. In plot below we ask ourselves: Do the number of months between the 2 images affect the `number of pixels` containing change? 

It would make sense, that as the number of months increases between the 2 images, then the number of change pixels, will most likely increase as well.

Let's have a look at the distribution below with various thresholds; first the entire dataset, second at 850 pixels threshold, third at 150 pixels threshold.

In [None]:
def plot_monthly_change_statistics(df_pix,threshold=None):
    """Show one box plot of change-pixel counts per month difference.

    Args:
        df_pix: DataFrame with 'n_change_pix' and 'month_diff' columns.
        threshold: when given, only rows with ``n_change_pix`` strictly
            below this value are plotted and the value is added to the title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    title = "Number of Pixels Containing Change vs Difference in Number of Months"
    if threshold is not None:
        df_pix_filtered = df_pix[df_pix['n_change_pix'] < threshold].reset_index(drop=True)
        # plotly titles break lines on <br>; a '\n' character is not rendered as a line break
        title += "<br>Threshold: " + str(threshold)
    else:
        df_pix_filtered = df_pix

    fig = go.Figure()

    # one box per distinct month difference
    for month_diff, frame in df_pix_filtered.groupby('month_diff'):
        fig.add_trace(go.Box(y=frame['n_change_pix'].values, name=month_diff))

    fig.update_layout(
        title=title,
        xaxis_title="Difference in Number of Months",
        yaxis_title="Number of Pixels Containing Change",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )
    fig.show()

In [None]:
plot_monthly_change_statistics(df_pix)

In [None]:
plot_monthly_change_statistics(df_pix,threshold=850)

In [None]:
plot_monthly_change_statistics(df_pix,threshold=150)

Notice how the median and the variance vary as we increase the number of months between our images. This is important to consider when training our model. Our model is more likely to perform better on images containing a higher number of pixels denoting change.

Now that we added these extra columns, we can translate this new knowledge directly into our evaluation metrics to see where our model performs well and where it performs poorly.

One of the possible reasons that our data varied so much when we grouped by location is that we didn't account for the monthly change. As the difference in number of months between the images grows, we are more likely to see a higher number of changed pixels.

## Splitting Dataset to Training and Validation Sets
Now that we have visualized what our dataset looks like, let us manually choose the indices of the locations that we would like to have in our validation set. With our newfound information, we will now split our data based on the location.

When splitting our dataset we would ideally like to make it as representative of the real data as possible, therefore we will split based on the categories that we obtained earlier. We will choose one location from each category so we can make sure that our training and validation data are at least from the same distribution when it comes to the amount of change per location and the number of pixels that change when change exists.

The indices below are the indices found in the location plots that we plotted earlier.

In [None]:
# Hand-picked locations for the validation set; used below as positional
# (iloc) row indices into stats_df['location_names'].
valid_indices = [2,22,17,39,43]

In [None]:
location_names = stats_df['location_names']

Let's start by returning the names of the validation set based on the indices that we extracted earlier.

In [None]:
valid_names = location_names.iloc[valid_indices]
valid_names

Below we create a simple mask by checking, for every entry of the names column of our dataframe, whether it is among the names we extracted earlier.

In [None]:
# Boolean mask over df: True where the row's 'im_name' is one of the validation
# locations. `isin` does the membership test in one vectorised call instead of
# running a Python lambda for every row.
valid_df_mask = df['im_name'].isin(valid_names)

In [None]:
train_df_mask = ~valid_df_mask

In [None]:
train_df_mask.value_counts()

In [None]:
valid_df_mask.value_counts()

In [None]:
# Training rows: everything outside the validation locations with target == 1.
# Both conditions are applied in one step and the index is reset afterwards, so
# train_df is an independent frame with a contiguous index (the 'ch_pct' column
# added later is then not written into a slice of another frame).
train_df = df[train_df_mask & (df['target'] == 1)].reset_index(drop=True)
train_df.head()

In [None]:
# Validation rows: the validation locations with target == 1.
# Both conditions are applied in one step and the index is reset afterwards, so
# valid_df is an independent frame with a contiguous index (the 'ch_pct' column
# added later is then not written into a slice of another frame).
valid_df = df[valid_df_mask & (df['target'] == 1)].reset_index(drop=True)
valid_df.head()

In [None]:
def get_change_pct(row):
    """Return the summed mask value (scaled by 1/255) divided by the mask's pixel count.

    The mask image is read from ``root_dir / row['mask_path']``.
    NOTE(review): this is a fraction of changed pixels only if the mask is a
    2-D image with values 0 or 255 - confirm against the dataset.
    """
    mask = io.imread(root_dir / row['mask_path'])
    scaled = np.abs(np.divide(mask, 255))
    height, width = scaled.shape[0], scaled.shape[1]
    return np.sum(scaled) / (height * width)

# Compute the change percentage of every row's mask (reads one image per row).
train_df['ch_pct'] = train_df.progress_apply(get_change_pct, axis=1)
valid_df['ch_pct'] = valid_df.progress_apply(get_change_pct, axis=1)

In [None]:
# throwing out thrashy change
train_df = train_df[train_df.ch_pct > 0.01]
valid_df = valid_df[train_df.ch_pct > 0.01]

## Saving the Output CSVs

In [None]:
# Write both splits to the working directory, without the index column.
valid_df.to_csv(path_or_buf='valid_csv.csv', index=False)
train_df.to_csv(path_or_buf='train_csv.csv', index=False)

Finally we can use the dfs above directly in our future notebooks as inputs.

In [None]:
train_df.describe()

In [None]:
valid_df.describe()