##  Import Packages

In [None]:
#Import Packages

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import scipy.stats as stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'


## Explore the files in train_audio directory

In this section, we take a sneak peek at the different files present in the train_audio directory.  

To do that, first load each file into a pandas DataFrame.

Then apply the following commands to the DataFrame.

head() - list top few rows.

info() - tells about different columns, their types and whether they have null or non null values.

shape - number of rows and columns.

value_counts() - find values taken by a column.


###  1. sample_submission.csv 

In [None]:
# sample_submission data

# Load the sample submission file into a DataFrame.
sub_df = pd.read_csv('../input/birdsong-recognition/sample_submission.csv')
# The bare expressions below are meant to be echoed by the notebook; this
# relies on ast_node_interactivity='all' being set in the import cell.
sub_df.head()
sub_df.info()
sub_df.shape
# Count how often each distinct value occurs in the 'birds' column.
sub_birds_counts = sub_df['birds'].value_counts()
sub_birds_counts

### 2. example_test_audio_summary.csv

In [None]:
# example_test_audio_summary data

# Load the example test-audio summary file into a DataFrame.
audio_sum_df = pd.read_csv('../input/birdsong-recognition/example_test_audio_summary.csv')
# Show only the first three rows.
audio_sum_df.head(3)

# Column overview and (rows, columns) shape.
audio_sum_df.info()
audio_sum_df.shape
# Count how often each distinct value occurs in the 'birds' column.
audio_sum_birds_counts = audio_sum_df['birds'].value_counts()
audio_sum_birds_counts

### 3. example_test_audio_metadata.csv

In [None]:
# Draws pie-chart from pandas Series
def draw_pie_chart(count_series, title, hole=0):
    """Render a plotly pie (or donut) chart from a pandas Series of counts.

    Args:
        count_series: Series whose index supplies the slice labels and whose
            values supply the slice sizes.
        title: Text shown as the chart title (centred, anchored at the top).
        hole: Passed straight to ``go.Pie`` as its ``hole`` argument;
            the default ``0`` leaves the pie without a hole.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Title placement: horizontally centred, near the top of the figure.
    title_spec = dict(
        text=title,
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
    # Font applied through the figure layout.
    font_spec = dict(family="Arial", size=14, color="#7f7f7f")

    pie = go.Pie(
        labels=count_series.index,
        values=count_series.values,
        hole=hole,
    )
    figure = go.Figure(
        data=[pie],
        layout=go.Layout(title=title_spec, font=font_spec),
    )
    figure.show()

In [None]:
# example_test_audio_metadata data

# Load the example test-audio metadata file into a DataFrame.
audio_meta_df = pd.read_csv('../input/birdsong-recognition/example_test_audio_metadata.csv')
# Show only the first three rows.
audio_meta_df.head(3)

# Column overview and (rows, columns) shape.
audio_meta_df.info()
audio_meta_df.shape
# Count how often each distinct 'ebird_code' occurs, echo the counts,
# then show them as a donut chart (hole=.5).
audio_meta_birds_counts = audio_meta_df['ebird_code'].value_counts()
audio_meta_birds_counts
draw_pie_chart(audio_meta_birds_counts,'ebird_code',hole=.5)



In [None]:
# Frequency of each distinct 'device' value in the audio metadata,
# echoed as a table and then drawn as a donut chart.
audio_meta_device_counts = audio_meta_df['device'].value_counts()
audio_meta_device_counts
draw_pie_chart(audio_meta_device_counts,'device',hole=.5)




In [None]:
# Frequency of each distinct 'source' value in the audio metadata,
# echoed as a table and then drawn as a donut chart.
audio_meta_source_counts = audio_meta_df['source'].value_counts()
audio_meta_source_counts
draw_pie_chart(audio_meta_source_counts,'source',hole=.5)




In [None]:
# Frequency of each distinct 'vox_type' value in the audio metadata,
# echoed as a table and then drawn as a donut chart.
audio_meta_vox_type_counts = audio_meta_df['vox_type'].value_counts()
audio_meta_vox_type_counts
draw_pie_chart(audio_meta_vox_type_counts,'vox_type',hole=.5)




In [None]:
# Frequency of each distinct 'channel' value in the audio metadata,
# echoed as a table and then drawn as a donut chart.
audio_meta_channel_counts = audio_meta_df['channel'].value_counts()
audio_meta_channel_counts
draw_pie_chart(audio_meta_channel_counts,'channel',hole=.5)


### 4. test.csv

In [None]:
# test data

# Load test.csv into a DataFrame.
test_df = pd.read_csv('../input/birdsong-recognition/test.csv')
# Show only the first three rows.
test_df.head(3)

# Column overview and (rows, columns) shape.
test_df.info()
test_df.shape


### 5. train.csv

In [None]:
# training data

# Load train.csv into a DataFrame; the analysis cells that follow read train_df.
train_df = pd.read_csv('../input/birdsong-recognition/train.csv')
# First three rows, column overview, and (rows, columns) shape.
train_df.head(3)
train_df.info()
train_df.shape


### Training Data Description
#### Types of Columns


1 Non Numeric - Categorical - Object type - There are 32 columns of this type

2 Numerical columns - There are 3 columns of numeric types

2.1 int64 - whole numbers - duration, xc_id

2.2 float64 - decimal numbers - rating

### Numerical data

In [None]:
# Summarize the training data with DataFrame.describe()
train_df.describe()

In [None]:
# Plot a histogram of the 'rating' column
# using the hist() function from Matplotlib
rating = train_df['rating']
plt.hist(rating, color='blue', align='left',edgecolor="black")
# Label the chart: title, y axis and x axis.
plt.title('rating') 
plt.ylabel('Frequency')
plt.xlabel('rating')

In [None]:
# Plot a histogram of the 'duration' column
# using the hist() function from Matplotlib.
# The Series gets its own name so the 'rating' Series created by the
# rating-histogram cell is not silently overwritten with duration data.
duration = train_df['duration']
plt.hist(duration, color='orange', align='left', edgecolor="black")
# Label the chart: title, y axis and x axis.
plt.title('duration')
plt.ylabel('Frequency')
plt.xlabel('duration')

### Categorical data

In [None]:
# Keep only the columns whose dtype is 'object' (the non-numeric columns),
# then show the first rows and the column overview of that subset.
cat_train_df = train_df.select_dtypes(include='object')
cat_train_df.head()
cat_train_df.info()

In [None]:
def display_columns_uniqvals(df):
    """Print the missing-value count and the unique values of every column.

    For each column of ``df`` (in column order) one report is printed with
    the column's position, its name, the number of null entries, the number
    of unique values and the unique values themselves. A dashed separator
    line precedes every fifth column (positions 0, 5, 10, ...), and one
    extra blank print follows the last column.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    separator = "-------------------------------------------"
    report = '\n ({} {}) \n Missing: {}     Uniq_val_sz: {}     \n Uniq_vals: {}\n\n'

    position = 0
    for name in df.columns.tolist():
        # Visual break before each group of five columns.
        if position % 5 == 0:
            print (separator)
        column = df[name]
        uniques = column.unique()
        n_missing = column.isnull().sum()
        print(report.format(position, name, n_missing, uniques.size, uniques))
        position += 1
    print('\n')

In [None]:
# Print missing-value counts and unique values for every categorical column.
display_columns_uniqvals(cat_train_df)

## Single Variable analysis

Let us understand each column one by one and group similar columns.
Let's peek into the data to understand each column and the various values it can take.

Let's start with the categorical variables.



1. playback_used - 
It contains data of string type. 
It has 3 kinds of values no, nan, yes. 
I am curious to know how many and of which type.


In [None]:
# Choosing ggplot style
plt.style.use('ggplot')

# Get the figure and the axes (or subplots): one row of three panels
fig, (ax0, ax1, ax2) = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

# Frequency of each distinct 'playback_used' value, ordered by label
playback_used_counts = train_df['playback_used'].value_counts().sort_index()

# Prepare data for ax0: labels on the x axis, counts as bar heights
print(playback_used_counts)
print()
x = playback_used_counts.index.values
height = playback_used_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax0.bar(x, height, width=0.5, align='center')
ax0.set(title = 'playback_used_counts ', xlabel='playback_used' , ylabel = 'Frequency')
# NOTE(review): pins the ticks to positions 0 and 1 - presumably the two
# categories of this column; confirm it never has more than two values.
ax0.set_xticks([0, 1])



# Prepare data for ax1
# Frequency of each distinct 'pitch' value, ordered by label
pitch_counts = train_df['pitch'].value_counts().sort_index()
print(pitch_counts)

print()
x = pitch_counts.index.values
height = pitch_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax1.bar(x, height, width=0.5, align='center')
ax1.set(title = 'pitch Counts', xlabel='pitch' , ylabel = 'Frequency')



# Prepare data for ax2

# Frequency of each distinct 'channels' value, ordered by label
channel_counts = train_df['channels'].value_counts().sort_index()
print(channel_counts)

print()
x = channel_counts.index.values
height = channel_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax2.bar(x, height, width=0.5, align='center')
ax2.set(title = 'channel Counts', xlabel='channel' , ylabel = 'Frequency')
# NOTE(review): pins the ticks to positions 0 and 1 - presumably the two
# categories of this column; confirm it never has more than two values.
ax2.set_xticks([0, 1])

# Title the figure (the trailing ';' keeps the notebook from echoing the result)
fig.suptitle('Frequency Counts', fontsize=14, fontweight='bold');

In [None]:
# Choose the seaborn "whitegrid" style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Get the figure and the axes (or subplots): one row of three panels
fig, (ax0, ax1, ax2) = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

# Prepare data for ax0
# Frequency of each distinct 'speed' value, ordered by label
speed_counts = train_df['speed'].value_counts().sort_index()
print(speed_counts)
print()
x = speed_counts.index.values
height = speed_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax0.bar(x, height, width=0.5, align='center')
ax0.set(title = 'speed_counts ', xlabel='speed' , ylabel = 'Frequency')
# One tick per category: a hard-coded [0, 1] would leave every category
# after the second one without a tick label.
ax0.set_xticks(list(range(len(x))))



# Prepare data for ax1
# Frequency of each distinct 'number_of_notes' value, ordered by label
number_of_notes_counts = train_df['number_of_notes'].value_counts().sort_index()
print(number_of_notes_counts)

print()
x = number_of_notes_counts.index.values
height = number_of_notes_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax1.bar(x, height, width=0.5, align='center')
ax1.set(title = 'number_of_notes Counts', xlabel='number_of_notes' , ylabel = 'Frequency')




# Prepare data for ax2
# Frequency of each distinct 'bird_seen' value, ordered by label
bird_seen_counts = train_df['bird_seen'].value_counts().sort_index()
print(bird_seen_counts)

print()
x = bird_seen_counts.index.values
height = bird_seen_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax2.bar(x, height, width=0.5, align='center')
ax2.set(title = 'bird_seen Counts', xlabel='bird_seen' , ylabel = 'Frequency')
# One tick per category (same result as [0, 1] when there are two values).
ax2.set_xticks(list(range(len(x))))

# Title the figure (the trailing ';' keeps the notebook from echoing the result)
fig.suptitle('Frequency Counts', fontsize=14, fontweight='bold');

In [None]:
# Choose the seaborn "darkgrid" style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

# Get the figure and the axes (or subplots): one row of three panels
fig, (ax0, ax1, ax2) = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

# Prepare data for ax0
# Frequency of each distinct 'volume' value, ordered by label
volume_counts = train_df['volume'].value_counts().sort_index()
print(volume_counts)

print()
x = volume_counts.index.values
height = volume_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax0.bar(x, height, width=0.5, align='center')
ax0.set(title = 'volume Counts', xlabel='volume' , ylabel = 'Frequency')




# Prepare data for ax1
# Frequency of each distinct 'length' value, ordered by label
length_counts = train_df['length'].value_counts().sort_index()
print(length_counts)

print()
x = length_counts.index.values
height = length_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax1.bar(x, height, width=0.5, align='center')
ax1.set(title = 'length Counts', xlabel='length' , ylabel = 'Frequency')



# Prepare data for ax2
# Frequency of each distinct 'license' value, ordered by label
license_counts = train_df['license'].value_counts().sort_index()
print(license_counts)

print()
x = license_counts.index.values
height = license_counts.values

# Axes.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)
ax2.bar(x, height, width=0.5, align='center')
ax2.set(title = 'license Counts', xlabel='license' , ylabel = 'Frequency')


# Title the figure (the trailing ';' keeps the notebook from echoing the result)
fig.suptitle('Frequency Counts', fontsize=14, fontweight='bold');

In [None]:
# Donut chart of how often each species occurs in the training data.
# The title names the plotted column ('species'), not 'channels'.
draw_pie_chart(train_df['species'].value_counts().sort_index(), 'species', .4)

### Conclusion

Thus we have grouped similar attributes and presented their frequency counts to analyse the data. 

Hope you enjoyed reading it so far.

Kindly upvote if you found it useful.

Thanks