## Summary of Flight Call data 

### Installed the required packages 
- Installed the required packages and imported the functions required from the other Python files

In [11]:
import logging
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import sys
from os import environ
from IPython import embed
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from data_loading import data_loading, dataframe_columns_check
from data_cleaning import column_renaming, removing_duplicates, removing_missing_values
from helpers import data_summary

### Reading the raw data 
- The location of the raw data is passed as an environment variable

In [12]:
# Path of the uploaded json file, supplied through the `flight_call`
# environment variable
path = environ.get('flight_call')
if not path:
    # Same exception type as a plain environ[...] lookup, but with a message
    # that tells the reader what has to be configured before running the notebook
    raise KeyError(
        "Environment variable 'flight_call' is not set; "
        "it must hold the path of the raw flight call data file"
    )

# Loading the data and assigning a name to the dataframe
# NOTE(review): the name is presumably what data_summary reports as
# "Dataframe name" -- confirm against helpers.data_summary
flight_call = data_loading(path)
flight_call.name = 'flight_call'

### Checking if the required set of columns are present in the raw data

In [13]:
# Columns which are required in the uploaded data
required_columns = ['Species', 'Family', 'Collisions', 'Flight', 'Call', 'Habitat','Stratum']

# Checking that the required set of columns is present in the uploaded file
# before any further processing is attempted
if dataframe_columns_check(flight_call, required_columns):
    print("Columns in raw data:", list(flight_call.columns))
    print("\nRequired columns:", required_columns)
    print("\nCheck: The uploaded data has the required set of columns")
else:
    # Report what was found versus what was expected so the upload can be fixed.
    # NOTE(review): the list of missing columns may be empty if
    # dataframe_columns_check also rejects extra or re-ordered columns -- confirm
    missing_columns = [column for column in required_columns
                       if column not in flight_call.columns]
    print("Check: The uploaded data does not have the required set of columns")
    print("\nColumns in raw data:", list(flight_call.columns))
    print("\nRequired columns:", required_columns)
    print("\nMissing columns:", missing_columns)

Columns in raw data: ['Species', 'Family', 'Collisions', 'Flight', 'Call', 'Habitat', 'Stratum']

Required columns: ['Species', 'Family', 'Collisions', 'Flight', 'Call', 'Habitat', 'Stratum']

Check: The uploaded data has the required set of columns


### Renaming the columns as required 

In [14]:
# Column labels as they arrive in the uploaded file
columns_before = list(flight_call.columns)
print("Columns in raw data: ", columns_before)

# Apply the project's renaming step to the dataframe
flight_call = column_renaming(flight_call, required=True)

# Column labels after renaming, shown for comparison with the labels above
columns_after = list(flight_call.columns)
print("\nColumns in processed data: ", columns_after)

Columns in raw data:  ['Species', 'Family', 'Collisions', 'Flight', 'Call', 'Habitat', 'Stratum']

Columns in processed data:  ['Genus', 'Species', 'Family', 'Flight', 'Flight Call', 'Habitat', 'Stratum']


### Removing the duplicate rows in the data 

In [15]:
# Number of rows before de-duplication
rows_before, _ = flight_call.shape
print("Row count in the raw data: ", rows_before)

# Drop the duplicate rows using the project's cleaning helper
flight_call = removing_duplicates(flight_call)

# Number of rows that remain once duplicates are gone
rows_after, _ = flight_call.shape
print("Row count after removing duplicates: ", rows_after)

Row count in the raw data:  96
Row count after removing duplicates:  91


### Missing values treatment

In [16]:
# Row count going into the missing-value treatment
row_count_in = flight_call.shape[0]
print("Row count in the raw data: ", row_count_in)

# Remove the rows that contain missing values using the project's cleaning helper
flight_call = removing_missing_values(flight_call)

# Row count coming out of the missing-value treatment
row_count_out = flight_call.shape[0]
print("Row count after removing rows with missing values: ", row_count_out)

Row count in the raw data:  91
Row count after removing rows with missing values:  91


### Exploratory data analysis

#### Basic data description 

In [17]:
# Print an overview of the cleaned dataframe; as the output below shows, it
# reports the dataframe name, row and column counts, the column names, the
# missing values in each column and the number of unique values per column
data_summary(flight_call)

Dataframe name  : flight_call

Rows     :  91
Columns  :  7

Features : 
 ['Genus', 'Species', 'Family', 'Flight', 'Flight Call', 'Habitat', 'Stratum']

Missing values in each column:   [0, 0, 0, 0, 0, 0, 0]

Unique values :  
 Genus          52
Species        84
Family         15
Flight         81
Flight Call     5
Habitat         6
Stratum         3
dtype: int64


 ######################################


In [18]:
# Analysing the column data types and non-null counts
flight_call.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 90
Data columns (total 7 columns):
Genus          91 non-null object
Species        91 non-null object
Family         91 non-null object
Flight         91 non-null int64
Flight Call    91 non-null object
Habitat        91 non-null object
Stratum        91 non-null object
dtypes: int64(1), object(6)
memory usage: 5.7+ KB


### Saving the processed data 

In [None]:
# Directory that receives the processed CSV. The original location is kept as
# the default so existing setups behave as before; set the `csv_files_address`
# environment variable to write somewhere else without editing the notebook
csv_files_address = environ.get(
    'csv_files_address',
    "/home/sahit/Documents/MLE_opportunities_with_Tiger_Analytics/simple_dj_docker/media/CSV_files/",
)

# Make sure the target directory exists before writing into it
os.makedirs(csv_files_address, exist_ok=True)

# Reuse the raw file's name and swap only its final extension for `.csv`.
# splitext keeps any earlier dots in the name, whereas split('.')[0] would
# truncate e.g. "flight.call.json" to "flight"
file_name = os.path.join(csv_files_address, os.path.splitext(os.path.basename(path))[0])
flight_call.to_csv(file_name + '.csv', index=False)