In [2]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 20)

In [3]:
def column_data_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieve the data types of columns in the DataFrame.
    Parameters:
        df (pd.DataFrame): The input DataFrame.
    Returns:
        pd.DataFrame: A DataFrame with two columns: 'Column Name' and 'd_type'.
    """
    dtype_df = pd.DataFrame({'Column Name': df.columns, 'd_type': df.dtypes.values})
    dtype_df.reset_index(drop=True, inplace=True)

    return dtype_df

In [4]:
def get_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the null percentage for each column in the DataFrame.
    Parameters:
        df (pd.DataFrame): The input DataFrame.
    Returns:
        pd.DataFrame: A DataFrame with two columns: 'Column Name' and 'Null Percentage'.
    """
    null_percentage = (df.isnull().sum() / len(df)) * 100
    null_df = pd.DataFrame({'Column Name': null_percentage.index, 'Null Percentage': null_percentage.values})
    null_df.set_index('Column Name', inplace=True)
    null_df.index.name = None
    return null_df

In [5]:
df_location ='../data/raw/earth_challenge_dataset.csv'
df = pd.read_csv(df_location)

  df = pd.read_csv(df_location)


In [6]:
df.head()

Unnamed: 0,X,Y,OBJECTID,RecordSequenceID,UniqueID,SourceID,LocationFreqID,Location,Dataset,Organization,Other,CountryName_FromSource,SubCountry_L1_FromSource,SubCountry_L2_FromSource,Longitude1,Latitude1,Longitude2,Latitude2,TotalWidth_m,TotalLength_m,TotalArea_Sq_m,ShorelineName,WaterfrontName,BeachAreaLandcover,BeachType,EventType,TotalVolunteers,DateOriginal,DateStandardized,MonthYear,Year,MonthNum,Month,Day,StartTime,DOW,FieldObsevations,DebrisDescription,Totalltems_EventRecord,TotalClassifiedItems_EC2020,PCT_PlasticAndFoam,PCT_Glass_Rubber_Lumber_Metal,SUM_Hard_PlasticBeverageBottle,SUM_Hard_OtherPlasticBottle,SUM_HardOrSoft_PlasticBottleCap,SUM_PlasticOrFoamFoodContainer,SUM_Hard_BucketOrCrate,SUM_Hard_Lighter,SUM_OtherHardPlastic,SUM_PlasticOrFoamPlatesBowlsCup,SUM_HardSoft_PersonalCareProduc,SUM_HardSoftLollipopStick_EarBu,SUM_Soft_Bag,SUM_Soft_WrapperOrLabel,SUM_Soft_Straw,SUM_Soft_OtherPlastic,SUM_Soft_CigaretteButts,SUM_Soft_StringRingRibbon,Fishing_Net,SUM_FishingLineLureRope,Fishing_BuoysAndFloats,SUM_Foam_OtherPlasticDebris,SUM_OtherPlasticDebris,NAME,COUNTRY,ISO_CODE,ISO_CC,ISO_SUB,ADMINTYPE,DISPUTED,NOTES,AUTONOMOUS,COUNTRYAFF,CONTINENT,LAND_TYPE,LAND_RANK,Shape__Area,Shape__Length,Count_,Soft_Sheets2,PlasticStraps2,FishingGlowSticks2,FishingOtherPlasticDebris2
0,-123.435585,38.690549,1,349,MDP-349,40-3153,Blackpoint Beach (Lon -123.4355847 Lat 38.6905...,"Blackpoint Beach, Sonoma, CA, United States",NOAA MDMAP Accumulation Survey,California Coast National Monument Task Force,,United States,CA,Sonoma,-123.435585,38.690549,-123.432939,38.689234,11.887,898.0,,Blackpoint Beach,,,,Marine Debris Accumulation Survey,3.0,2015/01/04 00:00:00+00,2015/01/04 00:00:00+00,Jan-2015,2015,1,Jan,4,4:10 PM,Sunday,,"On other: 1 plastic wine cork, 1 plastic child...",28,24,85.7,14.3,1,0,1,0.0,0,0.0,4,0,0,0,1,1,0.0,1,0.0,0,1,0,0,13,1.0,California,United States,USCA,US,CA,State,0.0,,0.0,United States,North America,Primary land,5.0,41.506355,56.81446,1,0,0,0,0
1,-123.484406,38.728707,2,351,MDP-351,37-3164,Dune Drift Beach (Lon -123.4844062 Lat 38.7287...,"Dune Drift Beach, Sonoma, CA, United States",NOAA MDMAP Accumulation Survey,California Coast National Monument Task Force,,United States,CA,Sonoma,-123.484406,38.728707,-123.487692,38.733347,10.0,190.5,,Dune Drift Beach,,,,Marine Debris Accumulation Survey,1.0,2015/01/10 00:00:00+00,2015/01/10 00:00:00+00,Jan-2015,2015,1,Jan,10,10:00 AM,Saturday,,,19,17,89.5,10.5,0,0,1,0.0,0,0.0,4,0,1,0,0,0,0.0,2,0.0,0,0,0,0,9,0.0,California,United States,USCA,US,CA,State,0.0,,0.0,United States,North America,Primary land,5.0,41.506355,56.81446,1,0,0,0,0
2,-123.4564,38.7132,3,354,MDP-354,59-3175,Ohlson Beach (Lon -123.4564 Lat 38.7132),"Ohlson Beach, Sonoma, CA, United States",NOAA MDMAP Accumulation Survey,California Coast National Monument Task Force,,United States,CA,Sonoma,-123.4564,38.7132,-123.4551,38.7106,65.0,309.0,,Ohlson Beach,,,,Marine Debris Accumulation Survey,1.0,2015/01/14 00:00:00+00,2015/01/14 00:00:00+00,Jan-2015,2015,1,Jan,14,11:30 AM,Wednesday,,,0,0,0.0,0.0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,California,United States,USCA,US,CA,State,0.0,,0.0,United States,North America,Primary land,5.0,41.506355,56.81446,1,0,0,0,0
3,-123.490915,38.735105,4,358,MDP-358,41-3191,Walk On Beach (Lon -123.490915 Lat 38.735105),"Walk On Beach, Sonoma, CA, United States",NOAA MDMAP Accumulation Survey,California Coast National Monument Task Force,,United States,CA,Sonoma,-123.490915,38.735105,-123.489614,38.731897,56.0,300.0,,Walk On Beach,,,,Marine Debris Accumulation Survey,1.0,2015/01/15 00:00:00+00,2015/01/15 00:00:00+00,Jan-2015,2015,1,Jan,15,2:00 PM,Thursday,Clean Beach!,,11,8,72.7,27.3,0,0,0,0.0,0,1.0,5,0,0,0,1,1,0.0,0,0.0,0,0,0,0,0,0.0,California,United States,USCA,US,CA,State,0.0,,0.0,United States,North America,Primary land,5.0,41.506355,56.81446,1,0,0,0,0
4,-124.4621,42.7149,5,360,MDP-360,11-3195,Rocky Point (Lon -124.4621 Lat 42.7149),"Rocky Point, Curry, OR, United States",NOAA MDMAP Accumulation Survey,Redfish Rocks,,United States,OR,Curry,-124.4621,42.7149,-124.462,42.7139,27.5,102.0,,Rocky Point,,,,Marine Debris Accumulation Survey,4.0,2015/01/15 00:00:00+00,2015/01/15 00:00:00+00,Jan-2015,2015,1,Jan,15,10:30 AM,Thursday,None. LOTS of Foam over the past two surveys! ...,,906,893,98.6,1.4,7,0,0,0.0,0,4.0,31,0,0,0,0,0,0.0,0,0.0,0,0,0,1,850,0.0,Oregon,United States,USOR,US,OR,State,0.0,,0.0,United States,North America,Primary land,5.0,28.117307,33.913945,1,0,0,0,0


In [7]:
dtype_df = column_data_type(df)
dtype_df

Unnamed: 0,Column Name,d_type
0,X,float64
1,Y,float64
2,OBJECTID,int64
3,RecordSequenceID,int64
4,UniqueID,object
5,SourceID,object
6,LocationFreqID,object
7,Location,object
8,Dataset,object
9,Organization,object


In [8]:
df.columns

Index(['X', 'Y',
       'OBJECTID',
       'RecordSequenceID',
       'UniqueID',
       'SourceID',
       'LocationFreqID',
       'Location',
       'Dataset',
       'Organization',
       'Other',
       'CountryName_FromSource',
       'SubCountry_L1_FromSource',
       'SubCountry_L2_FromSource',
       'Longitude1',
       'Latitude1',
       'Longitude2',
       'Latitude2',
       'TotalWidth_m',
       'TotalLength_m',
       'TotalArea_Sq_m',
       'ShorelineName',
       'WaterfrontName',
       'BeachAreaLandcover',
       'BeachType',
       'EventType',
       'TotalVolunteers',
       'DateOriginal',
       'DateStandardized',
       'MonthYear',
       'Year',
       'MonthNum',
       'Month',
       'Day',
       'StartTime',
       'DOW',
       'FieldObsevations',
       'DebrisDescription',
       'Totalltems_EventRecord',
       'TotalClassifiedItems_EC2020',
       'PCT_PlasticAndFoam',
       'PCT_Glass_Rubber_Lumber_Metal',
       'SUM_Hard_PlasticBeverageBo

In [47]:
drop_list = ['TotalArea_Sq_m', 'Other', 'FieldObsevations', 'BeachAreaLandcover', 'BeachType', 'DebrisDescription', \
 'WaterfrontName', 'TotalWidth_m', 'StartTime', 'Longitude2', 'ShorelineName', 'Latitude2']

In [48]:
edc = df.drop(drop_list, axis=1)

In [55]:
null_df = get_null_percentage(edc)
null_df.sort_values(by='Null Percentage',ascending=False).head(10)

Unnamed: 0,Null Percentage
Organization,19.85916
SubCountry_L1_FromSource,11.210193
SubCountry_L2_FromSource,10.414062
NOTES,8.990954
CountryName_FromSource,7.310436
Location,5.968228
DateOriginal,4.445834
TotalVolunteers,4.445834
SUM_OtherPlasticDebris,4.445834
SourceID,4.445834


In [54]:
null_df = get_null_percentage(df)
null_df.sort_values(by='Null Percentage',ascending=False).head(10)

Unnamed: 0,Null Percentage
TotalArea_Sq_m,100.0
Other,100.0
FieldObsevations,98.049202
BeachAreaLandcover,97.959109
BeachType,97.951754
DebrisDescription,97.935206
WaterfrontName,97.598735
TotalWidth_m,96.100243
StartTime,96.100243
Longitude2,91.654409


In [56]:
def count_null_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts the number of null values in each row of the DataFrame.

    Parameters:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A DataFrame with two columns: 'Index' and 'Null Percentage'.
    """
    null_counts = df.isnull().sum(axis=1)
    null_percentage = (null_counts / df.shape[1]) * 100
    null_df = pd.DataFrame({'Index': df.index, 'Null Percentage': null_percentage})
    null_df = null_df.sort_values(by='Null Percentage', ascending=False).reset_index(drop=True)

    return null_df

In [68]:
null_rows = count_null_rows(df)
null_rows

Unnamed: 0,Index,Null Percentage
0,2449,39.759036
1,4500,39.759036
2,4261,39.759036
3,4066,39.759036
4,4070,39.759036
...,...,...
54383,623,6.024096
54384,622,6.024096
54385,621,6.024096
54386,620,6.024096


In [69]:
null_rows['Null Percentage'].unique()

array([39.75903614, 38.55421687, 37.34939759, 36.14457831, 34.93975904,
       33.73493976, 32.53012048, 26.5060241 , 25.30120482, 24.09638554,
       22.89156627, 21.68674699, 20.48192771, 19.27710843, 18.07228916,
       16.86746988, 15.6626506 , 14.45783133,  9.63855422,  8.43373494,
        7.22891566,  6.02409639])

In [71]:
df.iloc[2449]

X                                                                                                                0.000137
Y                                                                                                                 0.00036
OBJECTID                                                                                                             2819
RecordSequenceID                                                                                                     3314
UniqueID                                                                                                         MLW-3328
SourceID                                                                                                              NaN
LocationFreqID                     Spiaggia delle Saline Centola (SA) (Lon 0.000137295813083419 Lat 0.000359757304987417)
Location                                                                           Spiaggia delle Saline Centola (SA), IT
Dataset                 