In [1]:
import pandas as pd

class BirdStrikeDataCleaner:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = None

    def load_data(self):
        """Load the CSV data."""
        try:
            self.data = pd.read_csv(self.file_path)
            if self.data is not None:
                print("Data loaded successfully.")
            else:
                raise ValueError("Data could not be loaded.")
        except Exception as e:
            print(f"Error loading data: {e}")
            raise

    def remove_duplicates(self):
        """Remove duplicate rows."""
        if self.data is not None:
            initial_shape = self.data.shape
            self.data = self.data.drop_duplicates()
            print(f"Removed {initial_shape[0] - self.data.shape[0]} duplicate rows.")

    def handle_missing_values(self):
        """Handle missing values."""
        if self.data is not None:
            # Drop columns with more than 50% missing values
            threshold = len(self.data) * 0.5
            self.data = self.data.dropna(thresh=threshold, axis=1)

            # Fill numeric columns with median, categorical columns with mode
            for col in self.data.columns:
                if self.data[col].dtype == 'object':
                    self.data[col].fillna(self.data[col].mode()[0], inplace=True)
                else:
                    self.data[col].fillna(self.data[col].median(), inplace=True)

    def convert_data_types(self):
        """Convert data types to appropriate format."""
        if self.data is not None and 'FlightDate' in self.data.columns:
            self.data['FlightDate'] = pd.to_datetime(self.data['FlightDate'], errors='coerce')
            self.data = self.data.dropna(subset=['FlightDate'])

    def drop_irrelevant_columns(self):
        """Drop irrelevant columns."""
        if self.data is not None:
            drop_cols = ['ID', 'Index', 'RecordID']
            self.data = self.data.drop([col for col in drop_cols if col in self.data.columns], axis=1)

    def clean_data(self):
        """Complete data cleaning pipeline."""
        self.load_data()
        if self.data is not None:
            self.remove_duplicates()
            self.handle_missing_values()
            self.convert_data_types()
            self.drop_irrelevant_columns()
            print("Data cleaned successfully!")
        return self.data

# Initialize and clean data
file_path = '../data/Bird_strikes.csv'

cleaner = BirdStrikeDataCleaner(file_path)
cleaned_data = cleaner.clean_data()

# Check cleaned data
if cleaned_data is not None:
    cleaned_data.info()
    print(cleaned_data.head())
    


Data loaded successfully.
Removed 0 duplicate rows.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data[col].fillna(self.data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data[col].fillna(self.data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

Data cleaned successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25429 entries, 0 to 25428
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   AircraftType              25429 non-null  object        
 1   AirportName               25429 non-null  object        
 2   AltitudeBin               25429 non-null  object        
 3   MakeModel                 25429 non-null  object        
 4   NumberStruck              25429 non-null  object        
 5   NumberStruckActual        25429 non-null  int64         
 6   FlightDate                25429 non-null  datetime64[ns]
 7   Damage                    25429 non-null  object        
 8   Engines                   25429 non-null  object        
 9   Operator                  25429 non-null  object        
 10  OriginState               25429 non-null  object        
 11  FlightPhase               25429 non-null  object     