# CLEANING

## IMPORTING THE REQUIRED LIBRARIES

In [1]:
import pandas as pd
import numpy as np

## IMPORTING THE TRAINING AND TESTING DATAFRAMES

In [2]:
def import_data(path):
    """Read the CSV file at ``path`` into a dataframe.

    If the loaded dataframe has a column named 'Unnamed: 0' (presumably a
    stray index column written by an earlier export), that column is removed
    from the returned dataframe.
    """
    stray_column = 'Unnamed: 0'
    loaded = pd.read_csv(path)
    if stray_column not in loaded.columns:
        return loaded
    return loaded.drop(columns=stray_column)

# Load the raw training and testing sets from the shared data directory.
passengers_training = import_data(path='../data/passengers_training.csv')
passengers_testing = import_data(path='../data/passengers_testing.csv')

## DROPPING THE ID COLUMN FROM BOTH DATAFRAMES

In [3]:
def drop_id(dataframe, id):
    """Return a copy of ``dataframe`` without the column labelled ``id``.

    The input dataframe is left untouched. A ``KeyError`` is raised by pandas
    when the label is not among the columns.
    """
    # NOTE: the parameter name shadows the builtin ``id``; it is kept as-is
    # because it is part of the function's calling interface.
    without_id = dataframe.drop(columns=id)
    return without_id

# Remove the 'id' column from both sets.
passengers_training = drop_id(dataframe=passengers_training, id='id')
passengers_testing = drop_id(dataframe=passengers_testing, id='id')

## NORMALIZING THE NAMES OF THE DATAFRAMES' COLUMNS

In [4]:
def normalize_columns(dataframe):
    """Rewrite the column names of ``dataframe`` in snake-case-like form.

    Every space in a column name becomes an underscore and the name is
    lower-cased. The dataframe is modified in place and also returned.
    """
    dataframe.columns = [
        name.replace(' ', '_').lower() for name in dataframe.columns
    ]
    return dataframe

# Normalize the column names of the training set first, then the testing set.
passengers_training, passengers_testing = (
    normalize_columns(frame)
    for frame in (passengers_training, passengers_testing)
)

## RENAMING THE VALUES OF THE CUSTOMER TYPE, TYPE OF TRAVEL AND SATISFACTION COLUMNS

In [5]:
def replace_values(dataframe, column, replacements):
    """Substitute values in one column of ``dataframe``.

    ``replacements`` is handed to ``Series.replace`` for the column named
    ``column``. The column is overwritten in place and the same dataframe is
    returned.
    """
    updated = dataframe[column].replace(to_replace=replacements)
    dataframe[column] = updated
    return dataframe

# Old value -> new value mappings, one dictionary per column.
customer_type_replacements = {
    'disloyal Customer': 'Disloyal Customer',
}
type_of_travel_replacements = {
    'Business travel': 'Business Travel',
}
satisfaction_replacements = {
    'neutral or dissatisfied': 'Not satisfied',
    'satisfied': 'Satisfied',
}

# Apply every mapping to its column in both the training and testing sets.
for column, replacements in (
    ('customer_type', customer_type_replacements),
    ('type_of_travel', type_of_travel_replacements),
    ('satisfaction', satisfaction_replacements),
):
    passengers_training = replace_values(passengers_training, column, replacements)
    passengers_testing = replace_values(passengers_testing, column, replacements)

## CHANGING THE TYPE OF THE ARRIVAL DELAY IN MINUTES COLUMN TO INT64

In [6]:
def change_type(dataframe, column, new_type):
    """Cast one column of ``dataframe`` to ``new_type``.

    The column named ``column`` is converted with ``Series.astype`` and
    written back in place; the same dataframe is returned.
    """
    converted = dataframe[column].astype(dtype=new_type)
    dataframe[column] = converted
    return dataframe

# Cast the arrival delay column to pandas' 'Int64' dtype in both sets.
passengers_training, passengers_testing = (
    change_type(frame, 'arrival_delay_in_minutes', 'Int64')
    for frame in (passengers_training, passengers_testing)
)

## EXPORTING THE CLEANED TRAINING AND TESTING DATAFRAMES

In [7]:
def export_data(dataframe, path, filename):
    """Write ``dataframe`` to ``<path>/<filename>.csv`` without the index.

    Returns whatever ``DataFrame.to_csv`` returns for the call.
    """
    destination = f'{path}/{filename}.csv'
    return dataframe.to_csv(destination, index=False)

# Write each cleaned dataframe to its own CSV file. The testing file must be
# built from the testing dataframe, not from a second copy of the training one.
# NOTE(review): the inputs are read from '../data' but the outputs are written
# to 'data' relative to the working directory — confirm this is intended.
export_data(passengers_training, 'data', 'training_cleaned')
export_data(passengers_testing, 'data', 'testing_cleaned')