# Read the Data

In [40]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn as skl

# Load Resources/train.csv into a DataFrame and preview the first 20 rows
file_path = "Resources/train.csv"
airline_df = pd.read_csv(filepath_or_buffer=file_path)
airline_df.head(n=20)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
5,5,111157,Female,Loyal Customer,26,Personal Travel,Eco,1180,3,4,...,1,3,4,4,4,4,1,0,0.0,neutral or dissatisfied
6,6,82113,Male,Loyal Customer,47,Personal Travel,Eco,1276,2,4,...,2,3,3,4,3,5,2,9,23.0,neutral or dissatisfied
7,7,96462,Female,Loyal Customer,52,Business travel,Business,2035,4,3,...,5,5,5,5,4,5,4,4,0.0,satisfied
8,8,79485,Female,Loyal Customer,41,Business travel,Business,853,1,2,...,1,1,2,1,4,1,2,0,0.0,neutral or dissatisfied
9,9,65725,Male,disloyal Customer,20,Business travel,Eco,1061,3,3,...,2,2,3,4,4,3,2,0,0.0,neutral or dissatisfied


In [41]:
# Dimensions of the frame as a (row count, column count) tuple

(len(airline_df.index), len(airline_df.columns))

(103904, 25)

In [42]:
# List every column label as a plain Python list
list(airline_df.columns)

['Unnamed: 0',
 'id',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Inflight wifi service',
 'Departure/Arrival time convenient',
 'Ease of Online booking',
 'Gate location',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes',
 'satisfaction']

In [43]:
# Verify the datatypes: show the dtype pandas assigned to each column
# (object columns are the text-valued ones; the rest are numeric).
airline_df.dtypes

Unnamed: 0                             int64
id                                     int64
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure 

# Data Cleaning 

In [44]:
# Drop unnecessary columns ('Unnamed: 0' and 'id') in a single call
airline_df = airline_df.drop(columns=['Unnamed: 0', 'id'])

In [45]:
# Check for Null values: report the number of missing entries per column.
# The counts are computed once for the whole frame instead of re-scanning
# each column inside the loop.
null_counts = airline_df.isnull().sum()
for column, null_count in null_counts.items():
    # Space after "Column" so the label and the column name do not run together
    print(f"Column {column} has {null_count} null values")

ColumnGender has 0 null values
ColumnCustomer Type has 0 null values
ColumnAge has 0 null values
ColumnType of Travel has 0 null values
ColumnClass has 0 null values
ColumnFlight Distance has 0 null values
ColumnInflight wifi service has 0 null values
ColumnDeparture/Arrival time convenient has 0 null values
ColumnEase of Online booking has 0 null values
ColumnGate location has 0 null values
ColumnFood and drink has 0 null values
ColumnOnline boarding has 0 null values
ColumnSeat comfort has 0 null values
ColumnInflight entertainment has 0 null values
ColumnOn-board service has 0 null values
ColumnLeg room service has 0 null values
ColumnBaggage handling has 0 null values
ColumnCheckin service has 0 null values
ColumnInflight service has 0 null values
ColumnCleanliness has 0 null values
ColumnDeparture Delay in Minutes has 0 null values
ColumnArrival Delay in Minutes has 310 null values
Columnsatisfaction has 0 null values


In [46]:
# Summary statistics for the 'Arrival Delay in Minutes' column
airline_df.loc[:, 'Arrival Delay in Minutes'].describe()

count    103594.000000
mean         15.178678
std          38.698682
min           0.000000
25%           0.000000
50%           0.000000
75%          13.000000
max        1584.000000
Name: Arrival Delay in Minutes, dtype: float64

In [38]:
# Fill in null values with mean
#
# At this point the column still carries its original name with spaces; the
# space-to-underscore rename happens in a later cell. Using the underscored
# name here raises KeyError on a fresh "Restart & Run All".
#
# NOTE(review): describe() above shows a strongly right-skewed distribution
# (median 0, mean ~15, max 1584); the mean is kept as the fill value, but the
# median may be worth considering.
arrival_delay_col = 'Arrival Delay in Minutes'
arrival_delay_mean = airline_df[arrival_delay_col].mean()
airline_df[arrival_delay_col] = airline_df[arrival_delay_col].fillna(arrival_delay_mean)

# Remaining null count for the column; expected to be 0 after the fill
airline_df[arrival_delay_col].isnull().sum()

0

In [24]:
# Count rows that are exact repeats of an earlier row and report the total

duplicate_count = airline_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [35]:
# Replace every space in the column labels with an underscore, then preview

airline_df.columns = [label.replace(' ', '_') for label in airline_df.columns]
airline_df.head()

Unnamed: 0,Gender,Customer_Type,Age,Type_of_Travel,Class,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,...,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
