# DATA CHALLENGE


### DATA QUALITY CHECK

In [None]:
# required python libraries for data
import numpy as np
import pandas as pd
import re 
from datetime import datetime

In [91]:
# import dataset and read the data
tickets_df = pd.read_csv("Tickets.csv")
# Taking a backup of the original data
tickets_backup = tickets_df.copy()
# head returns the first 5 rows of the Tickets.csv data to get an idea of the data
tickets_df.head(5)

Unnamed: 0,ITIN_ID,YEAR,QUARTER,ORIGIN,ORIGIN_COUNTRY,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,ROUNDTRIP,REPORTING_CARRIER,PASSENGERS,ITIN_FARE,DESTINATION
0,201912723049,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,736.0,DAB
1,201912723085,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,570.0,COS
2,201912723491,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,564.0,MCO
3,201912723428,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,345.0,LGA
4,201912723509,2019,1,ABI,US,TX,Texas,0.0,MQ,1.0,309.0,MGM


In [92]:
# Understand the shape of the dataset with the no. of rows and cols
tickets_df.shape
# Checking for index
tickets_df.index.values

array([      0,       1,       2, ..., 1167282, 1167283, 1167284])

In [208]:
# Print the information, shape and data type for the data frame
tickets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167285 entries, 0 to 1167284
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   ITIN_ID            1167285 non-null  int64  
 1   YEAR               1167285 non-null  int64  
 2   QUARTER            1167285 non-null  int64  
 3   ORIGIN             1167285 non-null  object 
 4   ORIGIN_COUNTRY     1167285 non-null  object 
 5   ORIGIN_STATE_ABR   1167285 non-null  object 
 6   ORIGIN_STATE_NM    1167285 non-null  object 
 7   ROUNDTRIP          1167285 non-null  float64
 8   REPORTING_CARRIER  1167285 non-null  object 
 9   PASSENGERS         1165308 non-null  float64
 10  ITIN_FARE          1166325 non-null  object 
 11  DESTINATION        1167285 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 106.9+ MB


In [209]:
# Excluding cols that are unnecessary to the dataset
drop_cols = ['ITIN_ID','YEAR','QUARTER','ORIGIN_COUNTRY','ORIGIN_STATE_NM']
tickets = tickets_df.drop(drop_cols, axis = 1)

In [95]:
# Determine the count of unique values in each col in the data frame
tickets.nunique()

ORIGIN                419
ORIGIN_STATE_ABR       52
ROUNDTRIP               2
REPORTING_CARRIER      21
PASSENGERS            286
ITIN_FARE            3791
DESTINATION           410
dtype: int64

In [96]:
# Checking any rows are missing any data
tickets.isnull().sum()

ORIGIN                  0
ORIGIN_STATE_ABR        0
ROUNDTRIP               0
REPORTING_CARRIER       0
PASSENGERS           1977
ITIN_FARE             960
DESTINATION             0
dtype: int64

In [112]:
# Function to remove random characters in cols
def clean_value(value):
    value = re.sub(r'[^0-9]+','',value)
    if value == '':
            value = np.nan

tickets['ITIN_FARE'] = tickets['ITIN_FARE'].apply(lambda x: clean_value(x))


In [113]:
# Filling the missing values in the Passengers and ITIN FARES
tickets = tickets.fillna(0)

In [177]:
# As it can be seen from the information above:
# Roundtrip col comprising of 0 and 1 should be converted to type bool from type float
# Passengers col being of type float does not make sense and should be converted to type int
# Iternary fare col is of type object which does not make sense as it describes fare and should be converted to type float
# implementing these conversions

tickets['PASSENGERS'] = tickets['PASSENGERS'].astype('int64')
tickets['ITIN_FARE'] = tickets['ITIN_FARE'].astype('float64')
tickets.astype({'ROUNDTRIP':'bool'})


Unnamed: 0,ORIGIN,ORIGIN_STATE_ABR,ROUNDTRIP,REPORTING_CARRIER,PASSENGERS,ITIN_FARE,DESTINATION
0,ABI,TX,True,MQ,1,0.0,DAB
1,ABI,TX,True,MQ,1,0.0,COS
2,ABI,TX,True,MQ,1,0.0,MCO
3,ABI,TX,True,MQ,1,0.0,LGA
4,ABI,TX,False,MQ,1,0.0,MGM
...,...,...,...,...,...,...,...
1167280,YAK,AK,False,AS,1,0.0,ANC
1167281,YAK,AK,True,AS,1,0.0,JNU
1167282,YAK,AK,False,AS,1,0.0,JNU
1167283,YAK,AK,False,AS,1,0.0,ANC


In [116]:
# import dataset and read the data
flights_df = pd.read_csv("Flights.csv")
# Taking a backup of the original data
flights_backup = flights_df.copy()
# head returns the first 5 rows of the Flights.csv data to get an idea of the data
flights_df.head(5)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,FL_DATE,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DESTINATION,DEST_CITY_NAME,DEP_DELAY,ARR_DELAY,CANCELLED,AIR_TIME,DISTANCE,OCCUPANCY_RATE
0,2019-03-02,WN,N955WN,4591,14635,RSW,"Fort Myers, FL",11042,CLE,"Cleveland, OH",-8.0,-6.0,0.0,143.0,1025.0,0.97
1,2019-03-02,WN,N8686A,3231,14635,RSW,"Fort Myers, FL",11066,CMH,"Columbus, OH",1.0,5.0,0.0,135.0,930.0,0.55
2,2019-03-02,WN,N201LV,3383,14635,RSW,"Fort Myers, FL",11066,CMH,"Columbus, OH",0.0,4.0,0.0,132.0,930.0,0.91
3,2019-03-02,WN,N413WN,5498,14635,RSW,"Fort Myers, FL",11066,CMH,"Columbus, OH",11.0,14.0,0.0,136.0,930.0,0.67
4,2019-03-02,WN,N7832A,6933,14635,RSW,"Fort Myers, FL",11259,DAL,"Dallas, TX",0.0,-17.0,0.0,151.0,1005.0,0.62


In [118]:
# understand the shape of the dataset with the no. of rows and cols
flights_df.shape
# checking for index
flights_df.index.values

array([      0,       1,       2, ..., 1915883, 1915884, 1915885])

In [120]:
# Print the information, shape and data type for the data frame
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1915886 entries, 0 to 1915885
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   FL_DATE            object 
 1   OP_CARRIER         object 
 2   TAIL_NUM           object 
 3   OP_CARRIER_FL_NUM  object 
 4   ORIGIN_AIRPORT_ID  int64  
 5   ORIGIN             object 
 6   ORIGIN_CITY_NAME   object 
 7   DEST_AIRPORT_ID    int64  
 8   DESTINATION        object 
 9   DEST_CITY_NAME     object 
 10  DEP_DELAY          float64
 11  ARR_DELAY          float64
 12  CANCELLED          float64
 13  AIR_TIME           object 
 14  DISTANCE           object 
 15  OCCUPANCY_RATE     float64
dtypes: float64(4), int64(2), object(10)
memory usage: 233.9+ MB


In [122]:
# Excluding cols that are unnecessary to the dataset
drop_col = ['OP_CARRIER_FL_NUM','ORIGIN_AIRPORT_ID','DEST_AIRPORT_ID']
flights = flights_df.drop(drop_col, axis = 1)

In [124]:
# Determine the count of unique values in each col in the data frame
flights.nunique()

FL_DATE              134
OP_CARRIER            26
ORIGIN               361
ORIGIN_CITY_NAME     355
DESTINATION          361
DEST_CITY_NAME       355
DEP_DELAY           1358
ARR_DELAY           1393
CANCELLED              2
AIR_TIME            1208
DISTANCE            1956
OCCUPANCY_RATE       545
dtype: int64

In [126]:
# Checking any rows are missing any data
flights.isnull().sum()

FL_DATE                 0
OP_CARRIER              0
ORIGIN                  0
ORIGIN_CITY_NAME        0
DESTINATION             0
DEST_CITY_NAME          0
DEP_DELAY           50351
ARR_DELAY           55991
CANCELLED               0
AIR_TIME            56551
DISTANCE              630
OCCUPANCY_RATE        310
dtype: int64

In [168]:
# Change the datatype of FL_DATE to datetime
flights['FL_DATE'] = pd.to_datetime(flights['FL_DATE'], infer_datetime_format=True)
# Change the dataytpe of ARR_DELAY, DEP_DELAY, OCCUPANCY_RATE & DISTANCE to float as object does not make sense
flights['DEP_DELAY'] = flights['DEP_DELAY'].astype('float64')
flights['ARR_DELAY'] = flights['ARR_DELAY'].astype('float64')
flights['OCCUPANCY_RATE'] = flights['OCCUPANCY_RATE'].astype('float64')
# Cancelled col comprising of 0 and 1 should be converted to type bool from type float
flights.astype({'CANCELLED':'bool'})


Unnamed: 0,FL_DATE,OP_CARRIER,ORIGIN,ORIGIN_CITY_NAME,DESTINATION,DEST_CITY_NAME,DEP_DELAY,ARR_DELAY,CANCELLED,AIR_TIME,DISTANCE,OCCUPANCY_RATE
0,2019-03-02,WN,RSW,"Fort Myers, FL",CLE,"Cleveland, OH",-8.0,-6.0,False,,,0.970000
1,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",1.0,5.0,False,,,0.550000
2,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",0.0,4.0,False,,,0.910000
3,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",11.0,14.0,False,,,0.670000
4,2019-03-02,WN,RSW,"Fort Myers, FL",DAL,"Dallas, TX",0.0,-17.0,False,,,0.620000
...,...,...,...,...,...,...,...,...,...,...,...,...
1915881,2019-03-23,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-9.0,-6.0,False,,,0.794884
1915882,2019-03-24,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-2.0,-1.0,False,,,0.538399
1915883,2019-03-25,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-8.0,-25.0,False,,,0.955579
1915884,2019-03-26,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-9.0,-6.0,False,,,0.595344


In [178]:
# A function to clean non numeric values and then change the data type of AIR_TIME & DISTANCE to float
def clean_value(value):
    value = re.sub(r'[^0-9]+','',str(value))
    if value == '':
            value = np.nan
            
flights['AIR_TIME'] = flights['AIR_TIME'].apply(lambda x: clean_value(x))
flights['DISTANCE'] = flights['DISTANCE'].apply(lambda x: clean_value(x))
flights['AIR_TIME'] = flights['AIR_TIME'].astype('float64')
flights['DISTANCE'] = flights['DISTANCE'].astype('float64')

In [182]:
# Fill the missing values with the mean
flights['DEP_DELAY'] = flights['DEP_DELAY'].fillna(flights['DEP_DELAY'].mean())
flights['ARR_DELAY'] = flights['ARR_DELAY'].fillna(flights['ARR_DELAY'].mean())
flights['AIR_TIME'] = flights['AIR_TIME'].fillna(flights['AIR_TIME'].mean())
flights['OCCUPANCY_RATE'] = flights['OCCUPANCY_RATE'].fillna(flights['OCCUPANCY_RATE'].mean())
flights['DISTANCE'] = flights['DISTANCE'].fillna(flights['DISTANCE'].mean())

In [186]:
# import dataset and read the data
airports_df = pd.read_csv("Airport_Codes.csv")
# Taking a backup of the original data
airports_backup = airports_df.copy()
# head returns the first 5 rows of the Tickets.csv data to get an idea of the data
airports_df.head(5)

Unnamed: 0,TYPE,NAME,ELEVATION_FT,CONTINENT,ISO_COUNTRY,MUNICIPALITY,IATA_CODE,COORDINATES
0,heliport,Total Rf Heliport,11.0,,US,Bensalem,,"-74.93360137939453, 40.07080078125"
1,small_airport,Aero B Ranch Airport,3435.0,,US,Leoti,,"-101.473911, 38.704022"
2,small_airport,Lowell Field,450.0,,US,Anchor Point,,"-151.695999146, 59.94919968"
3,small_airport,Epps Airpark,820.0,,US,Harvest,,"-86.77030181884766, 34.86479949951172"
4,closed,Newport Hospital & Clinic Heliport,237.0,,US,Newport,,"-91.254898, 35.6087"


In [194]:
# Understand the shape of the dataset with the no. of rows and cols
airports_df.shape
# Checking for index
airports_df.index.values

array([    0,     1,     2, ..., 55366, 55367, 55368])

In [195]:
# Print the information, shape and data type for the data frame
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55369 entries, 0 to 55368
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TYPE          55369 non-null  object 
 1   NAME          55369 non-null  object 
 2   ELEVATION_FT  48354 non-null  float64
 3   CONTINENT     27526 non-null  object 
 4   ISO_COUNTRY   55122 non-null  object 
 5   MUNICIPALITY  49663 non-null  object 
 6   IATA_CODE     9182 non-null   object 
 7   COORDINATES   55369 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.4+ MB


In [197]:
# Determine the count of unique values in each col in the data frame
airports_df.nunique()

TYPE                7
NAME            52426
ELEVATION_FT     5478
CONTINENT           6
ISO_COUNTRY       242
MUNICIPALITY    27221
IATA_CODE        9062
COORDINATES     55240
dtype: int64

In [200]:
# Checking any rows are missing any data
airports_df.isnull().sum()

TYPE                0
NAME                0
ELEVATION_FT     7015
CONTINENT       27843
ISO_COUNTRY       247
MUNICIPALITY     5706
IATA_CODE       46187
COORDINATES         0
dtype: int64

In [202]:
# Filling the missing values
airports_df.fillna(0)

Unnamed: 0,TYPE,NAME,ELEVATION_FT,CONTINENT,ISO_COUNTRY,MUNICIPALITY,IATA_CODE,COORDINATES
0,heliport,Total Rf Heliport,11.0,0,US,Bensalem,0,"-74.93360137939453, 40.07080078125"
1,small_airport,Aero B Ranch Airport,3435.0,0,US,Leoti,0,"-101.473911, 38.704022"
2,small_airport,Lowell Field,450.0,0,US,Anchor Point,0,"-151.695999146, 59.94919968"
3,small_airport,Epps Airpark,820.0,0,US,Harvest,0,"-86.77030181884766, 34.86479949951172"
4,closed,Newport Hospital & Clinic Heliport,237.0,0,US,Newport,0,"-91.254898, 35.6087"
...,...,...,...,...,...,...,...,...
55364,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,Yingkou,YKH,"122.3586, 40.542524"
55365,medium_airport,Shenyang Dongta Airport,0.0,AS,CN,Shenyang,0,"123.49600219726562, 41.784400939941406"
55366,heliport,Sealand Helipad,40.0,EU,GB,Sealand,0,"1.4825, 51.894444"
55367,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,Grande Glorieuse,0,"47.296388888900005, -11.584277777799999"


In [207]:
# Split the coordinates col into longitude and latitude
# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in airports_df['COORDINATES']:
    # Try to,
    try:
        # Split the row by comma and append
        # everything before the comma to lat
        lat.append(row.split(',')[1])
        # Split the row by comma and append
        # everything after the comma to lon
        lon.append(row.split(',')[0])
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
airports_df['latitude'] = lat
airports_df['longitude'] = lon

airports_df

Unnamed: 0,TYPE,NAME,ELEVATION_FT,CONTINENT,ISO_COUNTRY,MUNICIPALITY,IATA_CODE,COORDINATES,latitude,longitude
0,heliport,Total Rf Heliport,11.0,,US,Bensalem,,"-74.93360137939453, 40.07080078125",40.07080078125,-74.93360137939453
1,small_airport,Aero B Ranch Airport,3435.0,,US,Leoti,,"-101.473911, 38.704022",38.704022,-101.473911
2,small_airport,Lowell Field,450.0,,US,Anchor Point,,"-151.695999146, 59.94919968",59.94919968,-151.695999146
3,small_airport,Epps Airpark,820.0,,US,Harvest,,"-86.77030181884766, 34.86479949951172",34.86479949951172,-86.77030181884766
4,closed,Newport Hospital & Clinic Heliport,237.0,,US,Newport,,"-91.254898, 35.6087",35.6087,-91.254898
...,...,...,...,...,...,...,...,...,...,...
55364,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,Yingkou,YKH,"122.3586, 40.542524",40.542524,122.3586
55365,medium_airport,Shenyang Dongta Airport,,AS,CN,Shenyang,,"123.49600219726562, 41.784400939941406",41.784400939941406,123.49600219726562
55366,heliport,Sealand Helipad,40.0,EU,GB,Sealand,,"1.4825, 51.894444",51.894444,1.4825
55367,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,Grande Glorieuse,,"47.296388888900005, -11.584277777799999",-11.584277777799999,47.296388888900005


In [212]:
# Excluding cols that are unnecessary to the dataset
dropcol = ['ELEVATION_FT','CONTINENT','ISO_COUNTRY', 'MUNICIPALITY']
airports = airports_df.drop(dropcol, axis = 1)

In [214]:
# Merging the data frames


Unnamed: 0,TYPE,NAME,IATA_CODE,COORDINATES,latitude,longitude
0,heliport,Total Rf Heliport,,"-74.93360137939453, 40.07080078125",40.07080078125,-74.93360137939453
1,small_airport,Aero B Ranch Airport,,"-101.473911, 38.704022",38.704022,-101.473911
2,small_airport,Lowell Field,,"-151.695999146, 59.94919968",59.94919968,-151.695999146
3,small_airport,Epps Airpark,,"-86.77030181884766, 34.86479949951172",34.86479949951172,-86.77030181884766
4,closed,Newport Hospital & Clinic Heliport,,"-91.254898, 35.6087",35.6087,-91.254898
...,...,...,...,...,...,...
55364,medium_airport,Yingkou Lanqi Airport,YKH,"122.3586, 40.542524",40.542524,122.3586
55365,medium_airport,Shenyang Dongta Airport,,"123.49600219726562, 41.784400939941406",41.784400939941406,123.49600219726562
55366,heliport,Sealand Helipad,,"1.4825, 51.894444",51.894444,1.4825
55367,small_airport,Glorioso Islands Airstrip,,"47.296388888900005, -11.584277777799999",-11.584277777799999,47.296388888900005


Unnamed: 0,ORIGIN,ORIGIN_STATE_ABR,ROUNDTRIP,REPORTING_CARRIER,PASSENGERS,ITIN_FARE,DESTINATION
0,ABI,TX,1.0,MQ,1.0,736.0,DAB
1,ABI,TX,1.0,MQ,1.0,570.0,COS
2,ABI,TX,1.0,MQ,1.0,564.0,MCO
3,ABI,TX,1.0,MQ,1.0,345.0,LGA
4,ABI,TX,0.0,MQ,1.0,309.0,MGM
...,...,...,...,...,...,...,...
1167280,YAK,AK,0.0,AS,1.0,244.0,ANC
1167281,YAK,AK,1.0,AS,1.0,371.0,JNU
1167282,YAK,AK,0.0,AS,1.0,271.0,JNU
1167283,YAK,AK,0.0,AS,1.0,603.0,ANC


Unnamed: 0,FL_DATE,OP_CARRIER,ORIGIN,ORIGIN_CITY_NAME,DESTINATION,DEST_CITY_NAME,DEP_DELAY,ARR_DELAY,CANCELLED,AIR_TIME,DISTANCE,OCCUPANCY_RATE
0,2019-03-02,WN,RSW,"Fort Myers, FL",CLE,"Cleveland, OH",-8.0,-6.0,0.0,,,0.970000
1,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",1.0,5.0,0.0,,,0.550000
2,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",0.0,4.0,0.0,,,0.910000
3,2019-03-02,WN,RSW,"Fort Myers, FL",CMH,"Columbus, OH",11.0,14.0,0.0,,,0.670000
4,2019-03-02,WN,RSW,"Fort Myers, FL",DAL,"Dallas, TX",0.0,-17.0,0.0,,,0.620000
...,...,...,...,...,...,...,...,...,...,...,...,...
1915881,2019-03-23,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-9.0,-6.0,0.0,,,0.794884
1915882,2019-03-24,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-2.0,-1.0,0.0,,,0.538399
1915883,2019-03-25,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-8.0,-25.0,0.0,,,0.955579
1915884,2019-03-26,AA,TUL,"Tulsa, OK",CLT,"Charlotte, NC",-9.0,-6.0,0.0,,,0.595344
