In [42]:
# required python libraries for data
import numpy as np
import pandas as pd
import re 

In [4]:
# import dataset and read the data
tickets_df = pd.read_csv("Tickets.csv")
# Taking a backup of the original data
tickets_backup = tickets_df.copy()
# head returns the first 5 rows of the Tickets.csv data to get an idea of the data
tickets_df.head(5) 

Unnamed: 0,ITIN_ID,YEAR,QUARTER,ORIGIN,ORIGIN_COUNTRY,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,ROUNDTRIP,REPORTING_CARRIER,PASSENGERS,ITIN_FARE,DESTINATION
0,201912723049,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,736.0,DAB
1,201912723085,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,570.0,COS
2,201912723491,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,564.0,MCO
3,201912723428,2019,1,ABI,US,TX,Texas,1.0,MQ,1.0,345.0,LGA
4,201912723509,2019,1,ABI,US,TX,Texas,0.0,MQ,1.0,309.0,MGM


In [7]:
# understand the shape of the dataset with the no. of rows and cols
tickets_df.shape
# checking for index
tickets_df.index.values

array([      0,       1,       2, ..., 1167282, 1167283, 1167284])

In [13]:
# Print the information, shape and data type for the data frame
tickets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167285 entries, 0 to 1167284
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   ITIN_ID            1167285 non-null  int64  
 1   YEAR               1167285 non-null  int64  
 2   QUARTER            1167285 non-null  int64  
 3   ORIGIN             1167285 non-null  object 
 4   ORIGIN_COUNTRY     1167285 non-null  object 
 5   ORIGIN_STATE_ABR   1167285 non-null  object 
 6   ORIGIN_STATE_NM    1167285 non-null  object 
 7   ROUNDTRIP          1167285 non-null  float64
 8   REPORTING_CARRIER  1167285 non-null  object 
 9   PASSENGERS         1165308 non-null  float64
 10  ITIN_FARE          1166325 non-null  object 
 11  DESTINATION        1167285 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 106.9+ MB


In [21]:
# Excluding cols that are unnecessary to the dataset
drop_cols = ['ITIN_ID','YEAR','QUARTER','ORIGIN_COUNTRY','ORIGIN_STATE_NM']
tickets = tickets_df.drop(drop_cols, axis = 1)

In [22]:
# Determine the count of unique values in each col in the data frame
tickets.nunique()

ORIGIN                419
ORIGIN_STATE_ABR       52
ROUNDTRIP               2
REPORTING_CARRIER      21
PASSENGERS            286
ITIN_FARE            3791
DESTINATION           410
dtype: int64

In [31]:
# Checking any rows are missing any data
tickets.isnull().sum()

ORIGIN                  0
ORIGIN_STATE_ABR        0
ROUNDTRIP               0
REPORTING_CARRIER       0
PASSENGERS           1977
ITIN_FARE             960
DESTINATION             0
dtype: int64

In [86]:
# Function to remove random characters in cols
 def clean_value(value):
     return re.sub(r'[^0-9]+','', str(value))

tickets['ITIN_FARE'] = tickets['ITIN_FARE'].apply(lambda x: clean_value(x))


0          False
1          False
2          False
3          False
4          False
           ...  
1167280    False
1167281    False
1167282    False
1167283    False
1167284    False
Name: ITIN_FARE, Length: 1167285, dtype: bool

In [48]:
# Filling the missing values in the Passengers and ITIN FARES
tickets = tickets.fillna(0)

In [57]:
# As it can be seen from the information above:
# Roundtrip col comprising of 0 and 1 should be converted to type bool from type float
# Passengers col being of type float does not make sense and should be converted to type int
# Iternary fare col is of type object which does not make sense as it describes fare and should be converted to type float
# implementing these conversions

tickets.astype({'ROUNDTRIP':'bool'})
tickets['PASSENGERS'] = tickets['PASSENGERS'].astype('int64')
tickets['ITIN_FARE'] = tickets['ITIN_FARE'].astype('float64')

In [59]:
tickets['ITIN_FARE2'].value_counts()


0.0    1167285
Name: ITIN_FARE, dtype: int64