# 1. Import Necessary Libraries

In [5]:
import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

# 2. Loading the Training Data

In [7]:
file_path = r"/Users/karida/Desktop/AWS_SageMaker_Project/Data/train.csv"

train = pd.read_csv(file_path)

In [8]:
train.dtypes

airline             object
date_of_journey     object
source              object
destination         object
dep_time            object
arrival_time        object
duration             int64
total_stops        float64
additional_info     object
price                int64
dtype: object

In [16]:
train = train.assign(**{
    col: pd.to_datetime(train.loc[:, col], format = "mixed")
    for col in ["date_of_journey", "dep_time", "arrival_time"]
})
train.dtypes

airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
dep_time           datetime64[ns]
arrival_time       datetime64[ns]
duration                    int64
total_stops               float64
additional_info            object
price                       int64
dtype: object

# 3. High-Level Analysis of Data

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   airline          6695 non-null   object        
 1   date_of_journey  6695 non-null   datetime64[ns]
 2   source           6695 non-null   object        
 3   destination      6695 non-null   object        
 4   dep_time         6695 non-null   datetime64[ns]
 5   arrival_time     6695 non-null   datetime64[ns]
 6   duration         6695 non-null   int64         
 7   total_stops      6694 non-null   float64       
 8   additional_info  6695 non-null   object        
 9   price            6695 non-null   int64         
dtypes: datetime64[ns](3), float64(1), int64(2), object(4)
memory usage: 523.2+ KB


In [23]:
train.describe(include = "number")

Unnamed: 0,duration,total_stops,price
count,6695.0,6694.0,6695.0
mean,633.489171,0.801464,9030.965646
std,503.006311,0.659737,4639.522857
min,75.0,0.0,1759.0
25%,170.0,0.0,5224.0
50%,510.0,1.0,8283.0
75%,922.5,1.0,12373.0
max,2860.0,3.0,62427.0


In [26]:
(
    train
    .assign(total_stops=train["total_stops"].astype(object))
    .describe(include = "O")
)

Unnamed: 0,airline,source,destination,total_stops,additional_info
count,6695,6695,6695,6694.0,6695
unique,9,5,6,4.0,8
top,Jet Airways,Delhi,Cochin,1.0,No_Info
freq,2391,2730,2730,3580.0,5248


# 4. High-Level Missing Values Analysis

In [41]:
def missing_values(data):
    miss_cols = [col for col in data.columns if data[col].isna().any()]
    miss_counts = [data[col].isna().isna().sum() for col in miss_cols]
    miss_percentage = [(data[col].isna().mean() * 100) for col in miss_cols]

    return (
        pd
        .DataFrame(data={
            'variable': miss_cols,
            'count': miss_counts,
            'percentage': miss_percentage
        })
        .sort_values(by='count', ascending=False)
        .set_index('variable')
    )

In [42]:
missing_values(train)

Unnamed: 0_level_0,count,percentage
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
total_stops,0,0.014937


# 5. High-Level Outlier Analysis