In [29]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import eda_helper_functions
from sklearn.ensemble import IsolationForest

- Reading Data

In [30]:
DATA_DIR=r"C:\SUFIYAN\STUDY MATERIALS\FOR CLASS\sagemaker-flights-price-prediction\data"
def read_data(data):
    path=os.path.join(DATA_DIR,data)
    return pd.read_csv(path)

In [31]:
df=read_data('train.csv')

In [32]:
df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,2019-06-12,Delhi,Cochin,10:35:00,01:30:00,895.0,1,No Info,5883
1,IndiGo,2019-06-18,Bengaluru,Delhi,10:10:00,13:00:00,170.0,0,No Info,3943
2,Jet Airways,2019-05-15,Kolkata,Bengaluru,09:35:00,10:55:00,1520.0,1,No Info,14151
3,Jet Airways,2019-06-03,Delhi,Cochin,19:45:00,19:00:00,1395.0,1,In-flight meal not included,10262
4,IndiGo,2019-05-18,Kolkata,Bengaluru,15:30:00,18:05:00,155.0,0,No Info,4804


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    float64
 7   total_stops      640 non-null    int64  
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [34]:
df.describe()

Unnamed: 0,duration,total_stops,price
count,640.0,640.0,640.0
mean,638.539062,0.83125,9074.9875
std,492.270234,0.678043,4835.381205
min,75.0,0.0,1965.0
25%,170.0,0.0,5101.25
50%,560.0,1.0,8286.5
75%,891.25,1.0,12122.5
max,2315.0,3.0,46490.0


In [35]:
df.dtypes

airline             object
date_of_journey     object
source              object
destination         object
dep_time            object
arrival_time        object
duration           float64
total_stops          int64
additional_info     object
price                int64
dtype: object

In [36]:
df=df.assign(**{
    col: pd.to_datetime(df.loc[:,col], dayfirst=True,format='mixed')
    for col in ['date_of_journey', 'dep_time', 'arrival_time']
})


In [37]:
df.dtypes

airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
dep_time           datetime64[ns]
arrival_time       datetime64[ns]
duration                  float64
total_stops                 int64
additional_info            object
price                       int64
dtype: object

# High-Level Summary of Data

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   airline          640 non-null    object        
 1   date_of_journey  640 non-null    datetime64[ns]
 2   source           640 non-null    object        
 3   destination      640 non-null    object        
 4   dep_time         640 non-null    datetime64[ns]
 5   arrival_time     640 non-null    datetime64[ns]
 6   duration         640 non-null    float64       
 7   total_stops      640 non-null    int64         
 8   additional_info  640 non-null    object        
 9   price            640 non-null    int64         
dtypes: datetime64[ns](3), float64(1), int64(2), object(4)
memory usage: 50.1+ KB


In [39]:
df.describe(include='number')

Unnamed: 0,duration,total_stops,price
count,640.0,640.0,640.0
mean,638.539062,0.83125,9074.9875
std,492.270234,0.678043,4835.381205
min,75.0,0.0,1965.0
25%,170.0,0.0,5101.25
50%,560.0,1.0,8286.5
75%,891.25,1.0,12122.5
max,2315.0,3.0,46490.0


In [40]:
(
    df
    .assign(total_stops=df['total_stops'].astype(object))
    .describe(include='O')
)

Unnamed: 0,airline,source,destination,total_stops,additional_info
count,640,640,640,640,640
unique,12,5,6,4,6
top,Jet Airways,Delhi,Cochin,1,No Info
freq,236,269,269,337,497


# High-Level Analysis of Missing Values

In [41]:
temp=pd.read_csv('C:/SUFIYAN/STUDY MATERIALS/FOR CLASS/FOR CLASS/Basics/train.csv')

In [43]:
a=[col for col in temp.columns if temp[col].isna().any()]

In [44]:
b=[temp[a].isna().sum()]
b

[Age         177
 Cabin       687
 Embarked      2
 dtype: int64]

In [45]:
c=[temp[a].isna().mean()*100]
c

[Age         19.865320
 Cabin       77.104377
 Embarked     0.224467
 dtype: float64]

In [46]:
def missing_in(data):
    na_cols=[col for col in data.columns if data[col].isna().any()]
    na_counts=[data[col].isna().sum() for col in na_cols]
    na_pct=[data[col].isna().mean()*100 for col in na_cols]


    return(
        pd
        .DataFrame(data={
            'Columns':na_cols,
            'Counts':na_counts,
            'Percentage':na_pct
        }).sort_values(by='Counts',ascending=False)
        .set_index('Columns')
    )

In [53]:
missing_in(temp)

Unnamed: 0_level_0,Counts,Percentage
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467


# High-Level Analysis of Outliers

In [48]:
forest=IsolationForest(random_state=42)

In [49]:
(
    df
    .assign(outlier=forest.fit_predict(df
                                      .drop(columns='price')
                                      .select_dtypes(include='number')))
    .query('outlier==-1')
    .sort_values(by='duration',ascending=False)
)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price,outlier
527,Jet Airways,2019-03-27,Delhi,Cochin,2024-07-15 05:10:00,2024-07-15 19:45:00,2315.0,2,In-flight meal not included,8938,-1
524,Jet Airways,2019-03-27,Delhi,Cochin,2024-07-15 06:20:00,2024-07-15 19:45:00,2245.0,2,In-flight meal not included,9296,-1
592,Air India,2019-03-03,Delhi,Cochin,2024-07-15 05:55:00,2024-07-15 19:15:00,2240.0,2,No Info,13748,-1
102,Air India,2019-03-03,Bengaluru,New Delhi,2024-07-15 11:05:00,2024-07-15 22:10:00,2105.0,2,No Info,11948,-1
559,Air India,2019-03-03,Bengaluru,New Delhi,2024-07-15 11:05:00,2024-07-15 22:10:00,2105.0,2,No Info,11791,-1
...,...,...,...,...,...,...,...,...,...,...,...
541,IndiGo,2019-03-21,Mumbai,Hyderabad,2024-07-15 09:10:00,2024-07-15 10:35:00,85.0,0,No Info,4049,-1
422,IndiGo,2019-03-21,Mumbai,Hyderabad,2024-07-15 06:20:00,2024-07-15 07:45:00,85.0,0,No Info,4049,-1
436,Air India,2019-03-21,Mumbai,Hyderabad,2024-07-15 21:05:00,2024-07-15 22:25:00,80.0,0,No Info,2050,-1
93,Air India,2019-06-24,Mumbai,Hyderabad,2024-07-15 06:20:00,2024-07-15 07:40:00,80.0,0,No Info,3100,-1


# Automated EDA Report

In [50]:
from ydata_profiling import ProfileReport

In [55]:
report=ProfileReport(df)

In [57]:
report.to_file(output_file='output.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]