In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the EaseMyTrip fare sample, relative to the notebook's working directory.
DATA_PATH = '../input/easemytrip-flight-fare-details-2020/marketing_sample_for_easemytrip_in-easemytrip_flight_fares_data__20200101_20200229__30k_data.csv'

# Load the raw CSV into `df` and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Remove the row identifier and crawl timestamp columns.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
df = df.drop(columns=['Uniq Id', 'Crawl Timestamp'], errors='ignore')

### Dropping the `Uniq Id` and `Crawl Timestamp` columns since they are not informative

In [None]:
# Preview the first rows again to check the remaining columns.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

### From the `df.info()` output, nearly 28K of the 30K flights have 1 layover, while only 7 flights have 3 layovers

## Converting the Departure Date and Arrival Date columns to datetime format

In [None]:
# Three-letter month abbreviations and the two-digit month numbers that replace
# them, so the column becomes an all-digit string parseable with '%d%m%Y'.
MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

# One literal (non-regex) replacement per month.  The vectorised `.str.replace`
# leaves missing values as NaN instead of raising AttributeError the way
# `apply(lambda x: x.replace(...))` does on a float NaN.
for month_abbr, month_num in MONTH_NUMBERS.items():
    df["Departure Date"] = df["Departure Date"].str.replace(month_abbr, month_num, regex=False)

In [None]:
# Three-letter month abbreviations and the two-digit month numbers that replace
# them, so the column becomes an all-digit string parseable with '%d%m%Y'.
MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

# One literal (non-regex) replacement per month.  The vectorised `.str.replace`
# leaves missing values as NaN instead of raising AttributeError the way
# `apply(lambda x: x.replace(...))` does on a float NaN.
for month_abbr, month_num in MONTH_NUMBERS.items():
    df["Arrival Date"] = df["Arrival Date"].str.replace(month_abbr, month_num, regex=False)

In [None]:
# Parse both all-digit day/month/year columns into datetime values.
for date_col in ('Arrival Date', 'Departure Date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%d%m%Y')

## Calculating the total journey time in hrs for all flights

In [None]:
import re

from tqdm.notebook import tqdm

# Matches one "<number>h" or "<number>m" component of a duration string.
_DURATION_PART = re.compile(r'(\d+(?:\.\d+)?)\s*([hm])')


def _duration_to_hours(text):
    """Convert a duration such as '2h 35m' into hours, rounded to 2 decimals.

    Either component may be absent ('2h' or '45m'); the missing one counts
    as zero instead of raising, which the split-on-'h' approach did.

    Parameters
    ----------
    text : str or missing value
        Raw 'Total Time' entry.

    Returns
    -------
    float
        Hours rounded to two decimals, or NaN when `text` is missing.

    Raises
    ------
    ValueError
        If `text` contains neither an hour nor a minute component.
    """
    if pd.isna(text):
        return float('nan')
    parts = {unit: float(value) for value, unit in _DURATION_PART.findall(str(text))}
    if not parts:
        raise ValueError(f"Unrecognised 'Total Time' value: {text!r}")
    return round(parts.get('h', 0.0) + parts.get('m', 0.0) / 60, 2)


# Iterate over the single column that is needed (no per-row Series from
# iterrows()); tqdm reads the total from the Series length.
total_time_hrs_lst = [
    _duration_to_hours(raw_duration)
    for raw_duration in tqdm(df['Total Time'], desc='Total Time -> hours')
]

In [None]:
# Attach the per-row journey time in hours as a new column.
df['Total Time Hrs'] = total_time_hrs_lst

## Analysing the source column

In [None]:
# The 30 most frequent values of the Source column.
df['Source'].value_counts().head(30)

In [None]:
# The 30 least frequent values of the Source column.
df['Source'].value_counts().tail(30)

In [None]:
# Bar chart of the 50 most frequent Source values.
source_counts = df['Source'].value_counts().to_frame().head(50)
ax = source_counts.plot(kind='bar', figsize=(15, 10))
ax.set(
    title='Number of flights from each source',
    xlabel='Source',
    ylabel='Number of Flights',
)
# Place a y tick every 200 flights across the current axis range;
# assigning the result keeps the tick list out of the cell output.
y_low, y_high = ax.get_ylim()
z = ax.yaxis.set_ticks(np.arange(y_low, y_high, 200))

### 1. More flights depart from Mumbai than from any other source
### 2. The sources with the next highest numbers of flights are Bengaluru, Ahmedabad, Kolkata, Delhi and Dubai
### 3. All the metro cities in India have a high number of flights, which is expected because those cities have international airports
### 4. Conversely, the sources with the fewest departing flights are Tier 3 cities and towns in India

## Analysing the Destination Column

In [None]:
# The 30 most frequent values of the Destination column.
df['Destination'].value_counts().head(30)

In [None]:
# The 30 least frequent values of the Destination column.
df['Destination'].value_counts().tail(30)

In [None]:
# Bar chart of the 50 most frequent Destination values.
destination_counts = df['Destination'].value_counts().to_frame().head(50)
ax = destination_counts.plot(kind='bar', figsize=(15, 10))
ax.set(
    title='Number of flights to each destination',
    xlabel='Destination',
    ylabel='Number of Flights',
)
# Place a y tick every 200 flights across the current axis range;
# assigning the result keeps the tick list out of the cell output.
y_low, y_high = ax.get_ylim()
z = ax.yaxis.set_ticks(np.arange(y_low, y_high, 200))

### 1. There are 4.5K flights with Delhi as the destination
### 2. All the top cities in India receive a high number of flights, which is expected because they are the main cities in India and have international airports
### 3. Conversely, the destinations with the fewest arriving flights are Tier 3 cities and towns in India

## Analysing the Source and Number of Stops

In [None]:
# Per-source totals of the numeric columns, ordered by total number of stops.
# `numeric_only=True` restricts the sum to numeric columns: without it,
# pandas >= 2.0 tries to sum the datetime columns (TypeError) and
# concatenates the string columns.
d = (
    df.groupby('Source')
    .sum(numeric_only=True)
    .reset_index()
    .sort_values(by=['Number Of Stops'], ascending=False)
)

In [None]:
# First 20 rows of `d`.
d.head(20)

In [None]:
# Last 30 rows of `d`.
d.tail(30)

In [None]:
# Bar chart of 'Number Of Stops' for the first 20 rows of `d`.
fig_dims = (20, 10)
fig, ax = plt.subplots(figsize=fig_dims)
top_sources = d.head(20)
ax = sns.barplot(x=top_sources['Source'], y=top_sources['Number Of Stops'], ax=ax)
ax.set(
    title='Number of LayOvers a Source Station has in the given period',
    xlabel='Source',
    ylabel='Number of Layovers',
)
# Place a y tick every 200 units across the current axis range;
# assigning the result keeps the tick list out of the cell output.
y_low, y_high = ax.get_ylim()
z = ax.yaxis.set_ticks(np.arange(y_low, y_high, 200))

### Flights departing from Mumbai account for the highest total number of layovers
### Flights from the top cities in India have more layovers because they cover long distances and connect through intermediate places as well.

## Analysing Flight Operator Data

In [None]:
# Pull the 'Flight Operator' column out of the frame as a plain Python list.
flights = df['Flight Operator'].values
flights_lst = list(flights)

In [None]:
from collections import Counter

from tqdm.notebook import tqdm

# Count, for every operator, the number of flights (rows) whose pipe-separated
# 'Flight Operator' value lists it.  Each value is split into exact operator
# names first: a substring test such as `"Air India" in "Air India Express"`
# is True, so it credits one operator with another operator's flights.
operator_counter = Counter()
for flight in tqdm(flights_lst):
    # set(): an operator that appears several times in one flight counts once.
    operator_counter.update(set(flight.split('|')))

# Keep the names the following cells rely on.
flight_operators = set(operator_counter)
flight_operator_count = dict(operator_counter)

In [None]:
# Turn the {operator: count} mapping into a two-column frame.
operator_data = pd.DataFrame(list(flight_operator_count.items()), columns=['Operator', 'Number Of Flights'])

In [None]:
# Order the operators from most to fewest flights.
operator_data = operator_data.sort_values('Number Of Flights', ascending=False)

In [None]:
# First 10 rows of `operator_data`.
operator_data.head(10)

In [None]:
import plotly.express as px

# Interactive bar chart of the first 20 rows of `operator_data`.
top_operators = operator_data.head(20)
px.bar(top_operators, x="Operator", y="Number Of Flights")

### Air India operated more flights (approx. 11K) than any other operator, while Indigo is in 2nd place (approx. 4.2K flights).

In [None]:
# Preview the frame, which now includes the 'Total Time Hrs' column.
df.head()

## Analysing Total Travel Time Data 

In [None]:
# Summary statistics of the journey time in hours.
df['Total Time Hrs'].describe()

In [None]:
# Rows whose journey time is exactly 68.58 hours
# (presumably the maximum reported by describe() — TODO confirm).
df[df['Total Time Hrs'] == 68.58]

In [None]:
# Rows whose journey time is exactly 0.67 hours
# (presumably the minimum reported by describe() — TODO confirm).
df[df['Total Time Hrs'] == 0.67]

It is observed that all the flights with a duration of less than 1 hour are between nearby cities, so the travel time is short.