In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns

# Read the flight records from the CSV file next to this notebook and preview
# the first three rows.
df = pd.read_csv("flight.csv")
df.head(n=3)



The dataset comprises several key features, each serving a unique purpose:

- Airline: Within the "Airline" column, we find the names of various airline companies. This is a categorical feature with six different airlines represented.

- Flight: The "Flight" column contains information about the flight code, representing different flights. It's also a categorical feature.

- Source City: "Source City" denotes the city from which the flight originates. This categorical feature encompasses six distinct cities.

- Departure Time: This categorical feature is derived by categorizing departure times into different bins. It includes six unique time labels, providing insights into the timing of flights.

- Stops: The "Stops" feature is categorical and offers information about the number of stops between the source and destination cities, with three possible values.

- Arrival Time: Similar to "Departure Time," this derived categorical feature categorizes arrival times into specific bins. It features six distinct time labels, offering details on arrival timings.

- Destination City: This categorical feature indicates the city where the flight is scheduled to land. It comprises six unique city destinations.

- Class: The "Class" feature is categorical and captures information about the seat class. It distinguishes between "Business" and "Economy" classes.

- Duration: A continuous feature, "Duration," quantifies the total travel time between cities in hours.

- Days Left: A derived feature, calculated as the number of days between the booking date and the trip date. It shows how far in advance of travel the ticket was booked.

- Price: This is the target variable, representing the ticket price.

These descriptions provide an overview of the dataset's features, each contributing distinct information relevant to flight data analysis.

#### Cleaning

In [None]:
# Cleaning unnecessary columns
#
# Neither the "Unnamed: 0" column nor the flight code is needed for the
# analysis, so both are dropped.
# `errors="ignore"` keeps this cell idempotent: re-running it once the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0", "flight"], errors="ignore")
df.head(3)




### Outlier detection and removal


In [None]:
def remove_outliers(df, s, column="price", n_std=1.0):
    """Keep only rows whose ``column`` value lies close to the mean of their group.

    The frame is split by the grouping key(s) ``s``. Within each group a row is
    kept when its value is inside ``[mean - n_std * std, mean + n_std * std]``,
    where ``std`` is the population standard deviation (``ddof=0``).

    Both bounds are inclusive, so a group whose values are all identical
    (``std == 0``) is kept as-is instead of being discarded entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    s : str or list of str
        Column name(s) passed to ``DataFrame.groupby``.
    column : str, default "price"
        Numeric column the band is computed on.
    n_std : float, default 1.0
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered group by group, with a fresh 0..n-1 index.
        An input without any groups yields an empty frame with the same columns.
    """
    kept = []
    for _, subdf in df.groupby(s):
        values = subdf[column]
        center = values.mean()
        spread = values.std(ddof=0)
        lower = center - n_std * spread
        upper = center + n_std * spread
        kept.append(subdf[(values >= lower) & (values <= upper)])

    if not kept:
        # pd.concat([]) raises ValueError; return an empty frame that keeps the schema.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)

# Run the filter twice on the same unfiltered frame: once grouped by airline,
# once grouped by class.
# NOTE(review): df1 is not used by the statistics below — confirm whether the
# two passes were meant to be chained.
df1 = remove_outliers(df, "airline")
df2 = remove_outliers(df, "class")

# Row counts before and after the class-wise filter.
rows_before = len(df)
rows_after = len(df2)
reduction = rows_before - rows_after
percentage_reduction = (reduction / rows_before) * 100

print(f"In the outlier detection and removal process, {reduction} data points were removed.")
print(f"The length of the dataframe after outlier removal is: {rows_after}")
print(f"The percentage of data reduction is: {percentage_reduction:.2f}%")


In [None]:
# Work on an independent copy so that later column assignments on `final`
# cannot silently rewrite `df2` through a shared reference.
final = df2.copy()

# EXPLORATORY DATA ANALYSIS

In [None]:
# Report the (rows, columns) shape of the frame used for the rest of the analysis.
print ("the shape of the final dataframe we are going to work with is : ",final.shape)


##### The data frame contains 10 attributes and 223076 rows in total and looks like:

In [None]:
# Preview the first rows of the working frame.
final.head()

Let us get a summary of the DataFrame, including the number of non-null values in each column.


In [None]:
# Print the per-column summary of the working frame.
final.info()

###### As the `final.info()` output above shows, there are no null values in our dataset.

In [None]:
# Count the distinct values in each column of the working frame.
final.nunique()

## Our dataset contains 7 categorical variables

| Categorical variable     | Classes                                    |
|--------------------------|--------------------------------------------|
| Airline                  | Air_India, Vistara, SpiceJet, AirAsia, GO_FIRST, Indigo |
| Source City              | Delhi, Mumbai, Bangalore, Kolkata, Hyderabad, Chennai |
| Departure Time           | Evening, Night, Morning, Early_Morning, Afternoon, Late_Night |
| Stops                    | one, zero, two_or_more                      |
| Arrival Time             | Night, Afternoon, Evening, Morning, Late_Night, Early_Morning |
| Destination City         | Mumbai, Bangalore, Kolkata, Hyderabad, Chennai, Delhi |
| Class                    | Business, Economy                           |


In [None]:
# Summary statistics of the numeric columns in the working frame.
final.describe()


**Initial Observations: Duration**

- The dataset reveals an average (mean) flight duration of approximately 12.22 hours, reflecting the typical flight duration.

- Among the flights, the shortest recorded duration is just 0.83 hours, signifying exceptionally brief journeys, possibly within the same region.

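- Conversely, the longest flight duration in the dataset extends to 49.83 hours. Since every route connects two of the six Indian cities listed above, such durations most likely reflect multi-stop itineraries with long layovers rather than long-haul flights.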

**Initial Observations: Days Left Before Departure**
- On average, flight bookings occur approximately 26 days before the scheduled departure date, indicating a general trend of advanced planning.

- The shortest lead time in the dataset is just 1 day before departure, indicating last-minute reservations, potentially due to urgent travel needs.

- In contrast, the dataset also includes instances where bookings were made as early as 49 days ahead of the departure date, reflecting meticulous planning or early reservations.

**Initial Observations: Price**
- The dataset showcases an average flight price of around 20889.66 currency units, providing an overview of typical pricing for the flights.

- The lowest recorded price in the dataset stands at 1105 currency units, highlighting budget-friendly flight options.

- Conversely, the highest recorded price reaches 123071 currency units, pointing towards premium or long-haul flight options.

**Quartiles:**

- The 25th percentile of 'duration' stands at 6.83 hours, indicating that 25% of the flights in the dataset have a duration of 6.83 hours or less, representing relatively shorter flights.

- The 50th percentile, often referred to as the median, is 11.25 hours, signifying that half of the flights have a duration of 11.25 hours or less, portraying the typical duration.

- The 75th percentile of 'duration' is 16.17 hours, suggesting that 75% of the flights have a duration of 16.17 hours or less, with a subset involving longer journeys.


In [None]:
# Descriptive statistics of the ticket price alone, followed by a blank spacer line.
price_summary = final['price'].describe()
print("Summary Statistics of Price:")
print(price_summary)
print()


However, since Price and Class are highly correlated, let's first check that correlation and then calculate the price statistics class-wise.

In [None]:
# Encode the cabin class numerically on a separate frame for the correlation.
# `final` must keep its "Economy"/"Business" labels because later cells filter
# and group on them; assigning into a plain alias of `final` would overwrite them.
class_codes = {'Economy': 0, 'Business': 1}
final_corr = final.assign(**{'class': final['class'].map(class_codes)})

# Pearson correlation between the encoded class and the ticket price.
final_corr[["class","price"]].corr()


In [None]:
# Descriptive statistics of the ticket price, computed separately for each class value.
price_by_class = final.groupby("class")["price"]
summary_stats = price_by_class.describe()

# Show the resulting table as plain text.
print(summary_stats)


In [None]:
# Histogram (30 bins, KDE overlay) of every ticket price in the working frame.
# `plt` and `sns` come from the notebook's import cell, so they are not
# re-imported here.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(final['price'], bins=30, kde=True, color='red', ax=ax)
ax.set_title('Distribution of Flight Prices', fontsize=16)
ax.set_xlabel('Price of Ticket in Rs.', fontsize=12)
ax.set_ylabel('Count of Tickets', fontsize=12)
ax.tick_params(axis='both', labelsize=10)
# Horizontal guide lines only, so the bars stay easy to compare.
ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine(ax=ax)
plt.show()


In [None]:
# Price histogram restricted to the rows whose class value equals "Economy".
economy_fares = final[final['class'] == "Economy"]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=economy_fares, x='price', bins=30, kde=True, color='red', ax=ax)
ax.set_title('Distribution of Flight Prices for Economy Class')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Price histogram restricted to the rows whose class value equals "Business".
business_fares = final[final['class'] == "Business"]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=business_fares, x='price', bins=30, kde=True, color='red', ax=ax)
ax.set_title('Distribution of Flight Prices for Business Class')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Horizontal box plot of the ticket price, drawn in the third colour of
# seaborn's "Set2" palette.
custom_palette = sns.color_palette("Set2")
box_color = custom_palette[2]

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=final,
    x="price",
    color=box_color,
    fliersize=5,
    linewidth=2,
    width=0.4,
    ax=ax,
)
ax.set_title('Box Plot of Price Distribution', fontsize=16)
ax.set_xlabel('Price of Ticket in Rs.', fontsize=12)
plt.xticks(fontsize=12)
sns.despine()
plt.show()


In [None]:
# Interactive box plot of the ticket price, one box per class value.
# `px` comes from the notebook's import cell.
class_box_labels = {"class": "Class", "price": "Price of Ticket in Rs."}

fig = px.box(
    final,
    x="class",
    y="price",
    color="class",
    title='Box Plot of Price Distribution by Class',
    labels=class_box_labels,
    width=800,
    height=800,
)

fig.show()


In [None]:
# Interactive box plot of the ticket price per airline, laid out 1800 px tall.
airline_box = px.box(final, x="airline", y="price", color="airline")
# update_layout returns the figure, so it is displayed as the cell's result.
airline_box.update_layout(height=1800)

In [None]:

# Ticket price distribution per source city.
# Plotly draws its own figure, so no matplotlib figure is opened here: a
# `plt.figure(...)` call would only leave an empty matplotlib figure behind.
px.box(final, y="price", x="source_city", color="source_city").update_layout(
    xaxis_title="Source City", yaxis_title="Price of ticket"
)

In [None]:

# Ticket price distribution per destination city.
# Plotly draws its own figure, so no matplotlib figure is opened here: a
# `plt.figure(...)` call would only leave an empty matplotlib figure behind.
px.box(final, y="price", x="destination_city", color="destination_city").update_layout(
    xaxis_title="Destination City", yaxis_title="Price of ticket"
)

In [None]:
# Box plot for Departure Time (`px` comes from the notebook's import cell).
departure_layout = dict(
    xaxis_title="Departure Time",
    yaxis_title="Price of Ticket",
    title="Box Plot of Price Distribution by Departure Time",
    width=700,
    height=350,
)

fig_departure = px.box(final, y="price", x="departure_time", color="departure_time")
# update_layout returns the figure, so it is displayed as the cell's result.
fig_departure.update_layout(**departure_layout)


In [None]:
def _price_box_by_time(frame, cabin_class, time_column, time_label):
    """Build a plotly box plot of ticket price per time bucket for one cabin class.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with "class", "price" and ``time_column`` columns.
    cabin_class : str
        Value of the "class" column to keep; it is also used in the title.
    time_column : str
        Column used for the x axis and for the box colours.
    time_label : str
        Human-readable axis label, also used in the title.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured 700x350 figure; the caller decides when to show it.
    """
    figure = px.box(
        frame[frame['class'] == cabin_class],
        y="price",
        x=time_column,
        color=time_column,
    )
    figure.update_layout(
        xaxis_title=time_label,
        yaxis_title="Price of Ticket",
        title=f"Box Plot of Price Distribution for {cabin_class} Class by {time_label}",
        width=700,
        height=350,
    )
    return figure


# One figure per (class, time-of-day column) combination, built by the shared helper.
fig_business_departure = _price_box_by_time(final, 'Business', "departure_time", "Departure Time")
fig_business_arrival = _price_box_by_time(final, 'Business', "arrival_time", "Arrival Time")
fig_economy_departure = _price_box_by_time(final, 'Economy', "departure_time", "Departure Time")
fig_economy_arrival = _price_box_by_time(final, 'Economy', "arrival_time", "Arrival Time")

fig_business_departure.show()
fig_business_arrival.show()
fig_economy_departure.show()
fig_economy_arrival.show()


In [None]:
# Bar chart of the number of rows per airline, with the most frequent airline highlighted.
# NOTE(review): this cell reads `df` rather than `final` — confirm that is intended.
most_common_airline = df['airline'].mode().values[0]

# A single colour definition feeds both the bars and the legend, so the two
# cannot drift apart.
highlight_color = 'blue'
other_color = 'lightgray'

# Airlines in order of first appearance, with their row counts.
airline_order = list(df['airline'].dropna().unique())
raw_airline_counts = df['airline'].value_counts().reindex(airline_order)
bar_colors = [
    highlight_color if airline == most_common_airline else other_color
    for airline in airline_order
]

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(airline_order, raw_airline_counts.to_numpy(), color=bar_colors, edgecolor='black')

ax.set_title('Histogram of Airlines', fontsize=16)
ax.set_xlabel('Airline', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.tick_params(axis='x', labelrotation=45, labelsize=10)

legend_labels = ['Most Common Airline', 'Other Airlines']
legend_colors = [highlight_color, other_color]
legend_patches = [plt.Rectangle((0, 0), 1, 1, color=patch_color) for patch_color in legend_colors]
ax.legend(legend_patches, legend_labels, title='Legend', fontsize=10)

ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine(ax=ax)

plt.show()


In [None]:
# Pie chart of each airline's share of the rows in the working frame.
airline_counts = final['airline'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Airline Distribution')
ax.pie(
    airline_counts,
    labels=airline_counts.index,
    autopct='%1.1f%%',
    startangle=160,
)

plt.show()


In [None]:
# Number of rows per airline in the working frame, coloured with the "cubehelix" palette.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='airline', data=final, palette='cubehelix', ax=ax)
ax.set_title('Flight Counts by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Ticket price against the number of days left before departure.
# NOTE(review): this cell reads `df` rather than `final` — confirm that is intended.
fig, ax = plt.subplots(figsize=(15, 3))
sns.lineplot(data=df, x='days_left', y='price', color="red", ax=ax)
# `days_left` counts the days between booking and the trip, so the axis shows
# days remaining until departure, not until booking.
ax.set_xlabel("Days left before departure")
ax.set_ylabel("Price of ticket")

ax.grid(True)
plt.show()

In [None]:
# Ticket price against days left before departure, one line per airline.
fig, ax = plt.subplots(figsize=(15, 3))
sns.lineplot(data=final, x='days_left', y='price', hue="airline", ax=ax)
# Title and axis labels so the figure can be read on its own.
ax.set_title("Ticket Price vs. Days Left, by Airline")
ax.set_xlabel("Days left before departure")
ax.set_ylabel("Price of ticket")
plt.show()

In [None]:
# Row counts for every (airline, class) combination.
contingency_table = pd.crosstab(final['airline'], final['class'])

fig, ax = plt.subplots(figsize=(10, 6))
# fmt='d' prints the annotated counts as plain integers rather than in the
# default general format, which abbreviates large numbers.
sns.heatmap(contingency_table, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Contingency Table: Airline vs. Class')
ax.set_xlabel('Class')
ax.set_ylabel('Airline')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Third-party reporting library used by the report cells that follow.
import sweetviz
# Analyse the working frame with sweetviz's default settings.
report=sweetviz.analyze(final)

In [None]:
# Export the report as HTML using show_html()'s default arguments.
report.show_html()

In [None]:
# Second report: the frame is passed together with the label "Flight Price Data",
# and "price" is set as the target feature.
rep2=sweetviz.analyze([final,"Flight Price Data"],target_feat="price")

In [None]:
# Export the target-feature report as HTML using show_html()'s default arguments.
rep2.show_html()