# Loading Datasets

In [None]:
import pandas as pd
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns


# Read the training workbook into a DataFrame and peek at five random rows.
df = pd.read_excel(io="data/Data_Train.xlsx", engine="openpyxl")
df.sample(n=5)

# Data inspection

In [None]:
df.shape # (rows, columns) of the DataFrame

In [None]:
df.isnull().any() # per column: True if the column contains at least one null value, False otherwise

In [None]:
df.isna().sum() # per column: total number of missing (null) values

In [None]:
# Percentage (0-100) of missing values in each column.
# isna().mean() is the per-column fraction of nulls (same as sum() / len(df));
# scaling by 100 makes the displayed numbers actual percentages.
df.isna().mean() * 100

In [None]:
# Only two rows contain missing values, so remove them outright, then
# renumber the index so it is contiguous again after the removal.
df = df.dropna().reset_index(drop=True)

In [None]:
df.isnull().any() # re-check after dropping rows: every column should now report False

In [None]:
df.dtypes # data type of each column

In [None]:
df.info() # summary of the DataFrame: columns, non-null counts and dtypes

In [None]:
df.describe() # summary statistics for the numerical columns

In [None]:
df.describe(include="object") # summary of the object-dtype (categorical/text) columns

In [None]:
df.sample(2) # two random rows as a quick look at the cleaned data

In [None]:
# Break every Date_of_Journey string on "/" (e.g. ["9", "05", "2019"]) and
# keep the first piece as the day and the second piece as the month.
date_parts = [value.split("/") for value in df["Date_of_Journey"]]
day = [parts[0] for parts in date_parts]
month = [parts[1] for parts in date_parts]

In [None]:
df["Day"] = day # inserting day columns 
df["Month"] = month # inserting month columns in the dataframe

In [None]:
df.head()

In [None]:
df.drop("Date_of_Journey", axis=1, inplace=True) # deleting Date of journey columns form the dataframe

In [None]:
df.head(2)

In [None]:
# Split each departure time (Dep_Time) on ":" and keep the first piece as the
# hour and the second as the minute; these feed the Dep_Hours / Dep_Minutes
# columns.
dep_parts = [value.split(":") for value in df["Dep_Time"]]
dep_hrs = [parts[0] for parts in dep_parts]
dep_min = [parts[1] for parts in dep_parts]

In [None]:
# Adding columns Dep_Hours and Dep_Minute in the DataFrame
df["Dep_Hours"] = dep_hrs
df["Dep_Minutes"] = dep_min

In [None]:
df.head(2)

In [None]:
# Deleting Dep_Time columns from the dataframe
df.drop("Dep_Time", axis="columns", inplace=True)
df.head(2)

In [None]:
df.dtypes

In [None]:
# Extracting hour and minute from the Arrival time (Arrival_Time) from the dataframe and dumping it in new columns (Arrival_Hour, Arrival_Minute)
df["Arrival_Hour"] = pd.DatetimeIndex(df["Arrival_Time"]).hour
df["Arrival_Minute"] = pd.DatetimeIndex(df["Arrival_Time"]).minute
df.drop("Arrival_Time", axis=1, inplace=True)

In [None]:
df.head()

In [None]:
# Parse the Duration strings into pandas timedelta values so that hours and
# minutes can be derived numerically instead of from the text.
td = pd.to_timedelta(df["Duration"])
td

In [None]:
# Split every duration in the timedelta Series (td) into whole hours and the
# leftover minutes.
#
# total_seconds() is used instead of the per-element `.seconds` attribute:
# `.seconds` only holds the sub-day remainder and silently discards the days
# component, so any duration of 24 hours or more was reported a full day (or
# more) too short.
total_minutes = td.dt.total_seconds() // 60  # whole minutes in each duration
dur_hr = (total_minutes // 60).tolist()  # whole hours (floor division)
dur_min = (total_minutes % 60).tolist()  # minutes left over after the hours

dur_min[:5]

In [None]:
# Adding new column in the dataframe Duration_Hour and Duration_Minute
df["Duration_Hour"] = dur_hr
df["Duration_Minute"] = dur_min

In [None]:
# converting datatype of the columns in the int format using astype method
df[["Day", "Month", "Dep_Hours", "Dep_Minutes", "Duration_Hour", "Duration_Minute"]] = df[["Day", "Month", "Dep_Hours", "Dep_Minutes", "Duration_Hour", "Duration_Minute"]].astype("int32")

In [None]:
df.head(2)

In [None]:
df.dtypes

In [None]:
# dropping Duration columns from the dataframe as hour and minute are already extracted from the duration
df.drop("Duration", axis=1, inplace=True)

In [None]:
df.head(2)

In [None]:
# Working with the duration column
l_route = []
for item in df["Route"]:
    list = item.split("→")
    l_route.append(list)

In [None]:
df["Route"] = l_route
df.head(2)

In [None]:
# Working with Total stops columns in the dataframe
df["Total_Stops"].unique()

In [None]:
# mapping Total stops value with numerical values
stop_mapping = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
df["Total_Stops"] = df["Total_Stops"].str.strip().map(stop_mapping)
df.head(2)

In [None]:
df.Additional_Info.unique()

In [None]:
plt.figure(figsize=(14, 8))
sns.countplot(x="Additional_Info", data=df)

In [None]:
df.groupby(["Total_Stops"], as_index=False).Price.mean()

In [None]:
sns.countplot(x=df["Total_Stops"])