# Merging datasets

To move forward with the analysis we need to merge the datasets selected. 

In [2]:
# Import libs
import pandas as pd
import numpy as np

In [4]:
# Read database 1 and 2
clean_f1 = pd.read_csv("https://raw.githubusercontent.com/r41ss4/rennes_da/refs/heads/main/data/cleaned/clean_dataf1.csv")
clean_f2 = pd.read_csv("https://raw.githubusercontent.com/r41ss4/rennes_da/refs/heads/main/data/cleaned/clean_dataf2.csv")

In [6]:
# Review dataset
clean_f1.head(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Sourke
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,Banglore
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,Kolkata
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,Delhi
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,non-stop,No info,6218,Kolkata
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,Banglore


In [7]:
# Review dataset
clean_f2.head(5)

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [8]:
# Normalice names in common columns among datasets
clean_f1 = clean_f1.rename(columns={'Airline': 'airline', 'Price': 'price', 
                                    'Destination': 'destination_city', 'Source': 'source_city',
                                    'Duration': 'duration',  'Total_Stops': 'specific_stops'})
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,Dep_Time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,Banglore
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,Kolkata
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,Delhi
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,non-stop,No info,6218,Kolkata
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,Banglore


In [9]:
# Review how to normalice stops ('specific_stops') column
clean_f1['specific_stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [10]:
# Review how to normalice stops column
clean_f2['stops'].unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

### Normalizing 'stops' column
#### Two columns and mode
This column has different possible values in both datasets, in clean_f2 it can only be 'zero', 'one', 'two_or_more', while in clean_f1 it can be 'non-stop', '2 stops', '1 stop', '3 stops', '4 stops'. It is forcing both columns to completely match means either altering the values of clean_f2 with the mode or losing details from clean_f1. Therefore, it is possible to conserve both by creating two columns, 'specific_stops' and 'general_stops'. 
The idea is to keep the specific stops from clean_f1 while filling the missing info with the mode in the column 'specific_stops', while all rows with values equal to '2 stops', '3 stops' or '4 stops' in clean_f1 will get the value of 'two_or_more' in the column 'general_stops'

In [12]:
# Create general_stops column in clean_f1
clean_f1['general_stops'] = clean_f1['specific_stops'].replace({
    'non-stop': 'zero',
    '1 stop': 'one',
    '2 stops': 'two_or_more',
    '3 stops': 'two_or_more',
    '4 stops': 'two_or_more'
})

In [13]:
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,Dep_Time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,Banglore,one


In [14]:
# Correct clean_f2 column name
clean_f2 = clean_f2.rename(columns={'stops': 'general_stops'})
# Review dataset
clean_f2.head(5)

Unnamed: 0,airline,flight,source_city,departure_time,general_stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [15]:
# Create a new column specific_stops in clean_f2
clean_f2['specific_stops'] = clean_f2['general_stops']
# Review dataset
clean_f2.head(5)

Unnamed: 0,airline,flight,source_city,departure_time,general_stops,arrival_time,destination_city,class,duration,days_left,price,specific_stops
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953,zero
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953,zero
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956,zero
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955,zero
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955,zero


In [16]:
# Calculate the mode of specific stops for each route in clean_f1
mode_stops = clean_f1.groupby(['source_city', 'destination_city'])['specific_stops'].agg(lambda x: x.mode().iloc[0])
mode_stops

source_city  destination_city
Banglore     Delhi               non-stop
Chennai      Kolkata             non-stop
Delhi        Cochin                1 stop
Kolkata      Banglore              1 stop
Mumbai       Hyderabad           non-stop
Name: specific_stops, dtype: object

In [17]:
# Review how to normalice stops column
clean_f2['general_stops'].unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [18]:
clean_f1

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,Dep_Time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,Banglore,one
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11746,Air Asia,27/05/2019,Banglore,Delhi,BLR ? DEL,23:55,02:45 28 May,2h 50m,non-stop,No info,3383,Banglore,zero
11747,Air India,1/06/2019,Delhi,Cochin,DEL ? GOI ? BOM ? COK,22:00,19:15 02 Jun,21h 15m,2 stops,No info,10441,Delhi,two_or_more
11748,Jet Airways,24/05/2019,Kolkata,Banglore,CCU ? BOM ? BLR,14:05,22:35,8h 30m,1 stop,In-flight meal not included,10844,Kolkata,one
11749,Jet Airways,6/05/2019,Kolkata,Banglore,CCU ? BOM ? BLR,18:55,10:05 07 May,15h 10m,1 stop,No info,13584,Kolkata,one


#### Problems with stops and mode
As the error shown above and that the mode_stops has no values different from non-stop and 1 stop, it reflex that there are no relevant flights reflected in clean_f2 with greater value mode different that 'non-stop' and '1 stop', which means it is un necessary to use the mode to fulfill specific_stops in clean_f2. 

In [20]:
clean_f2['specific_stops'] = clean_f2['general_stops'].replace({
    'zero': 'non-stop',
    'one': '1 stop',
    'two_or_more': '2 stops'  # Default to 2 stops for simplicity
})
clean_f2

Unnamed: 0,airline,flight,source_city,departure_time,general_stops,arrival_time,destination_city,class,duration,days_left,price,specific_stops
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953,non-stop
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953,non-stop
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956,non-stop
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955,non-stop
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955,non-stop
...,...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265,1 stop
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105,1 stop
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099,1 stop
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585,1 stop


In [21]:
# Review how to normalice stops column
clean_f2['specific_stops'].unique()

array(['non-stop', '1 stop', '2 stops'], dtype=object)

### Normalizing 'duration' column
This column in clean_f1 is set as an object, having hours with h and minutes with m, while in clean_f2 it is a float numbers, where the decimals represent the minutes. Therefore, it is more practical to change the duration in clean_f1 into a similar format to clean_f2 

In [23]:
# Modify duration in clean
clean_f1['duration'] = clean_f1['duration'].str.replace('h ', '.')
clean_f1['duration'] = clean_f1['duration'].str.replace('h', '')
clean_f1['duration'] = clean_f1['duration'].str.replace('m', '')
# Review changes 
clean_f1.head(10)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,Dep_Time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,22:20,01:10 22 Mar,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,16:50,21:35,4.45,1 stop,No info,13302,Banglore,one
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU ? BLR,09:00,11:25,2.25,non-stop,No info,3873,Kolkata,zero
6,Airline,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,18:55,10:25 13 Mar,15.3,1 stop,In-flight meal not included,11087,Banglore,one
7,Jet Airways,01/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,08:00,05:05 02 Mar,21.5,1 stop,No info,22270,Banglore,one
8,Jet Airways,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,08:55,10:25 13 Mar,25.3,1 stop,In-flight meal not included,11087,Banglore,one
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL ? BOM ? COK,11:25,19:15,7.5,1 stop,No info,8625,Delhi,one


In [24]:
# Turn duration into float
clean_f1['duration'] = clean_f1['duration'].astype(float)

In [25]:
# Rewiew changes
clean_f1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11751 entries, 0 to 11750
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           11751 non-null  object 
 1   Date_of_Journey   11751 non-null  object 
 2   source_city       11751 non-null  object 
 3   destination_city  11751 non-null  object 
 4   Route             11751 non-null  object 
 5   Dep_Time          11751 non-null  object 
 6   Arrival_Time      11751 non-null  object 
 7   duration          11751 non-null  float64
 8   specific_stops    11751 non-null  object 
 9   Additional_Info   11751 non-null  object 
 10  price             11751 non-null  int64  
 11  Sourke            11751 non-null  object 
 12  general_stops     11751 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.2+ MB


### Normalizing 'Dep_Time' column

In [27]:
# Function to categorize time into time of day
def categorize_dep(time):
    # Split the time string to get the hour part
    hour = int(time.split(':')[0])
    
    # Categorize the hour into time of day
    if 0 <= hour < 6:
        return 'Early_Morning'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 24:
        return 'Evening'
    else:
        return time  # Return the original time if it doesn't match any category

In [28]:
# Apply the categorization to the 'Dep_Time' on clean_f1
clean_f1['Dep_Time'] = clean_f1['Dep_Time'].apply(categorize_dep)
# Review changes 
clean_f1.head(10)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,Dep_Time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,Evening,01:10 22 Mar,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,Early_Morning,13:15,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,Morning,04:25 10 Jun,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,Evening,23:30,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,Afternoon,21:35,4.45,1 stop,No info,13302,Banglore,one
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU ? BLR,Morning,11:25,2.25,non-stop,No info,3873,Kolkata,zero
6,Airline,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Evening,10:25 13 Mar,15.3,1 stop,In-flight meal not included,11087,Banglore,one
7,Jet Airways,01/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Morning,05:05 02 Mar,21.5,1 stop,No info,22270,Banglore,one
8,Jet Airways,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Morning,10:25 13 Mar,25.3,1 stop,In-flight meal not included,11087,Banglore,one
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL ? BOM ? COK,Morning,19:15,7.5,1 stop,No info,8625,Delhi,one


In [29]:
# Change name to match clean_f2 
clean_f1 = clean_f1.rename(columns={'Dep_Time': 'departure_time'})
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,departure_time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,Evening,01:10 22 Mar,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,Early_Morning,13:15,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,Morning,04:25 10 Jun,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,Evening,23:30,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,Afternoon,21:35,4.45,1 stop,No info,13302,Banglore,one


### Normalizing 'Arrival_Time' column

In [31]:
# Function to categorize time into time of day and remove day and month information
def categorize_arriv(time):
    # Extract the time part (ignore the day and month if present)
    time_part = time.split()[0]
    hour = int(time_part.split(':')[0])
    
    # Categorize the hour into time of day
    if 0 <= hour < 6:
        return 'Early_Morning'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 24:
        return 'Evening'
    else:
        return time  # Return the original time if it doesn't match any category

In [32]:
# Apply the categorization to the 'Arrival_Time' column in df1
clean_f1['Arrival_Time'] = clean_f1['Arrival_Time'].apply(categorize_arriv)
# Review changes 
clean_f1.head(10)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,departure_time,Arrival_Time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU ? BLR,Morning,Morning,2.25,non-stop,No info,3873,Kolkata,zero
6,Airline,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Evening,Morning,15.3,1 stop,In-flight meal not included,11087,Banglore,one
7,Jet Airways,01/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Morning,Early_Morning,21.5,1 stop,No info,22270,Banglore,one
8,Jet Airways,12/03/2019,Banglore,Delhi,BLR ? BOM ? DEL,Morning,Morning,25.3,1 stop,In-flight meal not included,11087,Banglore,one
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL ? BOM ? COK,Morning,Evening,7.5,1 stop,No info,8625,Delhi,one


In [33]:
# Change name to match clean_f2 
clean_f1 = clean_f1.rename(columns={'Arrival_Time': 'arrival_time'})
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,Route,departure_time,arrival_time,duration,specific_stops,Additional_Info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,BLR ? DEL,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,BLR ? NAG ? DEL,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one


### Normalizing 'Route' column
The column 'Route' from 'clean_f1' represents the different scales a trip could take, and it does not have any equivalent information in 'cleanf_f2'. Normally it could be considered the need to extract source_city and destination_cito from 'Route', but both columns are already available and similar in both datasets. 
Moreover, the 'Route' relevancy is correlated to the stops as if the route only has two cities there are non-stops. Since the details of the stops can not be provided of the datasets, there are only a few options left and the usefulness of the column is limited. Therefore, the column can be dropped. 

In [35]:
# Drop the 'Route' column from 'clean_f1'
clean_f1 = clean_f1.drop(columns=['Route'])

In [36]:
# Rewiew changes
clean_f1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11751 entries, 0 to 11750
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           11751 non-null  object 
 1   Date_of_Journey   11751 non-null  object 
 2   source_city       11751 non-null  object 
 3   destination_city  11751 non-null  object 
 4   departure_time    11751 non-null  object 
 5   arrival_time      11751 non-null  object 
 6   duration          11751 non-null  float64
 7   specific_stops    11751 non-null  object 
 8   Additional_Info   11751 non-null  object 
 9   price             11751 non-null  int64  
 10  Sourke            11751 non-null  object 
 11  general_stops     11751 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 1.1+ MB


### Normalizing 'Additional_Info' column
In the dataset 'clean_f1', the column 'Additional_Info' has multiple values, among them 'No info' in the rows that lack additional information. In consequence, it makes sense to add a column with the same name in the dataset 'clean_f2' with the values 'No info'

In [38]:
# Change name in clean_f1 
clean_f1 = clean_f1.rename(columns={'Additional_Info': 'add_info'})
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops
0,IndiGo,24/03/2019,Banglore,Delhi,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero
1,Air India,1/05/2019,Kolkata,Banglore,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more
2,Jet Airways,9/06/2019,Delhi,Cochin,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more
3,IndiGo,12/05/2019,Kolkata,Banglore,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero
4,Airline,01/03/2019,Banglore,Delhi,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one


In [39]:
# Create a column named 'Additional_Info' filled with the values 'No info' in 'clean_f2'
clean_f2['add_info'] = 'No info'
# Review dataset
clean_f2.head(5)

Unnamed: 0,airline,flight,source_city,departure_time,general_stops,arrival_time,destination_city,class,duration,days_left,price,specific_stops,add_info
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953,non-stop,No info
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953,non-stop,No info
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956,non-stop,No info
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955,non-stop,No info
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955,non-stop,No info


### Normalizing 'class' column
In the dataset 'clean_f2', the column 'class' has multiple values. However, there is no similar column in 'clean_f1', even though it is a useful piece of information. Since another column ('add_info') has used the value 'No info', it is logical to use it in a new column in 'clean_f1' to avoid having to drop the column. 

In [41]:
# Change name in clean_f1 
clean_f1['class'] = 'No info'
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,Date_of_Journey,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops,class
0,IndiGo,24/03/2019,Banglore,Delhi,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero,No info
1,Air India,1/05/2019,Kolkata,Banglore,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more,No info
2,Jet Airways,9/06/2019,Delhi,Cochin,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more,No info
3,IndiGo,12/05/2019,Kolkata,Banglore,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero,No info
4,Airline,01/03/2019,Banglore,Delhi,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one,No info


### Normalizing 'Date_of_Journey', 'days_left' and 'flight' column
It is not possible to normalize these columns, as they are missing in one of the datasets, and using the mode would not make sense, since it would be taming with the information. Their information has been reviewed and it is not relevant for further analysis. Therefore, such columns will be drop. 

In [43]:
# Drop the 'Date_of_Journey' column
clean_f1 = clean_f1.drop(columns=['Date_of_Journey'])

In [44]:
# Drop the 'days_left' and 'flight' columns
clean_f2 = clean_f2.drop(columns=['days_left'])
clean_f2 = clean_f2.drop(columns=['flight'])

In [45]:
# Review dataset
clean_f1.head(5)

Unnamed: 0,airline,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops,class
0,IndiGo,Banglore,Delhi,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero,No info
1,Air India,Kolkata,Banglore,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more,No info
2,Jet Airways,Delhi,Cochin,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more,No info
3,IndiGo,Kolkata,Banglore,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero,No info
4,Airline,Banglore,Delhi,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one,No info


In [46]:
# Review dataset
clean_f2.head(5)

Unnamed: 0,airline,source_city,departure_time,general_stops,arrival_time,destination_city,class,duration,price,specific_stops,add_info
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,5953,non-stop,No info
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,5953,non-stop,No info
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,5956,non-stop,No info
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,5955,non-stop,No info
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,5955,non-stop,No info


## Merge the Datasets
Finally, we can merge the datasets. Since some columns are only present in one of the datasets, we can use the combine_first method to merge them.

In [48]:
clean_f2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   source_city       300153 non-null  object 
 2   departure_time    300153 non-null  object 
 3   general_stops     300153 non-null  object 
 4   arrival_time      300153 non-null  object 
 5   destination_city  300153 non-null  object 
 6   class             300153 non-null  object 
 7   duration          300153 non-null  float64
 8   price             300153 non-null  int64  
 9   specific_stops    300153 non-null  object 
 10  add_info          300153 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 25.2+ MB


In [49]:
clean_f1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11751 entries, 0 to 11750
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           11751 non-null  object 
 1   source_city       11751 non-null  object 
 2   destination_city  11751 non-null  object 
 3   departure_time    11751 non-null  object 
 4   arrival_time      11751 non-null  object 
 5   duration          11751 non-null  float64
 6   specific_stops    11751 non-null  object 
 7   add_info          11751 non-null  object 
 8   price             11751 non-null  int64  
 9   Sourke            11751 non-null  object 
 10  general_stops     11751 non-null  object 
 11  class             11751 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 1.1+ MB


In [50]:
# Merge the datasets
merged_df = pd.concat([clean_f1, clean_f2], axis=0, ignore_index=True)

In [51]:
# Review dataset
merged_df.head(5)

Unnamed: 0,airline,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops,class
0,IndiGo,Banglore,Delhi,Evening,Early_Morning,2.5,non-stop,No info,3897,Banglore,zero,No info
1,Air India,Kolkata,Banglore,Early_Morning,Afternoon,7.25,2 stops,No info,7662,Kolkata,two_or_more,No info
2,Jet Airways,Delhi,Cochin,Morning,Early_Morning,19.0,2 stops,No info,13882,Delhi,two_or_more,No info
3,IndiGo,Kolkata,Banglore,Evening,Evening,5.25,non-stop,No info,6218,Kolkata,zero,No info
4,Airline,Banglore,Delhi,Afternoon,Evening,4.45,1 stop,No info,13302,Banglore,one,No info


In [52]:
# Review the data and how many rows have missing information in each columns
merged_df.isna().sum()

airline                  0
source_city              0
destination_city         0
departure_time           0
arrival_time             0
duration                 0
specific_stops           0
add_info                 0
price                    0
Sourke              300153
general_stops            0
class                    0
dtype: int64

In [54]:
merged_df['airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'Airline', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet', 'AirAsia',
       'GO_FIRST', 'Indigo', 'Air_India'], dtype=object)

In [55]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Indigo', 'IndiGo')

In [56]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Air Asia', 'AirAsia')

In [57]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Air India', 'AirIndia')

In [58]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Air_India', 'AirIndia')

In [59]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Airline', 'Unkown Airline')

In [60]:
merged_df[merged_df['airline'] == 'Jet Airways Business']

Unnamed: 0,airline,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops,class
657,Jet Airways Business,Banglore,Delhi,Early_Morning,Morning,5.0,1 stop,No info,52229,Banglore,one,No info
2924,Jet Airways Business,Banglore,Delhi,Early_Morning,Morning,5.4,1 stop,Business class,79512,Banglore,one,No info
7351,Jet Airways Business,Delhi,Cochin,Evening,Early_Morning,8.2,2 stops,No info,46490,Delhi,two_or_more,No info
9715,Jet Airways Business,Delhi,Cochin,Evening,Early_Morning,8.2,2 stops,No info,52285,Delhi,two_or_more,No info
10364,Jet Airways Business,Banglore,Delhi,Morning,Afternoon,4.4,1 stop,Business class,57209,Banglore,one,No info


In [61]:
# Fill 'class' column with 'Business' in rows where 'airline' says 'Jet Airways Business'
merged_df.loc[merged_df['airline'] == 'Jet Airways Business', 'class'] = 'Business'

In [62]:
# Review changes
merged_df[merged_df['airline'] == 'Jet Airways Business']

Unnamed: 0,airline,source_city,destination_city,departure_time,arrival_time,duration,specific_stops,add_info,price,Sourke,general_stops,class
657,Jet Airways Business,Banglore,Delhi,Early_Morning,Morning,5.0,1 stop,No info,52229,Banglore,one,Business
2924,Jet Airways Business,Banglore,Delhi,Early_Morning,Morning,5.4,1 stop,Business class,79512,Banglore,one,Business
7351,Jet Airways Business,Delhi,Cochin,Evening,Early_Morning,8.2,2 stops,No info,46490,Delhi,two_or_more,Business
9715,Jet Airways Business,Delhi,Cochin,Evening,Early_Morning,8.2,2 stops,No info,52285,Delhi,two_or_more,Business
10364,Jet Airways Business,Banglore,Delhi,Morning,Afternoon,4.4,1 stop,Business class,57209,Banglore,one,Business


In [63]:
# Correct spelling errors in airline column for rows that have similar name
merged_df['airline'] = merged_df['airline'].replace('Jet Airways Business', 'Jet Airways')

In [64]:
merged_df['airline'].unique()

array(['IndiGo', 'AirIndia', 'Jet Airways', 'Unkown Airline', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'AirAsia',
       'Vistara Premium economy', 'Multiple carriers Premium economy',
       'Trujet', 'GO_FIRST'], dtype=object)

In [65]:
# Review the data and how many rows have missing information in each columns
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311904 entries, 0 to 311903
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           311904 non-null  object 
 1   source_city       311904 non-null  object 
 2   destination_city  311904 non-null  object 
 3   departure_time    311904 non-null  object 
 4   arrival_time      311904 non-null  object 
 5   duration          311904 non-null  float64
 6   specific_stops    311904 non-null  object 
 7   add_info          311904 non-null  object 
 8   price             311904 non-null  int64  
 9   Sourke            11751 non-null   object 
 10  general_stops     311904 non-null  object 
 11  class             311904 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 28.6+ MB


In [66]:
# Save the merged dataset to a new CSV file
merged_df.to_csv('merged_df.csv', index=False)