In [3]:
import pandas as pd

# Load the dataset
file_path = 'makemytrip.csv'
makemytrip_data = pd.read_csv(file_path)

# Attempting to load the dataset again with error_bad_lines set to False
try:
    makemytrip_data = pd.read_csv(file_path, error_bad_lines=False)
    dataset_info = makemytrip_data.info()
    dataset_head = makemytrip_data.head()
except Exception as e:
    error_message = str(e)
    dataset_info, dataset_head, error_message = None, None, error_message

dataset_info, dataset_head, error_message if error_message else "Loaded successfully"

ParserError: Error tokenizing data. C error: Expected 22 fields in line 22, saw 25


In [2]:
# Attempting to load the dataset again with error_bad_lines set to False
try:
    makemytrip_data = pd.read_csv(file_path, error_bad_lines=False)
    dataset_info = makemytrip_data.info()
    dataset_head = makemytrip_data.head()
except Exception as e:
    error_message = str(e)
    dataset_info, dataset_head, error_message = None, None, error_message

dataset_info, dataset_head, error_message if error_message else "Loaded successfully"



  makemytrip_data = pd.read_csv(file_path, error_bad_lines=False)


(None, None, "[Errno 2] No such file or directory: 'makemytrip.csv.csv'")

In [3]:
# Step 1: Data Cleaning
# Dropping columns with a high percentage of missing values
columns_to_drop = ['Flight Stops', 'Meals', 'Initial Payment For Booking', 'Date Change Rules']
makemytrip_data_cleaned = makemytrip_data.drop(columns=columns_to_drop)

# Step 2: Data Type Correction
# Converting 'Crawl Timestamp' and 'Travel Date' to datetime format
makemytrip_data_cleaned['Crawl Timestamp'] = pd.to_datetime(makemytrip_data_cleaned['Crawl Timestamp'], errors='coerce')
makemytrip_data_cleaned['Travel Date'] = pd.to_datetime(makemytrip_data_cleaned['Travel Date'], errors='coerce')

# Displaying basic information and first few rows of the cleaned data
cleaned_data_info = makemytrip_data_cleaned.info()
cleaned_data_head = makemytrip_data_cleaned.head()

cleaned_data_info, cleaned_data_head

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

# Setting up the plotting style
sns.set(style="whitegrid")

# 1. Package Type Distribution
plt.figure(figsize=(12, 6))
sns.countplot(y='Package Type', data=makemytrip_data_cleaned, order = makemytrip_data_cleaned['Package Type'].value_counts().index)
plt.title('Distribution of Package Types')
plt.xlabel('Count')
plt.ylabel('Package Type')
plt.tight_layout()
plt.show()

# 2. Top Destinations Analysis
top_destinations = makemytrip_data_cleaned['Destination'].value_counts().head(10) # top 10 destinations
plt.figure(figsize=(12, 6))
sns.barplot(x=top_destinations.values, y=top_destinations.index)
plt.title('Top 10 Destinations')
plt.xlabel('Count')
plt.ylabel('Destination')
plt.tight_layout()
plt.show()

# 3. Price Analysis
plt.figure(figsize=(12, 6))
sns.histplot(makemytrip_data_cleaned['Price Per Two Persons'], bins=50, kde=True)
plt.title('Distribution of Price Per Two Persons')
plt.xlabel('Price Per Two Persons')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
sns.histplot(makemytrip_data_cleaned['Per Person Price'], bins=50, kde=True)
plt.title('Distribution of Per Person Price')
plt.xlabel('Per Person Price')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [5]:
# Analyzing the distribution of different package types
package_type_distribution = makemytrip_data_cleaned['Package Type'].value_counts()

package_type_distribution

In [6]:
# Analyzing the most common destinations
top_destinations = makemytrip_data_cleaned['Destination'].value_counts().head(10) # Top 10 destinations

top_destinations

In [7]:
# Descriptive statistics for 'Price Per Two Persons' and 'Per Person Price'
price_two_persons_stats = makemytrip_data_cleaned['Price Per Two Persons'].describe()
per_person_price_stats = makemytrip_data_cleaned['Per Person Price'].describe()

price_two_persons_stats, per_person_price_stats

In [8]:
# Dropping irrelevant columns: 'Crawl Timestamp', 'Page Url', 'Uniq Id'
makemytrip_data_cleaned = makemytrip_data.drop(columns=['Crawl Timestamp', 'Page Url', 'Uniq Id'])

# Displaying the updated dataframe structure
makemytrip_data_cleaned.info()

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Splitting the 'Destination' column into a list of destinations
makemytrip_data_cleaned['Destination'] = makemytrip_data_cleaned['Destination'].apply(lambda x: x.split('|'))

# Applying one-hot encoding
mlb = MultiLabelBinarizer()
destination_encoded = mlb.fit_transform(makemytrip_data_cleaned['Destination'])

# Creating a DataFrame from the encoded array
destination_encoded_df = pd.DataFrame(destination_encoded, columns=mlb.classes_)

# Displaying the shape and first few rows of the new DataFrame
destination_encoded_df_shape = destination_encoded_df.shape
destination_encoded_df_head = destination_encoded_df.head()

destination_encoded_df_shape, destination_encoded_df_head

In [10]:
# Filling missing values for specific columns

# For 'Hotel Details' and 'Airline', we'll replace missing values with 'Not Specified'
makemytrip_data_cleaned['Hotel Details'].fillna('Not Specified', inplace=True)
makemytrip_data_cleaned['Airline'].fillna('Not Specified', inplace=True)

# For 'Sightseeing Places Covered', we'll also use 'Not Specified'
makemytrip_data_cleaned['Sightseeing Places Covered'].fillna('Not Specified', inplace=True)

# For 'Cancellation Rules', using 'Not Specified' as well
makemytrip_data_cleaned['Cancellation Rules'].fillna('Not Specified', inplace=True)

# Checking the dataset after filling missing values
makemytrip_data_cleaned.isnull().sum()  # Displaying the count of missing values for each column after filling missing values

In [11]:
# Dropping columns with all missing values: 'Meals', 'Flight Stops', 'Initial Payment For Booking', 'Date Change Rules'
columns_to_drop = ['Meals', 'Flight Stops', 'Initial Payment For Booking', 'Date Change Rules']
makemytrip_data_cleaned.drop(columns=columns_to_drop, inplace=True)

# Checking the updated structure of the dataset
updated_dataset_info = makemytrip_data_cleaned.info()
updated_dataset_info

In [12]:
# Dropping the 'Onwards Return Flight Time' column
makemytrip_data_cleaned.drop(columns=['Onwards Return Flight Time'], inplace=True)

# Checking the updated structure of the dataset
updated_dataset_info_after_drop = makemytrip_data_cleaned.info()
updated_dataset_info_after_drop

In [13]:
# Re-applying one-hot encoding for the 'Destination' column after dataset updates
destination_encoded_again = mlb.transform(makemytrip_data_cleaned['Destination'])

# Creating a DataFrame from the encoded array
destination_encoded_df_again = pd.DataFrame(destination_encoded_again, columns=mlb.classes_)

# Displaying the shape and first few rows of the new DataFrame
destination_encoded_df_shape_again = destination_encoded_df_again.shape
destination_encoded_df_head_again = destination_encoded_df_again.head()

destination_encoded_df_shape_again, destination_encoded_df_head_again

In [14]:
# Recreating the MultiLabelBinarizer object and reapplying one-hot encoding for 'Destination'
mlb_recreated = MultiLabelBinarizer()
destination_encoded_recreated = mlb_recreated.fit_transform(makemytrip_data_cleaned['Destination'])

# Creating a DataFrame from the newly encoded array
destination_encoded_df_recreated = pd.DataFrame(destination_encoded_recreated, columns=mlb_recreated.classes_)

# Displaying the shape and first few rows of the new DataFrame
destination_encoded_df_shape_recreated = destination_encoded_df_recreated.shape
destination_encoded_df_head_recreated = destination_encoded_df_recreated.head()

destination_encoded_df_shape_recreated, destination_encoded_df_head_recreated

In [15]:
# Saving the current state of the notebook as an .ipynb file

notebook_name = "makemytrip_data_analysis.ipynb"

# Writing current notebook to file
%notebook "/mnt/data/{notebook_name}"

# Providing the link to the user
notebook_link = f"/mnt/data/{notebook_name}"
notebook_link