# Feature Engineering Experiments

This notebook documents the feature engineering experiments for the Flight Airfare Prediction project.

## Contents
1. Temporal Feature Extraction
2. Duration Feature Extraction
3. Route Analytics
4. Categorical Encoding Experiments
5. Feature Importance Analysis
6. Lasso-Based Feature Selection
7. Conclusions

**Purpose**: Identify the most predictive features for airfare prediction

## Setup

In [None]:
import sys
from pathlib import Path
# Resolve the repository root. When the kernel's working directory is the
# `notebooks/` folder itself, the root is its parent; otherwise the working
# directory is taken to be the root already. Comparing the final path component
# (instead of substring-matching the whole path) avoids mis-detecting unrelated
# ancestors such as `/home/me/notebooks-archive/project`.
cwd = Path.cwd()
project_root = cwd.parent if cwd.name == 'notebooks' else cwd

# Guarded so that re-running this cell does not keep prepending duplicates.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import warnings
# NOTE(review): with no category/module argument this filter hides *every*
# warning for the rest of the session, including any warnings the estimators
# fitted further down might emit. Consider narrowing it.
warnings.filterwarnings('ignore')

print("✓ Setup complete")

In [None]:
# Read the training set from <project_root>/data/train.csv
train_csv = project_root / "data" / "train.csv"
df = pd.read_csv(train_csv)

print(f"Data shape: {df.shape}")
df.head()

## 1. Temporal Feature Extraction

Extracting time-based patterns from journey dates and departure/arrival times.

In [None]:
# Parse the 'dd/mm/YYYY' journey strings once, then derive every calendar
# feature from the parsed series.
journey_date = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
df['Journey_Date'] = journey_date

calendar_features = {
    'Journey_Day': journey_date.dt.day,
    'Journey_Month': journey_date.dt.month,
    'Journey_Year': journey_date.dt.year,
    'Journey_DayOfWeek': journey_date.dt.dayofweek,  # 0 = Monday ... 6 = Sunday
    'Journey_WeekOfYear': journey_date.dt.isocalendar().week,
}
for feature_name, values in calendar_features.items():
    df[feature_name] = values

# Saturday (5) and Sunday (6) count as weekend
df['IsWeekend'] = df['Journey_DayOfWeek'].isin([5, 6]).astype(int)

preview_cols = ['Date_of_Journey', 'Journey_Day', 'Journey_Month', 'Journey_DayOfWeek', 'IsWeekend']
print("Temporal features created:")
print(df[preview_cols].head())

In [None]:
# Split each 'HH:MM' departure string once and reuse the pieces for both columns
dep_parts = df['Dep_Time'].str.split(':')
df['Dep_Hour'] = dep_parts.str[0].astype(int)
df['Dep_Minute'] = dep_parts.str[1].astype(int)

# Time of day categorization
def categorize_time(hour):
    """Map a departure hour to a coarse time-of-day label.

    Bands are half-open: [5, 12) -> 'Morning', [12, 17) -> 'Afternoon',
    [17, 21) -> 'Evening'. Any other value falls through to 'Night'.
    """
    bands = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in bands:
        if start <= hour < end:
            return label
    return 'Night'

# Label every flight with its departure band
df['Time_Category'] = df['Dep_Hour'].apply(categorize_time)

# One 0/1 indicator column per band: IsMorningFlight, IsAfternoonFlight, ...
for band in ('Morning', 'Afternoon', 'Evening', 'Night'):
    df[f'Is{band}Flight'] = (df['Time_Category'] == band).astype(int)

band_counts = df['Time_Category'].value_counts()
print("\nTime category distribution:")
print(band_counts)

In [None]:
# Experiment: average fare by departure band (left) and by weekday (right)
fig, (ax_band, ax_weekday) = plt.subplots(1, 2, figsize=(12, 5))

mean_price_by_band = df.groupby('Time_Category')['Price'].mean()
mean_price_by_band.plot(
    kind='bar',
    color=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99'],
    ax=ax_band,
)
ax_band.set_title('Average Price by Time of Day')
ax_band.set_ylabel('Average Price (INR)')
plt.setp(ax_band.get_xticklabels(), rotation=45)

mean_price_by_weekday = df.groupby('Journey_DayOfWeek')['Price'].mean()
mean_price_by_weekday.plot(kind='bar', color='steelblue', ax=ax_weekday)
ax_weekday.set_title('Average Price by Day of Week')
ax_weekday.set_ylabel('Average Price (INR)')
ax_weekday.set_xlabel('Day (0=Monday)')

fig.tight_layout()
plt.show()

## 2. Duration Feature Extraction

Parsing flight duration and deriving numeric features.

In [None]:
# Parse duration function
def parse_duration(duration):
    """Convert a duration label such as '2h 30m', '19h' or '45m' to minutes.

    The value is stringified, lower-cased and stripped first. Text that
    contains neither 'h' nor 'm' yields 0. A non-numeric hour or minute part
    raises ValueError from ``int``.
    """
    text = str(duration).lower().strip()

    if 'h' in text:
        pieces = text.split('h')
        hour_part, remainder = pieces[0], pieces[1]
        # Minutes are only read when the text right after the hours has an 'm'
        minute_part = remainder.replace('m', '').strip() if 'm' in remainder else '0'
        return int(hour_part.strip()) * 60 + int(minute_part)

    if 'm' in text:
        return int(text.replace('m', '').strip())

    return 0

# Total flight time in minutes, parsed from the raw 'Duration' strings
df['Duration_Minutes'] = df['Duration'].apply(parse_duration)

# Break the total back down into whole hours and leftover minutes
total_minutes = df['Duration_Minutes']
df['Duration_Hours'] = total_minutes // 60
df['Duration_Mins'] = total_minutes % 60

duration_preview = ['Duration', 'Duration_Minutes', 'Duration_Hours', 'Duration_Mins']
print("Duration features:")
print(df[duration_preview].head(10))

In [None]:
# Experiment: does a longer flight cost more? Scatter plus a least-squares line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Duration_Minutes'], df['Price'], alpha=0.3, s=10)
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Price (INR)')
ax.set_title('Duration vs Price')

# Degree-1 fit: polyfit returns [slope, intercept]
slope, intercept = np.polyfit(df['Duration_Minutes'], df['Price'], 1)
trend = np.poly1d([slope, intercept])
duration_grid = np.linspace(
    df['Duration_Minutes'].min(),
    df['Duration_Minutes'].max(),
    100,
)
ax.plot(duration_grid, trend(duration_grid), 'r--', label=f'Trend (slope={slope:.2f})')
ax.legend()
plt.show()

print(f"\nCorrelation: {df['Duration_Minutes'].corr(df['Price']):.3f}")

## 3. Route Analytics

Extracting features from route information.

In [None]:
# Parse Total Stops
def parse_stops(stops):
    """Convert a ``Total_Stops`` label into an integer stop count.

    Parameters
    ----------
    stops : object
        A value such as 'non-stop', '1 stop', '2 stops', or a missing value.

    Returns
    -------
    int
        0 for missing values, for any label containing 'non-stop', and for
        labels whose first whitespace-separated token is not an integer;
        otherwise that leading integer.
    """
    if pd.isna(stops):
        return 0

    label = str(stops).lower()
    if 'non-stop' in label:
        return 0

    try:
        return int(label.split()[0])
    except (ValueError, IndexError):
        # ValueError: first token is not an integer; IndexError: blank label.
        # Catching only these (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return 0

# Numeric stop count from the 'Total_Stops' labels
df['Total_Stops_Num'] = df['Total_Stops'].apply(parse_stops)

# 1 when the flight has no stops, else 0
df['Is_Direct'] = df['Total_Stops_Num'].eq(0).astype(int)

# Number of ' → '-separated entries in the route string.
# A missing route is filled with '' and therefore counts as 1.
route_text = df['Route'].fillna('')
df['Route_Segments'] = route_text.str.split(' → ').str.len()

# Origin/destination key, e.g. '<Source>_<Destination>'
df['City_Pair'] = df['Source'] + '_' + df['Destination']

route_preview = ['Route', 'Total_Stops_Num', 'Is_Direct', 'Route_Segments']
print("Route features:")
print(df[route_preview].head())

In [None]:
# Experiment: fare distribution and average fare per number of stops

# Summary table first (mean fare and row count per stop count)
stops_price = (
    df.groupby('Total_Stops_Num')['Price']
    .agg(Mean_Price='mean', Count='count')
    .round(2)
)

fig, (ax_box, ax_bar) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: box plot of Price grouped by stop count
df.boxplot(column='Price', by='Total_Stops_Num', ax=ax_box)
ax_box.set_title('Price Distribution by Number of Stops')
ax_box.set_xlabel('Number of Stops')
ax_box.set_ylabel('Price (INR)')
fig.suptitle('')  # blank out the figure-level title

# Right panel: mean fare per stop count
stops_price['Mean_Price'].plot(kind='bar', ax=ax_bar, color='coral')
ax_bar.set_title('Average Price by Number of Stops')
ax_bar.set_xlabel('Number of Stops')
ax_bar.set_ylabel('Average Price (INR)')

fig.tight_layout()
plt.show()

print("\nStops statistics:")
print(stops_price)

## 4. Categorical Encoding Experiments

Testing different encoding strategies for categorical variables.

In [None]:
# Encoding Strategy 1: Label Encoding
# Each category becomes an integer code; the fitted encoders are kept in
# `label_encoders`, keyed by column name.
print("Testing Label Encoding...")

categorical_cols = ['Airline', 'Source', 'Destination', 'Additional_Info']
df_encoded = df.copy()
label_encoders = {}

for col in categorical_cols:
    # Cast to str before fitting so every value is encoded as a text label
    as_text = df_encoded[col].astype(str)
    encoder = LabelEncoder().fit(as_text)
    df_encoded[f'{col}_LabelEnc'] = encoder.transform(as_text)
    label_encoders[col] = encoder
    print(f"{col}: {len(encoder.classes_)} classes")

# One row per distinct (Airline, code) pair
airline_codes = df_encoded[['Airline', 'Airline_LabelEnc']].drop_duplicates()
print("\nLabel Encoding sample:")
print(airline_codes.head(10))

In [None]:
# Encoding Strategy 2: Target (Mean) Encoding
# NOTE(review): the per-category means are computed on the full frame, so each
# row's own Price contributes to its encoded value (target leakage). Fine for
# exploration; fit on training folds only before using this in a model.
print("Testing Target Encoding...")

for col in categorical_cols:
    category_mean_price = df['Price'].groupby(df[col]).mean()
    df_encoded[f'{col}_TargetEnc'] = df[col].map(category_mean_price)

# One row per distinct airline, highest mean fare first
airline_view = ['Airline', 'Airline_LabelEnc', 'Airline_TargetEnc']
target_enc_sample = df_encoded[airline_view].drop_duplicates()
print("\nTarget Encoding sample (Airline):")
print(target_enc_sample.sort_values('Airline_TargetEnc', ascending=False))

In [None]:
# Encoding Strategy 3: Frequency Encoding
# Each category is replaced by its share of rows (value_counts normalised to 1).
print("Testing Frequency Encoding...")

for col in categorical_cols:
    category_share = df[col].value_counts(normalize=True)
    df_encoded[f'{col}_FreqEnc'] = df[col].map(category_share)

# Side-by-side view of all three encodings for Airline, most frequent first
airline_view = ['Airline', 'Airline_LabelEnc', 'Airline_TargetEnc', 'Airline_FreqEnc']
freq_enc_sample = df_encoded[airline_view].drop_duplicates()
print("\nFrequency Encoding sample (Airline):")
print(freq_enc_sample.sort_values('Airline_FreqEnc', ascending=False))

In [None]:
# Compare the encodings by their Pearson correlation with Price
print("\nCorrelation with Price for different encodings:")

# Display name -> column suffix written by the encoding cells
encoding_suffixes = {'Label': 'LabelEnc', 'Target': 'TargetEnc', 'Frequency': 'FreqEnc'}

encoding_correlations = {
    col: {
        name: df_encoded[f'{col}_{suffix}'].corr(df_encoded['Price'])
        for name, suffix in encoding_suffixes.items()
    }
    for col in categorical_cols
}

# Rows = categorical column, columns = encoding strategy
encoding_df = pd.DataFrame(encoding_correlations).T
print(encoding_df.round(4))

# Grouped bar chart of the same table
ax = encoding_df.plot(kind='bar', figsize=(10, 5))
ax.set_title('Encoding Strategy Comparison (Correlation with Price)')
ax.set_ylabel('Correlation')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title='Encoding')
plt.tight_layout()
plt.show()

## 5. Feature Importance Analysis

In [None]:
# Assemble the model inputs for the importance analysis, grouped by origin.
# The concatenation order below is the column order of X.
encoded_features = ['Airline_LabelEnc', 'Source_LabelEnc', 'Destination_LabelEnc']
calendar_columns = ['Journey_Day', 'Journey_Month', 'Journey_DayOfWeek', 'IsWeekend']
departure_columns = ['Dep_Hour', 'Dep_Minute', 'IsMorningFlight', 'IsEveningFlight']
trip_columns = ['Duration_Minutes', 'Total_Stops_Num', 'Is_Direct', 'Route_Segments']

feature_cols = [*encoded_features, *calendar_columns, *departure_columns, *trip_columns]

# Any missing feature value is replaced by 0
X = df_encoded.loc[:, feature_cols].fillna(0)
y = df_encoded['Price']

print(f"Features: {len(feature_cols)}")
print(f"Samples: {X.shape[0]}")

In [None]:
# ExtraTreesRegressor feature importance
print("Computing feature importance with ExtraTreesRegressor...")

etr = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X, y)

# One row per feature, most important first
importance_df = (
    pd.DataFrame({'Feature': feature_cols, 'Importance': etr.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nFeature Importance (ExtraTrees):")
print(importance_df)

# Horizontal bars with the top-ranked feature at the top
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='teal')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance (ExtraTreesRegressor)')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Random Forest feature importance, shown next to the ExtraTrees scores
print("Computing feature importance with RandomForest...")

rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X, y)

importance_columns = {
    'Feature': feature_cols,
    'RF_Importance': rf.feature_importances_,
    'ETR_Importance': etr.feature_importances_,
}
# Ranked by the Random Forest score
rf_importance = pd.DataFrame(importance_columns).sort_values('RF_Importance', ascending=False)

print("\nComparison RF vs ExtraTrees:")
print(rf_importance)

## 6. Lasso-Based Feature Selection

In [None]:
# Standardize the features before fitting Lasso
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Sweep the regularisation strength and count the surviving (non-zero) coefficients
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
results = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=1000, random_state=42).fit(X_scaled, y)
    n_features = np.sum(lasso.coef_ != 0)
    results.append(dict(alpha=alpha, n_features=n_features))
    print(f"Alpha={alpha:.3f}: {n_features} features selected")

# Feature selection at a fixed alpha.
# NOTE(review): this value is set by hand here, not derived from the sweep above.
print("\n" + "="*50)
optimal_alpha = 0.005
selector = SelectFromModel(
    Lasso(alpha=optimal_alpha, max_iter=1000, random_state=42)
).fit(X_scaled, y)

selected_features = [
    name for name, keep in zip(X.columns, selector.get_support()) if keep
]
print(f"\nOptimal Lasso (alpha={optimal_alpha}) selected features:")
for f in selected_features:
    print(f"  ✓ {f}")

In [None]:
# Lasso coefficients visualization
# Fit at `optimal_alpha` (set in the feature-selection cell) rather than
# repeating the literal value, so the model and the plot title cannot drift
# away from the alpha used for the selected-feature list.
lasso = Lasso(alpha=optimal_alpha, max_iter=1000, random_state=42)
lasso.fit(X_scaled, y)

# One row per feature, ordered by absolute coefficient (largest first)
coef_df = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': lasso.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

# Plot: green bars for positive coefficients, red for the rest
plt.figure(figsize=(10, 8))
colors = ['green' if c > 0 else 'red' for c in coef_df['Coefficient']]
plt.barh(coef_df['Feature'], coef_df['Coefficient'], color=colors)
plt.xlabel('Coefficient')
plt.title(f'Lasso Feature Coefficients (alpha={optimal_alpha})')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nLasso Coefficients:")
print(coef_df)

## 7. Conclusions

### Top Features (by importance):
1. **Duration_Minutes** - Strongest predictor
2. **Total_Stops_Num** - High correlation with price
3. **Airline** - Brand premium effect
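4. **Route/City Pair** - Route-specific pricing (note: `City_Pair` is created above but is not included in `feature_cols`, so its importance is not measured in this notebook)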
5. **Dep_Hour** - Time of day matters

### Encoding Recommendations:
- Use **Label Encoding** for tree-based models
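- Use **Target Encoding** for linear models (compute the category means on training folds only; means computed on the full dataset leak the target)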
- Avoid One-Hot for high cardinality (Airline has 12 classes)

### Feature Engineering Wins:
- Time-based flags (IsWeekend, IsMorningFlight) add predictive value
- Route_Segments captures complexity
- Duration_Minutes >> raw Duration string