# PART B: PREDICTIVE MODELING with cleaned_airline_reviews.csv

In [None]:
# Import Additional Libraries for ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Pre-cleaned Dataset for Modeling

In [None]:
# Read the pre-cleaned review table from the Kaggle input directory.
reviews_csv_path = '/kaggle/input/cleaned-airline-reviews/cleaned_airline_reviews.csv'
df_reviews = pd.read_csv(reviews_csv_path)
print("cleaned_airline_reviews.csv loaded successfully!")

# Step 2: Feature Engineering

In [None]:
# Binary target: 1 when the review's Rating is at least 5, otherwise 0.
is_satisfied = df_reviews['Rating'].ge(5)
df_reviews['satisfaction'] = is_satisfied.astype(int)
print("Target variable 'satisfaction' created.")

# Show the class balance as proportions rather than raw counts.
class_balance = df_reviews['satisfaction'].value_counts(normalize=True)
print(class_balance)

# Step 3: Feature Selection for Satisfaction Prediction
To build an effective and interpretable model, we have carefully selected a subset of features from the cleaned_airline_reviews.csv dataset. Our goal is to use features that are logically connected to a passenger's experience without introducing data leakage or unnecessary complexity.

The following features were selected:
* **Sentiment_Score (Numerical):** This is the most critical feature. It is a direct quantitative measure of the opinion expressed in the review text. A higher score is expected to correlate strongly with passenger satisfaction.
* **Traveller_Type (Categorical):** The purpose of travel (e.g., Business, Leisure) can set different expectations for service, comfort, and price, thereby influencing the final satisfaction level.
* **Class (Categorical):** The cabin class (e.g., Economy, Business) is a primary determinant of the in-flight experience, including seat comfort, service quality, and amenities. It is a powerful predictor of satisfaction.
* **Verified (Categorical):** This flag indicates the authenticity of the review. It could potentially correlate with more genuine or extreme opinions, making it a useful feature to test.
* **Start_Location & End_Location (Categorical, High Cardinality):** These features capture the geographical context of the flight. The specific route can influence satisfaction due to factors like flight duration (long-haul vs. short-haul), airport quality, and crew assignments.
  * **Special Handling:** Due to the high number of unique locations (high cardinality), a standard One-Hot Encoder is unsuitable. We instead use a custom Frequency Encoder, which replaces each location with the number of times it appears in the training data. This captures how common each origin and destination is without creating an excessive number of features.

Features Excluded and Why:
* **Rating:** Excluded to prevent data leakage, as our target variable satisfaction is directly derived from it. Including this would result in a model that appears perfect but is useless for real-world prediction.
* **Flying_Month & Flying_Year:** These features were initially considered to capture seasonality and long-term trends. However, a data quality check revealed a significant number of missing values, making them unreliable for our model. They were therefore excluded in favor of more complete features.
* **Route:** While potentially useful, it is redundant now that we are using Start_Location and End_Location. It also suffers from the same high-cardinality issue.
* **Passenger_Name, Review_content, Review_title:** Excluded as they are identifiers or raw text. The essential information from Review_content has already been captured and quantified in the Sentiment_Score feature.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    A custom transformer to encode categorical features by their frequency.

    Every value is replaced by the number of rows in the fitting data that
    held the same value in the same column. Values that were not seen during
    ``fit`` are encoded as 0.

    The encoder has no hyper-parameters, so it defines no ``__init__``. Its
    learned state is stored in ``freq_map_``, which (following the
    scikit-learn convention for trailing-underscore attributes) only exists
    once ``fit`` has been called.

    Attributes:
        freq_map_ (dict): Maps each fitted column name to a
            ``{category: count}`` dictionary.
    """

    def fit(self, X, y=None):
        """
        Learn the occurrence count of every category in every column of X.

        Args:
            X (pandas.DataFrame): Columns to encode.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            FrequencyEncoder: The fitted encoder (``self``).
        """
        # Build a fresh mapping on every call so that refitting on different
        # columns never leaves stale entries from an earlier fit behind.
        self.freq_map_ = {
            col: X[col].value_counts().to_dict() for col in X.columns
        }
        return self

    def transform(self, X, y=None):
        """
        Replace every category in X with the count learned during ``fit``.

        Args:
            X (pandas.DataFrame): Frame whose columns were all seen in ``fit``.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            pandas.DataFrame: A copy of X with counts in place of categories.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
            KeyError: If X contains a column that was not present in ``fit``.
        """
        # Imported locally so the class only relies on names it brings in itself.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'freq_map_')

        X_transformed = X.copy()
        for col in X.columns:
            # Map the learned frequencies. Handle unknown categories by mapping them to 0.
            X_transformed[col] = X_transformed[col].map(self.freq_map_[col]).fillna(0)
        return X_transformed
        
# Column groups, named after the preprocessing each group receives later on.
low_cardinality_categorical = ['Traveller_Type', 'Class', 'Verified']
high_cardinality_categorical = ['Start_Location', 'End_Location']
numerical_features = ['Sentiment_Score']

# Model inputs (in the column order used for X_full) and the label column.
features = [
    'Traveller_Type',
    'Class',
    'Verified',
    'Sentiment_Score',
    'Start_Location',
    'End_Location',
]
target = 'satisfaction'

# Work on a copy so the casts below never touch df_reviews itself.
X_full = df_reviews[features].copy()
y = df_reviews[target]

print("Converting categorical features to string type for consistency...")
# Cast every categorical column to str in a single astype call.
categorical_columns = low_cardinality_categorical + high_cardinality_categorical
X_full = X_full.astype({name: str for name in categorical_columns})

# Step 4: Split The Data and Train The Model

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Data split with final features: Training ({X_train.shape}), Testing ({X_test.shape})")

# One preprocessing branch per feature group.
numeric_branch = ('num', StandardScaler(), numerical_features)
low_cardinality_branch = (
    'cat_low',
    OneHotEncoder(handle_unknown='ignore'),
    low_cardinality_categorical,
)
high_cardinality_branch = (
    'cat_high',
    FrequencyEncoder(),
    high_cardinality_categorical,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_branch, low_cardinality_branch, high_cardinality_branch]
)

# Chain preprocessing and the classifier so both are fitted and applied together.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

print("\nTraining the final RandomForestClassifier model...")
pipeline.fit(X_train, y_train)
print("Model training complete!")

# Step 5: Evaluate The Model

In [None]:
print("\n--- Final Model Evaluation ---")
y_pred = pipeline.predict(X_test)

# Headline scores on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Per-class breakdown of precision, recall and F1.
print("\nClassification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=['Dissatisfied (0)', 'Satisfied (1)'],
)
print(report)

# Visualize the Final Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
class_names = ['Dissatisfied', 'Satisfied']
plt.figure(figsize=(8, 6))
# heatmap returns the Axes it drew on; label that Axes directly.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('Final Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# PART C: INFERENCE FUNCTION

In [None]:
def predict_satisfaction(raw_input_data, model_pipeline):
    """
    Accepts raw input data as a dictionary, converts it to a DataFrame,
    and returns a natural language prediction for passenger satisfaction.

    Args:
        raw_input_data (dict): A dictionary containing all the feature values.
        model_pipeline (Pipeline): The trained scikit-learn pipeline. It must
            provide ``predict`` and ``predict_proba``; when it also exposes
            ``classes_``, that attribute is used to locate the probability
            of the predicted class.

    Returns:
        str: A human-readable prediction string of the form
        ``"The passenger is predicted to be: <label> (Confidence: <pct>)"``.
    """
    # Convert dictionary to a single-row DataFrame
    # The structure must match the training data exactly
    input_df = pd.DataFrame([raw_input_data])

    # Cast the categorical inputs to str so their values have the same type
    # as the string-cast categorical columns the pipeline was trained on.
    categorical_features = ['Traveller_Type', 'Class', 'Verified', 'Start_Location', 'End_Location']
    for col in categorical_features:
        if col in input_df.columns:
            input_df[col] = input_df[col].astype(str)

    # Use the pipeline to predict (it handles all preprocessing)
    prediction_numeric = model_pipeline.predict(input_df)[0]
    prediction_proba = model_pipeline.predict_proba(input_df)[0]

    # Convert numeric prediction to natural language
    satisfaction_level = "Satisfied" if prediction_numeric == 1 else "Dissatisfied"

    # predict_proba columns follow the order of classes_, so look the
    # predicted label up there instead of assuming the layout [0, 1]. This
    # also covers a model fitted on a single class, where only one
    # probability column exists. Without classes_ (or if the label is not
    # listed) fall back to the positional layout.
    class_index = 1 if prediction_numeric == 1 else 0
    class_labels = getattr(model_pipeline, "classes_", None)
    if class_labels is not None:
        for position, label in enumerate(class_labels):
            if label == prediction_numeric:
                class_index = position
                break
    confidence = prediction_proba[class_index]

    return f"The passenger is predicted to be: {satisfaction_level} (Confidence: {confidence:.2%})"

# --- Demonstration of the Inference Function ---
print("\n--- Demonstrating the Inference Function ---")

# Example 1: A likely satisfied passenger
sample_satisfied = {
    'Traveller_Type': 'Business',
    'Class': 'Business',
    'Verified': True,
    'Sentiment_Score': 0.85,  # Very positive review
    'Start_Location': 'London',
    'End_Location': 'New York',
}
# 'pipeline' is the trained RF pipeline
prediction1 = predict_satisfaction(sample_satisfied, pipeline)

# Example 2: A likely dissatisfied passenger
sample_dissatisfied = {
    'Traveller_Type': 'Solo Leisure',
    'Class': 'Economy',
    'Verified': False,
    'Sentiment_Score': -0.60,  # Very negative review
    'Start_Location': 'Paris',
    'End_Location': 'Dubai',
}
prediction2 = predict_satisfaction(sample_dissatisfied, pipeline)

# Report both predictions once they have been computed.
print(f"Prediction for Sample 1:\n{prediction1}")
print(f"\nPrediction for Sample 2:\n{prediction2}")