In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Vehicle Price Prediction - Model Training
# AI Term Project - XGBoost Implementation

# ============================================================================
# PART 1: IMPORTS AND SETUP
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# One seed, reused by NumPy here and passed as random_state in later cells.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance: matplotlib style sheet first, then the seaborn palette on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print(
    "Libraries imported successfully!",
    f"XGBoost version: {xgb.__version__}",
    sep="\n",
)

In [None]:
# ============================================================================
# PART 2: DATA LOADING
# ============================================================================

# Read the raw vehicle sales records into a DataFrame.
df = pd.read_csv('/kaggle/input/vehicle-sales-data/car_prices.csv')  # Update path as needed

print("\n" + "="*70, "DATASET OVERVIEW", "="*70, sep="\n")
print(f"Dataset shape: {df.shape}")

# Each overview section is a heading followed by the corresponding summary object.
for heading, summary in (
    ("First few rows", df.head()),
    ("Data types", df.dtypes),
    ("Missing values", df.isnull().sum()),
    ("Basic statistics", df.describe()),
):
    print(f"\n{heading}:")
    print(summary)

In [None]:
# ============================================================================
# PART 3: OUTLIER DETECTION AND FILTERING
# ============================================================================

print("\n" + "="*70, "OUTLIER FILTERING", "="*70, sep="\n")

# Row count before any filtering; used for the removal report at the end.
original_size = len(df)

# Domain-knowledge sanity ranges. `between` is inclusive on both ends by
# default; inclusive='right' gives the "strictly greater than, up to" form.
plausible_rows = (
    df['year'].between(1990, 2026)                                # vehicles from 1990 onwards
    & df['odometer'].between(0, 500000, inclusive='right')        # positive, realistic mileage
    & df['sellingprice'].between(500, 150000, inclusive='right')  # no suspiciously low / extreme luxury prices
    & df['mmr'].between(0, 150000, inclusive='right')
    & df['condition'].between(1, 49)
)
df_filtered = df[plausible_rows].copy()

# Percentile fence on the selling price. Note that Q1/Q3 here are the 1st and
# 99th percentiles (not the quartiles), widened by 1.5x their spread.
prices = df_filtered['sellingprice']
Q1 = prices.quantile(0.01)
Q3 = prices.quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered = df_filtered[prices.between(lower_bound, upper_bound)]

# Report how many rows the two filtering passes removed in total.
removed = original_size - len(df_filtered)
print(f"Original dataset: {original_size:,} rows")
print(f"After filtering: {len(df_filtered):,} rows")
print(f"Removed: {removed:,} rows ({100*removed/original_size:.2f}%)")


In [None]:
# ============================================================================
# PART 4: STRATIFIED SAMPLING
# ============================================================================

print("\n" + "="*70)
print("STRATIFIED SAMPLING")
print("="*70)

# Target sample size: 20% of the filtered data, capped at 100,000 rows
target_size = min(100000, int(len(df_filtered) * 0.2))
total_rows = len(df_filtered)

# Stratification keys are built as standalone Series rather than written into
# df_filtered. Grouping by external Series (a) leaves the upstream frame
# untouched and (b) does not rely on groupby.apply passing the grouping columns
# to the function -- behaviour that newer pandas releases deprecate and then
# drop, which would make a later drop(['year_bin', 'price_bin']) raise KeyError.
year_bin = pd.cut(df_filtered['year'], bins=10, labels=False).rename('year_bin')
price_bin = pd.qcut(
    df_filtered['sellingprice'], q=10, labels=False, duplicates='drop'
).rename('price_bin')

# Perform stratified sampling: each (year_bin, price_bin) stratum contributes
# its proportional share of target_size, at least one row, and never more rows
# than it contains.
df_sample = (
    df_filtered
    .groupby([year_bin, price_bin], group_keys=False)
    .apply(
        lambda x: x.sample(
            min(len(x), max(1, int(len(x) * target_size / total_rows))),
            random_state=RANDOM_STATE,
        )
    )
    .reset_index(drop=True)
)

print(f"Target sample size: {target_size:,}")
print(f"Actual sample size: {len(df_sample):,}")
print(f"Sampling ratio: {100*len(df_sample)/len(df_filtered):.2f}%")

# Verify distribution preservation
print("\nYear distribution comparison:")
print("Original:")
print(df_filtered['year'].value_counts(bins=5, sort=False).head())
print("\nSampled:")
print(df_sample['year'].value_counts(bins=5, sort=False).head())

In [None]:
# ============================================================================
# PART 5: HANDLE MISSING VALUES
# ============================================================================

print("\n" + "="*70, "HANDLE MISSING VALUES", "="*70, sep="\n")

# Missing transmission entries become the explicit category 'unknown'.
transmission_filled = df_sample['transmission'].fillna('unknown')
df_sample['transmission'] = transmission_filled

# Show the remaining per-column null counts.
print("Missing values handled:", df_sample.isnull().sum(), sep="\n")

In [None]:
# ============================================================================
# PART 6: ENCODING CATEGORICAL VARIABLES
# ============================================================================

print("\n" + "="*70, "ENCODING CATEGORICAL VARIABLES", "="*70, sep="\n")

# One LabelEncoder per categorical column, kept in a dict keyed by column name
# so the same mappings can be reused at prediction time.
categorical_cols = ['make', 'model', 'body', 'transmission']
label_encoders = {}

for col in categorical_cols:
    # Values are cast to str before fitting, so every entry is encoded as text.
    as_text = df_sample[col].astype(str)
    le = LabelEncoder()
    encoded = le.fit_transform(as_text)
    df_sample[f'{col}_encoded'] = encoded
    label_encoders[col] = le
    print(f"{col}: {len(le.classes_)} unique values")


In [None]:
# ============================================================================
# PART 7: PREPARE FEATURES AND TARGET
# ============================================================================

print("\n" + "="*70, "PREPARING FEATURES AND TARGET", "="*70, sep="\n")

# Model inputs: three numeric columns plus the four label-encoded categoricals.
# 'mmr' is deliberately left out of the feature set.
feature_cols = [
    'year',
    'condition',
    'odometer',
    'make_encoded',
    'model_encoded',
    'body_encoded',
    'transmission_encoded',
]

# Feature matrix and regression target.
X = df_sample[feature_cols]
y = df_sample['sellingprice']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nFeatures used:")
print("\n".join(f"  {rank}. {name}" for rank, name in enumerate(feature_cols, 1)))


In [None]:
# ============================================================================
# PART 8: TRAIN-TEST SPLIT
# ============================================================================

print("\n" + "="*70, "TRAIN-TEST SPLIT", "="*70, sep="\n")

# Hold out 20% of the rows for testing; the split is seeded with RANDOM_STATE.
split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print("Split ratio: 80/20")

In [None]:
# ============================================================================
# PART 9: MODEL TRAINING - XGBOOST
# ============================================================================

print("\n" + "="*70, "XGBOOST MODEL TRAINING", "="*70, sep="\n")

# Hyperparameters passed straight to the XGBRegressor constructor.
params = dict(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric='rmse',
)

print("Model parameters:")
print("\n".join(f"  {key}: {value}" for key, value in params.items()))

# Fit the regressor on the training split only.
print("\nTraining XGBoost model...")
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train, verbose=False)
print("Training completed!")


In [None]:
# ============================================================================
# PART 10: MODEL EVALUATION
# ============================================================================

print("\n" + "="*70, "MODEL EVALUATION", "="*70, sep="\n")


def _rmse(actual, predicted):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(actual, predicted))


def _mape(actual, predicted):
    """Mean absolute percentage error, expressed in percent."""
    return np.mean(np.abs((actual - predicted) / actual)) * 100


# Predictions on both splits so train and test metrics can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Each metric is computed once per split.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)
train_rmse, test_rmse = _rmse(y_train, y_train_pred), _rmse(y_test, y_test_pred)
train_mape, test_mape = _mape(y_train, y_train_pred), _mape(y_test, y_test_pred)

# Fixed-width comparison table; R² is shown with 4 decimals, the rest with 2.
rule = "-" * 50
print("\nPERFORMANCE METRICS:")
print(rule)
print(f"{'Metric':<20} {'Training':<15} {'Test':<15}")
print(rule)
print(f"{'R² Score':<20} {train_r2:<15.4f} {test_r2:<15.4f}")
for label, train_value, test_value in (
    ('MAE ($)', train_mae, test_mae),
    ('RMSE ($)', train_rmse, test_rmse),
    ('MAPE (%)', train_mape, test_mape),
):
    print(f"{label:<20} {train_value:<15.2f} {test_value:<15.2f}")
print(rule)

# 5-fold cross-validated R² on the training split.
print("\nPerforming 5-fold cross-validation...")
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print(f"CV R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [None]:
# ============================================================================
# PART 11: FEATURE IMPORTANCE ANALYSIS
# ============================================================================

print("\n" + "="*70, "FEATURE IMPORTANCE ANALYSIS", "="*70, sep="\n")

# Pair every feature name with the fitted model's importance score and rank
# them from most to least important.
importance_df = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

print("\nFeature Importance Ranking:")
print("-" * 40)
for feature, importance in zip(importance_df['feature'], importance_df['importance']):
    print(f"{feature:<25} {importance:.4f}")

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance Score')
ax.set_title('Feature Importances - XGBoost Model')
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
print("\nFeature importance plot saved as 'feature_importance.png'")
plt.show()

In [None]:
# ============================================================================
# PART 12: SAVE MODEL AND ENCODERS
# ============================================================================

print("\n" + "="*70, "SAVING MODEL AND ENCODERS", "="*70, sep="\n")

# Persist the fitted model, the per-column label encoders and the feature
# column order, each to its own joblib file in the working directory.
for artifact, path, label in (
    (model, 'vehicle_price_model.pkl', 'Model'),
    (label_encoders, 'label_encoders.pkl', 'Label encoders'),
    (feature_cols, 'feature_columns.pkl', 'Feature columns'),
):
    joblib.dump(artifact, path)
    print(f"{label} saved as '{path}'")

# Split sizes, test-set metrics and the importance table, saved for later use.
statistics = dict(
    train_size=len(X_train),
    test_size=len(X_test),
    r2_score=test_r2,
    mae=test_mae,
    rmse=test_rmse,
    mape=test_mape,
    feature_importance=importance_df.to_dict(),
)
joblib.dump(statistics, 'model_statistics.pkl')
print("Model statistics saved as 'model_statistics.pkl'")

In [None]:
# ============================================================================
# PART 13: EXAMPLE PREDICTIONS
# ============================================================================

print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)

# Hand-written vehicles used to smoke-test the trained model.
examples = [
    {'year': 2015, 'make': 'Ford', 'model': 'F-150', 'body': 'Truck', 
     'transmission': 'automatic', 'condition': 35, 'odometer': 50000},
    {'year': 2020, 'make': 'Toyota', 'model': 'Camry', 'body': 'Sedan',
     'transmission': 'automatic', 'condition': 40, 'odometer': 25000},
    {'year': 2010, 'make': 'Honda', 'model': 'Civic', 'body': 'Sedan',
     'transmission': 'automatic', 'condition': 25, 'odometer': 120000}
]

print("\nSample predictions:")
print("-" * 70)
for i, ex in enumerate(examples, 1):
    # Prepare features: a one-row frame holding the raw example values
    ex_df = pd.DataFrame([ex])
    
    # Encode categoricals with the encoders fitted during training. A value the
    # encoder never saw still falls back to code 0 (so predictions are
    # unchanged), but code 0 is the encoder's first real class -- record the
    # substitution so it can be reported instead of passing silently.
    unseen = []
    for col in categorical_cols:
        encoder = label_encoders[col]
        if ex[col] in encoder.classes_:
            ex_df[f'{col}_encoded'] = encoder.transform([ex[col]])[0]
        else:
            ex_df[f'{col}_encoded'] = 0
            unseen.append(f"{col}={ex[col]!r} (treated as {encoder.classes_[0]!r})")
    
    # Make prediction using the columns in the exact training order
    pred = model.predict(ex_df[feature_cols])[0]
    
    print(f"\nExample {i}:")
    print(f"  {ex['year']} {ex['make']} {ex['model']}")
    print(f"  Odometer: {ex['odometer']:,} miles | Condition: {ex['condition']}")
    print(f"  Predicted Price: ${pred:,.2f}")
    if unseen:
        print(f"  WARNING: categories not seen in training: {'; '.join(unseen)}")
        print("  The predicted price above may be unreliable.")

print("\n" + "="*70)
print("MODEL TRAINING COMPLETE!")
print("="*70)
print("\nFiles created:")
print("  - vehicle_price_model.pkl")
print("  - label_encoders.pkl")
print("  - feature_columns.pkl")
print("  - model_statistics.pkl")
print("  - feature_importance.png")
print("\nYou can now use these files in your Streamlit application!")