In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [2]:
def load_and_preprocess_data(file_path):
    """Load the CSV and add ratio and per-route aggregate features.

    The input must contain the columns ``travel_distance``,
    ``Quantity (In TON)``, ``start_pin``, ``destination_pin`` and ``amount``.
    Three columns are appended to the loaded frame:

    * ``distance_per_ton`` - ``travel_distance / Quantity (In TON)``. Infinite
      ratios (division by zero) and missing ratios are replaced by the mean of
      the remaining finite ratios.
    * ``frequency`` - number of rows sharing the row's
      ``(start_pin, destination_pin)`` pair.
    * ``avg_route_price`` - mean ``amount`` over the rows sharing that pair.

    NOTE(review): ``avg_route_price`` is computed from ``amount`` over every
    loaded row. If ``amount`` is later used as a prediction target, this
    feature carries target information into any hold-out rows - confirm that
    this is intended.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows (without the ``Unnamed: 0`` column, if present) plus
        the three derived columns, in the order listed above.
    """
    route_keys = ['start_pin', 'destination_pin']

    df = pd.read_csv(file_path)
    # 'Unnamed: 0' is only present when the CSV was saved together with its
    # index; tolerate files that do not have it instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Distance carried per ton; a zero quantity yields +/-inf, which is treated
    # as missing and imputed with the mean of the finite ratios.
    ratio = df['travel_distance'] / df['Quantity (In TON)']
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    df['distance_per_ton'] = ratio.fillna(ratio.mean())

    # Row count and mean amount per route, computed in one pass and attached
    # with a single left merge so every original row is kept.
    route_stats = (
        df.groupby(route_keys)['amount']
        .agg(frequency='size', avg_route_price='mean')
        .reset_index()
    )
    return df.merge(route_stats, on=route_keys, how='left')

In [3]:
def analyze_data_distribution(df):
    """Print summary statistics and IQR-based outlier counts for ``df``.

    Prints ``describe()`` for the ``amount`` column and for each engineered
    or raw feature column, then the number of rows lying outside the
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` fences for ``amount``,
    ``travel_distance`` and ``Quantity (In TON)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    described_features = (
        'travel_distance',
        'Quantity (In TON)',
        'distance_per_ton',
        'frequency',
        'avg_route_price',
    )
    outlier_checked = ('amount', 'travel_distance', 'Quantity (In TON)')

    print("Target Variable (amount) Statistics:")
    print(df['amount'].describe())

    print("\nFeature Statistics:")
    for name in described_features:
        print(f"\n{name}:")
        print(df[name].describe())

    # Count the values falling outside the 1.5 * IQR fences of each column.
    print("\nOutlier Analysis:")
    for name in outlier_checked:
        values = df[name]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile
        below = values < (lower_quartile - 1.5 * spread)
        above = values > (upper_quartile + 1.5 * spread)
        print(f"\nOutliers in {name}: {int((below | above).sum())}")

In [5]:
def improved_preprocessing(df):
    """Drop IQR outliers, label-encode the pin columns and log the target.

    Rows are filtered sequentially on ``amount``, ``travel_distance`` and
    ``Quantity (In TON)``: for each column the quartiles are computed on the
    rows that survived the previous filter, and only rows inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds included) are kept.
    ``start_pin`` and ``destination_pin`` are then replaced by the integer
    codes of two independent ``LabelEncoder`` instances, and ``amount_log``
    is added as ``log1p(amount)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, still carrying their original index labels, with
        encoded pin columns and the extra ``amount_log`` column.
    """
    def _inside_iqr_fences(frame, column):
        # Keep the rows whose value lies within the 1.5 * IQR fences.
        values = frame[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        keep = values.between(first_quartile - 1.5 * spread,
                              third_quartile + 1.5 * spread)
        return frame[keep]

    # Order matters: each filter sees only what the previous one kept.
    kept = df
    for name in ('amount', 'travel_distance', 'Quantity (In TON)'):
        kept = _inside_iqr_fences(kept, name)

    # The encoders are fitted on the filtered rows; the fitted objects are
    # not returned, only their integer codes.
    return kept.assign(
        start_pin=LabelEncoder().fit_transform(kept['start_pin']),
        destination_pin=LabelEncoder().fit_transform(kept['destination_pin']),
        amount_log=np.log1p(kept['amount']),
    )

In [6]:
def create_improved_model(input_dim, learning_rate=0.0001):
    """Build and compile the fully connected regression network.

    Architecture: ``Dense(64) -> BatchNorm -> Dense(32) -> BatchNorm ->
    Dropout(0.2) -> Dense(16) -> BatchNorm -> Dropout(0.1) -> Dense(1)``,
    with ReLU on the hidden ``Dense`` layers and a linear single-unit output.
    The model is compiled with the Adam optimizer and mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    learning_rate : float, default 0.0001
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the legacy `input_dim=` layer argument, which newer Keras releases
        # deprecate in favour of this form.
        tf.keras.Input(shape=(input_dim,)),

        # First hidden block
        Dense(64, activation='relu'),
        BatchNormalization(),

        # Remaining hidden blocks, with progressively lighter dropout
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.1),

        # Single linear unit for the regression output
        Dense(1)
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    return model

In [7]:
def train_with_cv(X, y, input_dim, n_splits=5, epochs=100, batch_size=32,
                  patience=10, random_state=42, verbose=1):
    """Train a fresh model on each K-fold split and return the mean fold loss.

    For every fold a new model is built with ``create_improved_model``,
    fitted on the training part with early stopping on ``val_loss``
    (best weights restored), and then evaluated on the fold's validation part.

    NOTE: the validation part of each fold is used both to pick the stopping
    epoch and to compute the reported score, so the returned value is an
    optimistic estimate of the error on unseen data.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample. NumPy arrays, pandas frames
        and nested lists are accepted; the value is converted with
        ``np.asarray`` so rows are always selected by position.
    y : array-like
        Targets aligned row-by-row with ``X``; converted the same way.
    input_dim : int
        Number of input features, forwarded to ``create_improved_model``.
    n_splits : int, default 5
        Number of folds.
    epochs : int, default 100
        Maximum number of training epochs per fold.
    batch_size : int, default 32
        Mini-batch size used for fitting.
    patience : int, default 10
        Early-stopping patience in epochs.
    random_state : int, default 42
        Seed for the shuffled fold assignment.
    verbose : int, default 1
        Verbosity forwarded to ``model.fit``.

    Returns
    -------
    The mean of the per-fold values returned by ``model.evaluate``.
    """
    # Positional indexing with the fold index arrays only works on ndarrays;
    # a DataFrame/Series would interpret the indices as labels.
    X = np.asarray(X)
    y = np.asarray(y)

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X), start=1):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # A new, untrained model per fold so no weights leak between folds.
        model = create_improved_model(input_dim)

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=patience,
            restore_best_weights=True
        )

        model.fit(
            X_train_fold, y_train_fold,
            validation_data=(X_val_fold, y_val_fold),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping],
            verbose=verbose
        )

        score = model.evaluate(X_val_fold, y_val_fold, verbose=0)
        scores.append(score)
        print(f'Fold {fold}: MSE = {score}')

    return np.mean(scores)

In [None]:
if __name__ == "__main__":
    # Seed NumPy and TensorFlow so weight initialisation and dropout start
    # from the same state on every Restart & Run All (as far as the backend
    # allows).
    np.random.seed(42)
    tf.random.set_seed(42)

    # Load, inspect and clean the data.
    df = load_and_preprocess_data('data.csv')
    analyze_data_distribution(df)
    df_processed = improved_preprocessing(df)

    categorical_features = ['start_pin', 'destination_pin']
    numeric_features = ['travel_distance', 'Quantity (In TON)',
                        'distance_per_ton', 'frequency', 'avg_route_price']

    # Unscaled feature matrix (pins first, then numeric columns) and the
    # log-transformed target.
    X = df_processed[categorical_features + numeric_features].to_numpy(dtype=float)
    y = df_processed['amount_log'].to_numpy()
    input_dim = X.shape[1]

    # Hold out the test set BEFORE fitting anything, so neither the scaler
    # nor early stopping ever sees the rows used for the final metrics.
    # NOTE: `avg_route_price` is computed from `amount` over the full dataset
    # in load_and_preprocess_data, so that feature still carries target
    # information from the test rows; the metrics below remain optimistic
    # until it is derived from training rows only.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and apply it to both parts.
    # Only the numeric columns are scaled; the encoded pins are left as-is.
    scaler = RobustScaler()
    first_numeric = len(categorical_features)
    X_train[:, first_numeric:] = scaler.fit_transform(X_train[:, first_numeric:])
    X_test[:, first_numeric:] = scaler.transform(X_test[:, first_numeric:])

    # Cross-validate on the training portion only.
    mean_mse = train_with_cv(X_train, y_train, input_dim)
    print(f"\nMean MSE across all folds: {mean_mse}")

    # Carve a validation split out of the training data for early stopping;
    # the test set stays untouched until the final evaluation.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    final_model = create_improved_model(input_dim)
    early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

    history = final_model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=1
    )

    # Make predictions; flatten the (n, 1) network output to match y_test.
    predictions = final_model.predict(X_test).ravel()

    # Transform predictions back to original scale (inverse of log1p).
    predictions_original = np.expm1(predictions)
    y_test_original = np.expm1(y_test)

    # Calculate metrics on the original amount scale.
    mse = mean_squared_error(y_test_original, predictions_original)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_original, predictions_original)

    print("\nFinal Model Metrics:")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")