In [None]:
# Import the unified training data system
from unifiedTrainingData import (
    createUnifiedTrainingData, 
    loadUnifiedTrainingData, 
    getUnifiedTrainingDataStats,
    prepareUnifiedTrainingDataForModel
)

# Keywords to collect training data for, grouped by theme
keywords = [
    # Car makers
    'tesla', 'bmw', 'mercedes', 'audi', 'volkswagen',
    'toyota', 'honda', 'ford', 'chevrolet', 'hyundai',
    'kia', 'nissan', 'volvo', 'lexus', 'porsche',
    # Powertrain and autonomy
    'electric car', 'ev', 'electric vehicle', 'hybrid car',
    'autonomous car', 'self driving car',
    # Mobility services
    'car sharing', 'ride sharing', 'uber', 'lyft', 'car rental',
    # Ownership and upkeep
    'car insurance', 'car maintenance', 'car repair',
    'car dealership', 'car financing', 'car loan', 'car lease',
]

# Build the unified training file for June 2025 from the keywords above.
# NOTE(review): earlier notes in this cell stated that embeddings, ChatGPT
# trends and Google Trends are saved together in a single JSON file, and that
# random trends are generated when the Google Trends API fails -- confirm
# against unifiedTrainingData before relying on either.
summary = createUnifiedTrainingData(
    keywords=keywords,
    startDate="2025-06-01",
    endDate="2025-06-30",
    model='text-embedding-ada-002',
    filename='data/unified_training_data.json',
)

# Report the outcome; 'success_rate' is rendered as a percentage
print("\n✅ Training data creation completed!")
print(f"Success rate: {summary['success_rate']:.2%}")

In [None]:
# Reload the training data and confirm what it contains
print("\nVerifying created training data...")
data = loadUnifiedTrainingData()

print(f"Total keywords in training data: {len(data)}")

# Peek at the first entry to show the per-keyword structure: each value
# unpacks into (embedding, ChatGPT trend, Google trend).
if data:
    sample_keyword = next(iter(data))
    embedding, chatgpt_trend, google_trend = data[sample_keyword]
    print(f"\nSample data for '{sample_keyword}':")
    print(f"  Embedding shape: {embedding.shape}")
    print(f"  ChatGPT trend shape: {chatgpt_trend.shape}")
    print(f"  Google trend shape: {google_trend.shape}")
    print(f"  Google trend sample: {google_trend[:5]}")
    print(f"  ChatGPT trend sample: {chatgpt_trend[:5]}")

# Hand the loaded data to the embedding KNN model
print("\nPreparing data for embedding KNN model...")
prepareUnifiedTrainingDataForModel(data)
print("✅ Data prepared for model use!")

# Closing statistics for the training file
stats = getUnifiedTrainingDataStats()
print("\nFinal training data statistics:")
print(f"  Total keywords: {stats['total_keywords']}")
print(f"  File size: {stats['file_size_mb']:.2f} MB")
print(f"  Keywords: {stats['keywords'][:10]}...")  # only the first 10 are shown

In [None]:
# Test inference with the trained model
from embeddingsKNN import quickKeywordAnalysis, analyzeNewKeyword

# Keywords to run through the trained model
test_keywords = ['tesla model 3', 'bmw i4', 'hyundai ioniq']

# (label, result key, format spec) for each line of the per-keyword report;
# an empty spec falls back to the value's default string form.
report_fields = (
    ("Google trend mean", 'google_trend_mean', ".1f"),
    ("Predicted ChatGPT mean", 'predicted_chatgpt_mean', ".1f"),
    ("Prediction confidence", 'prediction_confidence', ".3f"),
    ("Nearest neighbors", 'nearest_neighbors', ""),
)

print("Testing inference with trained model...")
print("=" * 50)

for keyword in test_keywords:
    print(f"\nTesting: {keyword}")
    try:
        # Quick analysis over the June 2025 window
        result = quickKeywordAnalysis(keyword, "2025-06-01", "2025-06-30")
        # Lines are printed one at a time so that anything already reported
        # stays visible if a later field is missing.
        for label, key, spec in report_fields:
            print(f"  {label}: {format(result[key], spec)}")

    except Exception as e:
        print(f"  ❌ Error: {e}")

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from embeddingsKNN import quickKeywordAnalysis, analyzeNewKeyword
from unifiedTrainingData import loadUnifiedTrainingData, prepareUnifiedTrainingDataForModel

# Load the training data and hand it to the model before running any analysis
print("Loading unified training data...")
data = loadUnifiedTrainingData()
prepareUnifiedTrainingDataForModel(data)
print(f"✅ Model prepared with {len(data)} keywords")

# Keywords to analyze; each one gets a row of two panels in the grid below
test_keywords = ['tesla model 3', 'bmw i4', 'hyundai ioniq', 'electric suv', 'autonomous driving']

# squeeze=False keeps `axes` two-dimensional even when there is only one
# keyword, so axes[i, 0] / axes[i, 1] always works without a manual reshape.
fig, axes = plt.subplots(
    len(test_keywords), 2,
    figsize=(15, 5*len(test_keywords)),
    squeeze=False,
)

print(f"Analyzing and plotting results for {len(test_keywords)} keywords...")

for i, keyword in enumerate(test_keywords):
    print(f"Processing {i+1}/{len(test_keywords)}: {keyword}")
    
    try:
        # Get detailed analysis
        result = analyzeNewKeyword(keyword, "2025-06-01", "2025-06-30")
        
        # Extract the three series and build a daily date axis for the same window
        google_trends = result['google_trends']['trend_values']
        predicted_chatgpt = result['predicted_chatgpt_trend']['trend_values']
        actual_chatgpt = result['actual_chatgpt_trend']['trend_values']
        dates = pd.date_range("2025-06-01", "2025-06-30", freq='D')
        
        # Left panel: Google Trends vs predicted and actual ChatGPT trends
        ax1 = axes[i, 0]
        ax1.plot(dates, google_trends, 'b-', label='Google Trends', linewidth=2, alpha=0.8)
        ax1.plot(dates, predicted_chatgpt, 'r--', label='Predicted ChatGPT', linewidth=2, alpha=0.8)
        ax1.plot(dates, actual_chatgpt, 'g-', label='Actual ChatGPT', linewidth=2, alpha=0.8)
        ax1.set_title(f'Trend Comparison: {keyword}', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Trend Value', fontsize=12)
        ax1.legend(fontsize=10)
        ax1.grid(True, alpha=0.3)
        ax1.tick_params(axis='x', rotation=45)
        
        # Annotate with the correlation between the Google series and the
        # predicted ChatGPT series (not the actual one)
        correlation = np.corrcoef(google_trends, predicted_chatgpt)[0, 1]
        ax1.text(0.02, 0.98, f'Correlation: {correlation:.3f}', 
                transform=ax1.transAxes, fontsize=10, 
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
        
        # Right panel: similarity of each nearest neighbor
        ax2 = axes[i, 1]
        neighbors = result['neighbors']
        neighbor_names = [n['keyword'] for n in neighbors]
        similarities = [n['similarity'] for n in neighbors]
        
        bars = ax2.bar(neighbor_names, similarities, color='skyblue', alpha=0.7)
        ax2.set_title(f'Nearest Neighbors: {keyword}', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Similarity Score', fontsize=12)
        ax2.set_ylim(0, 1)
        
        # Add similarity values on bars
        for bar, similarity in zip(bars, similarities):
            ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                    f'{similarity:.3f}', ha='center', va='bottom', fontsize=9)
        
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True, alpha=0.3)
        
        # Add prediction confidence
        confidence = result['analysis_summary']['prediction_confidence']
        ax2.text(0.02, 0.98, f'Confidence: {confidence:.3f}', 
                transform=ax2.transAxes, fontsize=10, 
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))
        
    except Exception as e:
        print(f"❌ Error processing '{keyword}': {e}")
        # The failure may have occurred after some lines or bars were already
        # drawn; clear both panels so the placeholders below are not overlaid
        # on a partial plot.
        axes[i, 0].clear()
        axes[i, 1].clear()
        # Create empty plots for failed keywords
        axes[i, 0].text(0.5, 0.5, f'Error: {str(e)[:50]}...', 
                       ha='center', va='center', transform=axes[i, 0].transAxes,
                       bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8))
        axes[i, 0].set_title(f'Error: {keyword}', fontsize=14, fontweight='bold')
        axes[i, 1].text(0.5, 0.5, 'No data available', 
                       ha='center', va='center', transform=axes[i, 1].transAxes)
        axes[i, 1].set_title(f'No neighbors: {keyword}', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Summary statistics: one table row per test keyword
print(f"\n{'='*60}")
print("SUMMARY STATISTICS")
print(f"{'='*60}")

summary_data = []
for keyword in test_keywords:
    try:
        result = quickKeywordAnalysis(keyword, "2025-06-01", "2025-06-30")
        # Numeric fields are pre-formatted as strings so the table prints
        # with fixed precision
        summary_data.append({
            'Keyword': keyword,
            'Google Trend Mean': f"{result['google_trend_mean']:.1f}",
            'Predicted ChatGPT Mean': f"{result['predicted_chatgpt_mean']:.1f}",
            'Prediction Confidence': f"{result['prediction_confidence']:.3f}",
            'Trend Magnitude': result['trend_magnitude'],
            'Volatility Level': result['volatility_level'],
            'Nearest Neighbors': ', '.join(result['nearest_neighbors'][:3])
        })
    except Exception as e:
        # Report why the keyword failed; the placeholder row on its own only
        # says 'Error' and hides the cause.
        print(f"❌ Error summarizing '{keyword}': {e}")
        # Keep a row for the keyword so the table still lists every input
        summary_data.append({
            'Keyword': keyword,
            'Google Trend Mean': 'Error',
            'Predicted ChatGPT Mean': 'Error',
            'Prediction Confidence': 'Error',
            'Trend Magnitude': 'Error',
            'Volatility Level': 'Error',
            'Nearest Neighbors': 'Error'
        })

# Create summary table
summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Additional detailed analysis plot
print(f"\n{'='*60}")
print("DETAILED ANALYSIS PLOT")
print(f"{'='*60}")

# Four-panel deep dive on a single keyword
if test_keywords:
    keyword = test_keywords[0]  # Use first keyword for detailed analysis
    detail_fig = None  # set once the figure exists so the handler can close it
    try:
        result = analyzeNewKeyword(keyword, "2025-06-01", "2025-06-30")
        
        # Pull every series out of the result before creating the figure, so a
        # missing key fails here instead of leaving a blank 2x2 canvas behind.
        dates = pd.date_range("2025-06-01", "2025-06-30", freq='D')
        google_trends = result['google_trends']['trend_values']
        predicted_chatgpt = result['predicted_chatgpt_trend']['trend_values']
        actual_chatgpt = result['actual_chatgpt_trend']['trend_values']
        neighbors = result['neighbors']
        neighbor_names = [n['keyword'] for n in neighbors]
        similarities = [n['similarity'] for n in neighbors]
        
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        detail_fig = fig
        
        # Plot 1: All trends together
        ax1.plot(dates, google_trends, 'b-', label='Google Trends', linewidth=2)
        ax1.plot(dates, predicted_chatgpt, 'r--', label='Predicted ChatGPT', linewidth=2)
        ax1.plot(dates, actual_chatgpt, 'g-', label='Actual ChatGPT', linewidth=2)
        ax1.set_title(f'Complete Trend Analysis: {keyword}', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Trend Value')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        ax1.tick_params(axis='x', rotation=45)
        
        # Plot 2: Prediction accuracy (actual on x, predicted on y)
        ax2.scatter(actual_chatgpt, predicted_chatgpt, alpha=0.6, color='purple')
        # NOTE(review): the reference diagonal spans 0 to 100, which assumes
        # both series are on a 0-100 scale -- confirm against the model output.
        ax2.plot([0, 100], [0, 100], 'r--', alpha=0.5, label='Perfect Prediction')
        ax2.set_xlabel('Actual ChatGPT Trend')
        ax2.set_ylabel('Predicted ChatGPT Trend')
        ax2.set_title('Prediction Accuracy', fontsize=14, fontweight='bold')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # Plot 3: Google vs predicted ChatGPT over time
        ax3.plot(dates, google_trends, 'b-', label='Google Trends', alpha=0.7)
        ax3.plot(dates, predicted_chatgpt, 'r-', label='Predicted ChatGPT', alpha=0.7)
        ax3.set_xlabel('Date')
        ax3.set_ylabel('Trend Value')
        ax3.set_title('Google vs Predicted ChatGPT Correlation', fontsize=14, fontweight='bold')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        ax3.tick_params(axis='x', rotation=45)
        
        # Plot 4: Neighbor similarities
        bars = ax4.bar(neighbor_names, similarities, color='orange', alpha=0.7)
        ax4.set_title('Nearest Neighbors Similarity', fontsize=14, fontweight='bold')
        ax4.set_ylabel('Similarity Score')
        ax4.set_ylim(0, 1)
        ax4.tick_params(axis='x', rotation=45)
        
        # Add values on bars
        for bar, similarity in zip(bars, similarities):
            ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                    f'{similarity:.3f}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.show()
        
        # Print detailed metrics
        print(f"\nDetailed Metrics for '{keyword}':")
        print(f"  MSE: {result['prediction_metrics']['mse']:.4f}")
        print(f"  MAE: {result['prediction_metrics']['mae']:.4f}")
        print(f"  RMSE: {result['prediction_metrics']['rmse']:.4f}")
        print(f"  Correlation: {result['analysis_summary']['trend_correlation']:.4f}")
        print(f"  Confidence: {result['analysis_summary']['prediction_confidence']:.4f}")
        
    except Exception as e:
        print(f"❌ Error in detailed analysis for '{keyword}': {e}")
        # Release the detail figure if it was created, so a failure does not
        # leave an open figure behind.
        if detail_fig is not None:
            plt.close(detail_fig)

print(f"\n{'='*60}")
print("ANALYSIS COMPLETED")
print(f"{'='*60}")