# Data Exploration - AI/ML Market Analysis

This notebook performs comprehensive exploration of the AI/ML market datasets to understand the structure, quality, and patterns in the data.

## Datasets:
1. **AI_ML_popularity.csv**: Global AI popularity data by country/city with search trends
2. **The_Rise_of_AI.csv**: AI market metrics from 2018-2025 including revenue, adoption rates, job impacts

## Objectives:
- Load and examine all datasets
- Understand data structure and quality
- Identify missing values, outliers, and anomalies
- Generate statistical summaries
- Document initial insights

## 1. Import Required Libraries

In [2]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Utilities
import warnings
import os
from pathlib import Path

# Configuration
# Plot styling: apply matplotlib's 'default' style first, then seaborn's "husl"
# palette (kept in this order, as in the original cell).
plt.style.use('default')
sns.set_palette("husl")

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Display limits used whenever a frame is rendered.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 100,
}.items():
    pd.set_option(option_name, option_value)

# Confirm the setup and record the library versions in use.
for status_line in (
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"Numpy version: {np.__version__}",
):
    print(status_line)

Libraries imported successfully!
Pandas version: 2.3.1
Numpy version: 2.1.3


## 2. Load Datasets

In [3]:
# Define data paths
data_dir = Path('../data/raw')
popularity_file = data_dir / 'AI_ML_popularity.csv'
market_file = data_dir / 'The_Rise_of_AI.csv'

print("Loading datasets...")
print(f"Popularity data: {popularity_file.exists()}")
print(f"Market data: {market_file.exists()}")


def read_csv_with_fallback(csv_path, encodings=('utf-8', 'latin-1')):
    """Read a CSV file, trying each candidate encoding in order.

    Parameters
    ----------
    csv_path : str or Path
        Location of the CSV file.
    encodings : sequence of str
        Encodings to try, first to last. The first one that decodes the file wins.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    ValueError
        If ``encodings`` is empty.
    UnicodeDecodeError
        If none of the encodings can decode the file (the error from the last
        attempt is re-raised).
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(csv_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            # Remember the failure and move on to the next candidate encoding.
            last_error = exc
    if last_error is None:
        raise ValueError("encodings must contain at least one encoding name")
    raise last_error


# Load datasets with proper encoding handling
try:
    popularity_df = read_csv_with_fallback(popularity_file)
    market_df = read_csv_with_fallback(market_file)
except Exception as e:
    print(f"❌ Error loading datasets: {e}")
    # Re-raise so the notebook stops here: every later cell needs both frames,
    # and swallowing the error would only surface as a confusing NameError.
    raise
else:
    print("\n✅ Datasets loaded successfully!")
    print(f"Popularity dataset: {popularity_df.shape}")
    print(f"Market dataset: {market_df.shape}")

Loading datasets...
Popularity data: True
Market data: True

✅ Datasets loaded successfully!
Popularity dataset: (250, 12)
Market dataset: (8, 22)


## 3. Initial Data Examination

In [4]:
# Dataset shapes and basic info
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)

print(f"📊 AI Popularity Dataset Shape: {popularity_df.shape}")
print(f"📊 AI Market Dataset Shape: {market_df.shape}")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n" + "=" * 60)
print("AI POPULARITY DATASET INFO")
print("=" * 60)
popularity_df.info()

print("\n" + "=" * 60)
print("AI MARKET DATASET INFO")
print("=" * 60)
market_df.info()

DATASET OVERVIEW
📊 AI Popularity Dataset Shape: (250, 12)
📊 AI Market Dataset Shape: (8, 22)

AI POPULARITY DATASET INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country                         250 non-null    object 
 1   Ai and ML(Popularity)           72 non-null     float64
 2   City                            200 non-null    object 
 3   Ai and ML(Popularity).1         200 non-null    float64
 4   Top (Searches)                  25 non-null     object 
 5   Popularity                      25 non-null     float64
 6   Rising (Searches)               25 non-null     object 
 7   Rising (Searches) Percentage    25 non-null     object 
 8   Top (Searches).1                25 non-null     object 
 9   Popularity.1                    25 non-null     float64
 10  Rising (Searches).1             25 no

## 4. Data Structure Analysis

In [None]:
# Examine column names and first few rows
banner = "=" * 60

# Preview of the first five records.
print(banner, "AI POPULARITY DATASET - FIRST 5 ROWS", banner, sep="\n")
display(popularity_df.head())

# Numbered listing of every column name (numbering starts at 1).
print("", banner, "AI POPULARITY DATASET - COLUMN NAMES", banner, sep="\n")
popularity_columns = popularity_df.columns
for position in range(len(popularity_columns)):
    print(f"{position + 1:2d}. {popularity_columns[position]}")

In [None]:
# Preview the market dataset: first five rows, then a numbered column listing.
banner = "=" * 60

print(banner, "AI MARKET DATASET - FIRST 5 ROWS", banner, sep="\n")
display(market_df.head())

print("", banner, "AI MARKET DATASET - COLUMN NAMES", banner, sep="\n")
market_columns = market_df.columns
for position in range(len(market_columns)):
    print(f"{position + 1:2d}. {market_columns[position]}")

## 5. Missing Values Analysis

In [None]:
# Missing values analysis for popularity dataset
banner = "=" * 60
print(banner, "MISSING VALUES ANALYSIS - AI POPULARITY DATASET", banner, sep="\n")

# Null count per column, and that count as a percentage of all rows.
missing_pop = popularity_df.isna().sum()
missing_pop_pct = missing_pop / len(popularity_df) * 100

# Keep only the columns that actually have gaps, worst offenders first.
missing_summary_pop = (
    pd.DataFrame({
        'Missing_Count': missing_pop,
        'Missing_Percentage': missing_pop_pct,
    })
    .loc[lambda summary: summary['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_summary_pop.empty:
    print("✅ No missing values found in AI Popularity dataset!")
else:
    display(missing_summary_pop)

In [None]:
# Missing values analysis for market dataset
banner = "=" * 60
print(banner, "MISSING VALUES ANALYSIS - AI MARKET DATASET", banner, sep="\n")

# Null count per column, and that count as a percentage of all rows.
missing_market = market_df.isna().sum()
missing_market_pct = missing_market / len(market_df) * 100

# Keep only the columns that actually have gaps, worst offenders first.
missing_summary_market = (
    pd.DataFrame({
        'Missing_Count': missing_market,
        'Missing_Percentage': missing_market_pct,
    })
    .loc[lambda summary: summary['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_summary_market.empty:
    print("✅ No missing values found in AI Market dataset!")
else:
    display(missing_summary_market)

## 6. Statistical Summary

In [None]:
# Statistical summary for market dataset (numerical columns)
banner = "=" * 60
print(banner, "STATISTICAL SUMMARY - AI MARKET DATASET", banner, sep="\n")

# describe() with default arguments, rendered as a rich table.
display(market_df.describe())

# Check data types
print("", banner, "DATA TYPES - AI MARKET DATASET", banner, sep="\n")
print(market_df.dtypes)

In [None]:
# Statistical summary for popularity dataset
banner = "=" * 60
print(banner, "STATISTICAL SUMMARY - AI POPULARITY DATASET", banner, sep="\n")

# Only numeric columns are summarised; bail out with a note when there are none.
numeric_cols = popularity_df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) == 0:
    print("No numeric columns found for statistical analysis")
else:
    display(popularity_df.loc[:, numeric_cols].describe())

print("", banner, "DATA TYPES - AI POPULARITY DATASET", banner, sep="\n")
print(popularity_df.dtypes)

## 7. Data Quality Assessment

In [None]:
# Data quality assessment function
def assess_data_quality(df: pd.DataFrame, dataset_name: str) -> dict:
    """Print a data-quality report for ``df`` and return the key figures.

    The report covers shape, deep memory usage, missing cells, fully
    duplicated rows and the number of columns per dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    dataset_name : str
        Label shown in the report header.

    Returns
    -------
    dict
        ``shape`` (tuple of rows, columns), ``missing_total`` (count of null
        cells), ``missing_percentage`` (null cells as a percentage of all
        cells; ``0.0`` when the frame has no cells), ``duplicates`` (count of
        rows flagged by ``DataFrame.duplicated``) and ``dtypes`` (Series
        mapping dtype to column count).
    """
    print(f"\n{'='*60}")
    print(f"DATA QUALITY ASSESSMENT - {dataset_name}")
    print(f"{'='*60}")

    # Basic stats
    n_rows, n_cols = df.shape
    print(f"📏 Shape: {n_rows:,} rows × {n_cols} columns")
    print(f"💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Missing values
    missing_total = df.isnull().sum().sum()
    total_cells = n_rows * n_cols
    # A frame with no rows or no columns has no cells; report 0% instead of
    # dividing by zero.
    missing_pct = (missing_total / total_cells) * 100 if total_cells else 0.0
    print(f"❓ Missing values: {missing_total:,} ({missing_pct:.2f}%)")

    # Duplicates: rows identical to an earlier row across every column
    duplicates = df.duplicated().sum()
    print(f"🔄 Duplicate rows: {duplicates:,}")

    # Data types
    print(f"\n📊 Data Types Distribution:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  - {dtype}: {count} columns")

    return {
        'shape': df.shape,
        'missing_total': missing_total,
        'missing_percentage': missing_pct,
        'duplicates': duplicates,
        'dtypes': dtype_counts
    }

# Assess both datasets; the returned summaries are reused when exporting results.
market_quality = assess_data_quality(df=market_df, dataset_name="AI MARKET DATA")
popularity_quality = assess_data_quality(df=popularity_df, dataset_name="AI POPULARITY DATA")

## 8. Initial Visual Exploration

In [None]:
# Create visualizations for market trends
# 2x2 grid of line charts; every panel plots its metric against Year.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'AI Software Revenue Over Time',
        'Global AI Market Value',
        'AI Adoption Rate',
        'Job Impact Analysis',
    ),
    specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)],
)

years = market_df['Year']

# Adoption is stored as text such as "45%": drop the sign and parse as float.
adoption_pct = market_df['AI Adoption (%)'].str.rstrip('%').astype(float)

# (y values, legend name, line colour, grid row, grid column), in drawing order.
# The last two entries share the bottom-right panel (jobs eliminated vs created).
trace_specs = [
    (market_df['AI Software Revenue(in Billions)'], 'AI Revenue', 'blue', 1, 1),
    (market_df['Global AI Market Value(in Billions)'], 'Market Value', 'green', 1, 2),
    (adoption_pct, 'Adoption Rate', 'orange', 2, 1),
    (market_df['Estimated Jobs Eliminated by AI (millions)'], 'Jobs Eliminated', 'red', 2, 2),
    (market_df['Estimated New Jobs Created by AI (millions)'], 'Jobs Created', 'green', 2, 2),
]

for y_values, legend_name, line_colour, grid_row, grid_col in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=years,
            y=y_values,
            mode='lines+markers',
            name=legend_name,
            line=dict(color=line_colour, width=3),
        ),
        row=grid_row,
        col=grid_col,
    )

fig.update_layout(height=800, title_text="AI Market Trends Overview (2018-2025)")
fig.show()

## 9. Detailed Column Analysis

In [None]:
# Detailed analysis of market dataset columns
print("=" * 60)
print("DETAILED COLUMN ANALYSIS - AI MARKET DATASET")
print("=" * 60)

for col in market_df.columns:
    column = market_df[col]
    print(f"\n📊 Column: {col}")
    print(f"   Data Type: {column.dtype}")
    print(f"   Non-null Count: {column.count()}/{len(market_df)}")

    # Branch on "is numeric" rather than "is object": datetime, categorical and
    # string-dtype columns are not object dtype, yet min/mean/std with a float
    # format would fail on them.
    if pd.api.types.is_numeric_dtype(column):
        # For numeric columns
        print(f"   Min: {column.min()}")
        print(f"   Max: {column.max()}")
        print(f"   Mean: {column.mean():.2f}")
        print(f"   Std: {column.std():.2f}")
    else:
        # For string (and any other non-numeric) columns
        unique_vals = column.nunique()
        print(f"   Unique Values: {unique_vals}")
        # Only list the values themselves when there are few enough to read.
        if unique_vals <= 10:
            print(f"   Values: {list(column.unique())}")

## 10. Data Relationships and Correlations

In [None]:
# Prepare numeric data for correlation analysis
# Minimum |r| for a pair of columns to be reported as strongly correlated.
STRONG_CORR_THRESHOLD = 0.7

# Work on a copy so market_df keeps its raw (string) percentage values.
market_numeric = market_df.copy()

# Convert percentage columns
# A column counts as a percentage column when any of its values contains '%'.
# Every row is scanned (not only the first), so a column whose first entry is
# missing is still detected.
percentage_cols = [
    col for col in market_df.columns
    if market_df[col].astype(str).str.contains('%', regex=False).any()
]
print(f"Percentage columns found: {percentage_cols}")

for col in percentage_cols:
    # Strip the trailing '%' and parse; anything unparseable becomes NaN.
    market_numeric[col] = pd.to_numeric(
        market_numeric[col].astype(str).str.rstrip('%'), errors='coerce'
    )

# Select numeric columns for correlation
numeric_cols = market_numeric.select_dtypes(include=[np.number]).columns
print(f"\nNumeric columns for correlation: {list(numeric_cols)}")

# Calculate correlation matrix (needs at least two numeric columns)
if len(numeric_cols) > 1:
    corr_matrix = market_numeric[numeric_cols].corr()

    # Create correlation heatmap; the mask hides the diagonal and the upper
    # triangle so each pair is shown once.
    plt.figure(figsize=(15, 12))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu_r', center=0,
                square=True, mask=mask, cbar_kws={"shrink": .8})
    plt.title('AI Market Data - Correlation Matrix', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Find strongest correlations: visit each unordered pair once (j > i).
    print(f"\n🔗 STRONGEST CORRELATIONS (|r| > {STRONG_CORR_THRESHOLD}):")
    strong_corr = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            corr_val = corr_matrix.iloc[i, j]
            # NaN correlations (e.g. constant columns) fail this comparison and are skipped.
            if abs(corr_val) > STRONG_CORR_THRESHOLD:
                strong_corr.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))

    # Strongest first, by absolute value.
    for var1, var2, pair_corr in sorted(strong_corr, key=lambda pair: abs(pair[2]), reverse=True):
        print(f"   {var1} ↔ {var2}: {pair_corr:.3f}")
    if not strong_corr:
        print("   (none)")
else:
    print("\nNot enough numeric columns for a correlation analysis")

## 11. Initial Insights and Key Findings

In [None]:
# Generate initial insights
print("=" * 60)
print("🔍 INITIAL INSIGHTS AND KEY FINDINGS")
print("=" * 60)

# Sort chronologically so "first" and "last" really are the earliest and latest
# years, and take the year labels from the data instead of hard-coding them.
market_by_year = market_df.sort_values('Year')
start_year = market_by_year['Year'].iloc[0]
end_year = market_by_year['Year'].iloc[-1]

# Market growth analysis: percentage change from the first year to the last
revenue_series = market_by_year['AI Software Revenue(in Billions)']
market_value_series = market_by_year['Global AI Market Value(in Billions)']
revenue_growth = ((revenue_series.iloc[-1] / revenue_series.iloc[0]) - 1) * 100
market_growth = ((market_value_series.iloc[-1] / market_value_series.iloc[0]) - 1) * 100

print(f"📈 AI Software Revenue Growth ({start_year}-{end_year}): {revenue_growth:.1f}%")
print(f"📈 AI Market Value Growth ({start_year}-{end_year}): {market_growth:.1f}%")

# Job impact analysis
# The *_2025 names are kept from the original cell; they hold the values of the
# latest year present in the data.
jobs_eliminated_2025 = market_by_year['Estimated Jobs Eliminated by AI (millions)'].iloc[-1]
jobs_created_2025 = market_by_year['Estimated New Jobs Created by AI (millions)'].iloc[-1]
net_job_impact = jobs_created_2025 - jobs_eliminated_2025

print(f"\n👥 Job Impact by {end_year}:")
print(f"   🔴 Jobs Eliminated: {jobs_eliminated_2025} million")
print(f"   🟢 Jobs Created: {jobs_created_2025} million")
print(f"   ⚖️ Net Impact: {net_job_impact:+.1f} million jobs")

# Adoption trends
# str() first so the parse works whether the cell holds "45%" or a plain number.
adoption_series = market_by_year['AI Adoption (%)']
adoption_start = float(str(adoption_series.iloc[0]).rstrip('%'))
adoption_end = float(str(adoption_series.iloc[-1]).rstrip('%'))
adoption_change = adoption_end - adoption_start

print(f"\n🎯 AI Adoption Trends:")
print(f"   {start_year}: {adoption_start}%")
print(f"   {end_year}: {adoption_end}%")
# Signed format: a decline prints as "-x.x" rather than "+-x.x", and float
# noise such as 37.400000000000006 is rounded away.
print(f"   Change: {adoption_change:+.1f} percentage points")

# NOTE(review): the observations below are fixed text, not computed from the
# data in this cell — confirm them against the figures above.
print("\n💡 Key Observations:")
print("   • Exponential growth in AI market value")
print("   • Steady increase in organizational adoption")
print("   • Job displacement vs creation dynamics")
print("   • Strong correlation between market metrics")

## 12. Data Export for Next Phase

In [None]:
# Save exploration results for next notebooks
results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

# Save data quality summary
# NOTE: this dict is assembled here but not written anywhere in this cell;
# only quality_df below is saved to disk.
quality_summary = dict(
    market_data_quality=market_quality,
    popularity_data_quality=popularity_quality,
    exploration_timestamp=pd.Timestamp.now().isoformat(),
)

# Convert to DataFrame for easy saving: one row per (dataset, metric) pair,
# market rows first, each dataset listing rows, columns, missing_pct.
quality_records = []
for dataset_label, quality in (
    ('AI_Market', market_quality),
    ('AI_Popularity', popularity_quality),
):
    row_count, column_count = quality['shape']
    for metric_name, metric_value in (
        ('rows', row_count),
        ('columns', column_count),
        ('missing_pct', quality['missing_percentage']),
    ):
        quality_records.append(
            {'dataset': dataset_label, 'metric': metric_name, 'value': metric_value}
        )
quality_df = pd.DataFrame(quality_records)

quality_df.to_csv(results_dir / 'data_quality_summary.csv', index=False)
print("📁 Data quality summary saved to ../results/data_quality_summary.csv")

# Closing message with pointers to the next notebook.
for closing_line in (
    "\n✅ Data Exploration Complete!",
    "📋 Next Steps:",
    "   1. Move to 02_data_cleaning.ipynb for data preprocessing",
    "   2. Use insights from this exploration to guide cleaning decisions",
    "   3. Focus on high-correlation variables for feature engineering",
):
    print(closing_line)