# Task 1: Data Engineering & Feature Development
## Career Recommendation Engine

This notebook covers:
- Data loading and analysis
- Feature engineering
- Class imbalance analysis (handling is deferred to model training)
- Feature importance analysis
- Correlation matrix visualization

In [None]:
# Import required libraries
import sys

# Make the project's ../src directory importable for the local modules below.
# The membership test keeps this cell idempotent: re-running it no longer
# appends a duplicate '../src' entry to sys.path each time.
# NOTE(review): the path is relative to the kernel's working directory —
# presumably the notebook is started from its own folder; confirm.
if '../src' not in sys.path:
    sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings raised by later cells.
# Consider narrowing it to specific categories once the noisy ones are known.
warnings.filterwarnings('ignore')

# Project-local modules resolved through the sys.path entry added above.
from preprocessing import DataPreprocessor, split_data
from feature_engineering import FeatureEngineer

# Set style: seaborn grid theme plus a default figure size for all plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ Libraries imported successfully")

## 1. Data Loading and Initial Analysis

In [None]:
# Build the preprocessor around the raw synthetic-profiles CSV.
preprocessor = DataPreprocessor(
    '../data/synthetic_user_profiles_large.csv'
)

# Read the dataset through the preprocessor; `df` is reused by later cells.
df = preprocessor.load_data()

# Preview: a header line on stdout, then the first five rows as cell output.
print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Headline numbers for the loaded dataset.
numeric_columns = ['analytical', 'creative', 'social', 'experience']

print("Dataset Statistics:")
print(f"Total samples: {len(df)}")
print("\nNumerical columns summary:")

# Summary statistics for the numeric columns, shown as the cell's rich output.
df[numeric_columns].describe()

In [None]:
# Report the distinct values of the skills, interests and target-career columns.
def _report_unique(heading, column):
    """Print `heading`, then the count and sorted listing of a column's unique values.

    The values come from `preprocessor.get_unique_values(column)` and are
    returned unchanged so the caller can keep them for later use.
    """
    print(heading)
    values = preprocessor.get_unique_values(column)
    print(f"Total: {len(values)}")
    print(sorted(values))
    return values


unique_skills = _report_unique("Unique Skills:", 'skills')
unique_interests = _report_unique("\nUnique Interests:", 'interests')
unique_careers = _report_unique("\nUnique Careers:", 'target_careers')

## 2. Data Preprocessing

In [None]:
# Execute the preprocessor's full pipeline; it returns the processed frame,
# the target matrix and the list of career names.
df_processed, y_binary, career_names = preprocessor.get_preprocessed_data()

# Report the size of each returned artifact, one line per item.
for label, value in (
    ("\nPreprocessed data shape:", df_processed.shape),
    ("Target matrix shape:", y_binary.shape),
    ("Number of careers:", len(career_names)),
):
    print(label, value)

In [None]:
# Histogram of each personality trait, one panel per trait, with its mean marked.
traits = ['analytical', 'creative', 'social']
colors = ['#3498db', '#e74c3c', '#2ecc71']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for idx, trait in enumerate(traits):
    panel = axes[idx]
    scores = df[trait]
    mean_score = scores.mean()  # computed once; used for the line and its label

    panel.hist(scores, bins=30, color=colors[idx], alpha=0.7, edgecolor='black')
    panel.set_title(f'{trait.capitalize()} Trait Distribution', fontsize=12, fontweight='bold')
    panel.set_xlabel('Score')
    panel.set_ylabel('Frequency')

    # Dashed red vertical line at the trait's mean, labelled in the legend.
    panel.axvline(mean_score, color='red', linestyle='--', linewidth=2,
                  label=f'Mean: {mean_score:.2f}')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Visualize education and experience distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Education distribution: one bar per education level, in value_counts() order
education_counts = df['education'].value_counts()
axes[0].bar(education_counts.index, education_counts.values, color='#9b59b6', alpha=0.7)
axes[0].set_title('Education Level Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Education Level')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=45)

# Experience distribution: histogram with the mean marked by a dashed line.
# The mean is computed once and formatted from a variable. The previous label
# nested single quotes inside a single-quoted f-string
# (f'...{df['experience']...}'), which is a SyntaxError before Python 3.12.
mean_experience = df['experience'].mean()
axes[1].hist(df['experience'], bins=20, color='#f39c12', alpha=0.7, edgecolor='black')
axes[1].set_title('Work Experience Distribution', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Years of Experience')
axes[1].set_ylabel('Frequency')
axes[1].axvline(mean_experience, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {mean_experience:.1f}')
axes[1].legend()

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Wrap the preprocessed frame in a FeatureEngineer and run all its steps.
engineer = FeatureEngineer(df_processed)
df_features, feature_names = engineer.engineer_all_features()

# Summarize what came back: the number of feature names and the shape of
# the feature frame restricted to those columns.
n_features_created = len(feature_names)
feature_matrix_shape = df_features[feature_names].shape

print(f"\nTotal features created: {n_features_created}")
print(f"Feature matrix shape: {feature_matrix_shape}")

In [None]:
# Display feature statistics
# Calls the FeatureEngineer helper with the engineered frame and the list of
# feature names produced by the previous cell.
# NOTE(review): what get_feature_importance_data returns is defined in
# feature_engineering.py, which is not visible here — presumably a per-feature
# summary table; confirm.
feature_stats = engineer.get_feature_importance_data(df_features, feature_names)
print("\nFeature Statistics:")
# Bare last expression so the notebook renders the result as the cell output.
feature_stats

In [None]:
# 2x2 overview of the engineered skill / interest / readiness features.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(skills_ax, diversity_ax), (breadth_ax, readiness_ax) = axes


def _histogram_panel(panel, column, color, xlabel, title):
    """Draw a 30-bin histogram of `df_features[column]` on `panel` with labels."""
    panel.hist(df_features[column], bins=30, color=color, alpha=0.7, edgecolor='black')
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')
    panel.set_title(title, fontweight='bold')


# Top-left: technical skill count against soft skill count, one point per row.
skills_ax.scatter(
    df_features['technical_skills_count'],
    df_features['soft_skills_count'],
    alpha=0.5,
    c='#3498db',
)
skills_ax.set_xlabel('Technical Skills Count')
skills_ax.set_ylabel('Soft Skills Count')
skills_ax.set_title('Technical vs Soft Skills', fontweight='bold')

# Top-right: distribution of the skill diversity score.
_histogram_panel(diversity_ax, 'skill_diversity', '#2ecc71',
                 'Skill Diversity Score', 'Skill Diversity Distribution')

# Bottom-left: number of rows per interest-breadth value, ordered by that value.
interest_counts = df_features['interest_breadth'].value_counts().sort_index()
breadth_ax.bar(interest_counts.index, interest_counts.values, color='#e74c3c', alpha=0.7)
breadth_ax.set_xlabel('Number of Interests')
breadth_ax.set_ylabel('Count')
breadth_ax.set_title('Interest Breadth Distribution', fontweight='bold')

# Bottom-right: distribution of the career readiness score.
_histogram_panel(readiness_ax, 'career_readiness', '#9b59b6',
                 'Career Readiness Score', 'Career Readiness Distribution')

plt.tight_layout()
plt.show()

## 4. Feature Correlation Analysis

In [None]:
# Pairwise correlations between all engineered feature columns.
X = df_features[feature_names]
correlation_matrix = X.corr()

# Heatmap settings gathered in one place: diverging palette centred on zero,
# square cells, thin cell borders, slightly shrunken colour bar, no annotations.
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(16, 14))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)

# Slant the x tick labels and keep the y tick labels horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Find highly correlated features (|correlation| > 0.7)
#
# Walk the strict upper triangle of the correlation matrix (j > i) so each
# unordered feature pair is examined once and the diagonal is skipped.
feature_labels = correlation_matrix.columns
n_labels = len(feature_labels)

high_corr_pairs = [
    {
        'Feature 1': feature_labels[i],
        'Feature 2': feature_labels[j],
        'Correlation': correlation_matrix.iloc[i, j]
    }
    for i in range(n_labels)
    for j in range(i + 1, n_labels)
    if abs(correlation_matrix.iloc[i, j]) > 0.7
]

if high_corr_pairs:
    print("Highly Correlated Feature Pairs (|r| > 0.7):")
    # Rank by absolute correlation so strong negative pairs are listed next to
    # strong positive ones instead of sinking to the bottom of the table.
    high_corr_table = pd.DataFrame(high_corr_pairs).sort_values(
        'Correlation', key=lambda r: r.abs(), ascending=False
    )
else:
    print("No highly correlated feature pairs found (|r| > 0.7)")
    high_corr_table = None

# The table must be the cell's last top-level expression to be rendered.
# Previously the DataFrame was a bare expression inside the `if` branch, so
# only the header line was shown and the pairs were silently discarded.
# When no pair qualifies the value is None and nothing is displayed.
high_corr_table

## 5. Class Imbalance Analysis

In [None]:
# Per-career label counts, as produced by the preprocessor's imbalance helper.
class_distribution = preprocessor.analyze_class_imbalance(y_binary, career_names)
career_counts = class_distribution['count']

# Horizontal bar chart: one bar per career, length = number of samples.
plt.figure(figsize=(12, 6))
plt.barh(class_distribution['career'], career_counts, color='#3498db', alpha=0.7)
plt.xlabel('Number of Samples', fontsize=12)
plt.ylabel('Career', fontsize=12)
plt.title('Career Label Distribution (Class Imbalance)', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Ratio of the largest count to the smallest count.
imbalance_ratio = career_counts.max() / career_counts.min()
print(f"\nImbalance Ratio: {imbalance_ratio:.2f}")

## 6. Feature Importance (Random Forest Based)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Inputs for a quick importance probe: the feature columns as a plain array
# and the target matrix from preprocessing.
X = df_features[feature_names].values
y = y_binary

# One seeded random forest, wrapped so that a separate copy is fitted per
# target column.
rf_settings = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(**rf_settings)
multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

print("Training Random Forest for feature importance analysis...")
multi_rf.fit(X, y)
print("✓ Training complete")

# Average each feature's importance over all fitted per-target estimators.
per_estimator_importances = [fitted.feature_importances_ for fitted in multi_rf.estimators_]
importances = np.mean(per_estimator_importances, axis=0)

# Pair every feature name with its averaged importance, most important first.
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
)
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))

In [None]:
# Horizontal bar chart of the 15 highest-ranked features.
top_features = feature_importance_df.head(15)

plt.figure(figsize=(12, 8))
plt.barh(
    top_features['feature'],
    top_features['importance'],
    color='#2ecc71',
    alpha=0.7,
)
plt.xlabel('Feature Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 15 Feature Importance (Random Forest)', fontsize=14, fontweight='bold')

# Flip the y axis so the first (highest-importance) row is drawn at the top.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Save Processed Data

In [None]:
# Persist the engineered features, targets and metadata for model training.
import os
import joblib

# Ensure the output directory exists (no error if it is already there).
os.makedirs('../data/processed', exist_ok=True)

# Feature columns only, written as CSV without the index column.
X_processed = df_features[feature_names]
X_processed.to_csv('../data/processed/features.csv', index=False)

# Target matrix in NumPy's binary .npy format.
np.save('../data/processed/targets.npy', y_binary)

# Names and sizes bundled together and pickled with joblib.
metadata = dict(
    feature_names=feature_names,
    career_names=career_names,
    n_samples=len(X_processed),
    n_features=len(feature_names),
    n_careers=len(career_names),
)
joblib.dump(metadata, '../data/processed/metadata.pkl')

print("✓ Processed data saved successfully!")
for label, value in (
    ("Features", X_processed.shape),
    ("Targets", y_binary.shape),
    ("Career names", len(career_names)),
):
    print(f"  - {label}: {value}")

## Summary

### Key Findings:

1. **Dataset**: Successfully loaded and preprocessed user profile data
2. **Features**: Engineered 24 meaningful features from raw data
3. **Class Imbalance**: Identified imbalance in career labels (to be handled during model training)
4. **Feature Importance**: Identified top contributing features for career prediction
5. **Correlations**: Analyzed feature relationships to avoid redundancy

### Next Steps:
- Proceed to Task 2: Model Development (notebook 02_model_development.ipynb)
- Train and evaluate multi-label classification models
- Implement confidence scoring system