# N8N Workflows Analysis

This notebook reads and analyzes the N8N_Workflows.csv file containing various AI automation workflows.

## 1. Import Required Libraries

In [None]:
import re
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot appearance: stock matplotlib style first, then seaborn's "husl" palette
plt.style.use('default')
sns.set_palette(palette="husl")

print("✅ Libraries imported successfully!")

## 2. Load the CSV File

In [None]:
# Read the CSV file into `df`, the frame every later cell works on.
# A failed load prints a friendly hint and then re-raises: swallowing the error
# would leave `df` undefined (or stale from an earlier run) and the next cell
# would fail with a much less helpful message.
try:
    df = pd.read_csv('N8N_Workflows.csv')
except FileNotFoundError:
    print("❌ File 'N8N_Workflows.csv' not found. Please ensure it's in the same directory as this notebook.")
    raise
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise
else:
    print("✅ File loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")
    print(f"📝 Number of workflows: {len(df)}")

## 3. Basic Data Exploration

In [None]:
# Preview the leading rows as a quick sanity check of the load
print("\n📋 First 5 rows of the dataset:")
preview_rows = df.head(5)
display(preview_rows)

In [None]:
# Report the frame's dimensions followed by each column's dtype
row_count, column_count = df.shape

print("\n📈 Dataset Info:")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print("\n📝 Column names and data types:")
print(df.dtypes)

In [None]:
# Per-column overview: dtype, populated cells, missing cells and missing share
print("\n🔍 Data Types and Missing Values:")

null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df) * 100).round(2)

df_info = pd.DataFrame({
    'Data Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Null Count': null_counts,
    'Null Percentage': null_percentages,
})
display(df_info)

## 4. Format Date Column to YYYY-MM-DD

In [None]:
# Format Date column to YYYY-MM-DD (stored back as strings, missing values stay missing)
print("\n📅 Formatting Date column...")
print("Before formatting:")
print(df['Date'].head())

# Parse first and format second, so genuine parse failures can be told apart
# from values that were already missing in the CSV.
# NOTE(review): if the column mixes several date formats, pandas' format
# inference may coerce valid-but-different rows to NaT — confirm on the real data.
raw_dates = df['Date']
parsed_dates = pd.to_datetime(raw_dates, errors='coerce')
already_missing = int(raw_dates.isna().sum())
failed_conversions = int((parsed_dates.isna() & raw_dates.notna()).sum())

# strftime leaves unparsed (NaT) entries as missing values rather than strings
df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

print("\nAfter formatting:")
print(df['Date'].head())

# Report only values that were present but could not be parsed
if failed_conversions > 0:
    print(f"\n⚠️  Warning: {failed_conversions} dates could not be converted and were set to missing (NaN)")
else:
    print("\n✅ All dates converted successfully!")

if already_missing > 0:
    print(f"ℹ️  {already_missing} dates were already missing in the source data")

## 5. Statistical Summary

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with default arguments
print("\n📊 Statistical Summary (Numerical Columns):")
numeric_summary = df.describe()
display(numeric_summary)

In [None]:
# For each object-dtype column: its cardinality, plus the most frequent values
# when the column has at most 15 distinct entries
print("\n📊 Summary for Categorical Columns:")
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    column = df[col]
    unique_count = column.nunique()

    print(f"\n📌 {col}:")
    print(f"   Unique values: {unique_count}")

    # High-cardinality columns would only produce noise, so skip their breakdown
    if unique_count > 15:
        continue

    print("   Top values:")
    print(column.value_counts().head(10))

## 6. Data Visualization

In [None]:
# Bar chart of how many workflows received each FINAL SCORE, plus headline stats
print("\n📈 Distribution of FINAL SCORE:")

final_scores = df['FINAL SCORE']
score_counts = final_scores.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
score_counts.plot(kind='bar', alpha=0.7, ax=ax)
ax.set_title('Distribution of FINAL SCORE')
ax.set_xlabel('Final Score')
ax.set_ylabel('Count')
# Keep the score labels horizontal
plt.setp(ax.get_xticklabels(), rotation=0)
ax.grid(axis='y', alpha=0.3)
plt.show()

print(f"Average FINAL SCORE: {final_scores.mean():.2f}")
print(f"Highest FINAL SCORE: {final_scores.max()}")
print(f"Lowest FINAL SCORE: {final_scores.min()}")

In [None]:
# Bar chart: number of workflows per Origin, most frequent first
print("\n📊 Workflows by Origin:")

origin_counts = df['Origin'].value_counts()
positions = range(len(origin_counts))

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(positions, origin_counts.values, alpha=0.7)
ax.set_title('Number of Workflows by Origin')
ax.set_xlabel('Origin')
ax.set_ylabel('Count')
ax.set_xticks(positions)
ax.set_xticklabels(origin_counts.index, rotation=45)

# Print each bar's count just above it
for rect, n_workflows in zip(bars, origin_counts.values):
    x_center = rect.get_x() + rect.get_width()/2
    ax.text(x_center, rect.get_height(), str(n_workflows), ha='center', va='bottom')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart: the ten authors with the most workflows
print("\n👥 Top Authors by Number of Workflows:")

top_authors = df['Author'].value_counts().head(10)
author_slots = range(len(top_authors))

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(author_slots, top_authors.values, alpha=0.7)
ax.set_title('Top 10 Authors by Number of Workflows')
ax.set_xlabel('Author')
ax.set_ylabel('Number of Workflows')
ax.set_xticks(author_slots)
ax.set_xticklabels(top_authors.index, rotation=45)

# Label every bar with its workflow count
for rect, n_workflows in zip(bars, top_authors.values):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height()
    ax.text(label_x, label_y, str(n_workflows), ha='center', va='bottom')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart: mean FINAL SCORE per Origin, highest average first
print("\n📊 Average FINAL SCORE by Origin:")

mean_scores = df.groupby('Origin')['FINAL SCORE'].mean()
avg_score_by_origin = mean_scores.sort_values(ascending=False)
origin_slots = range(len(avg_score_by_origin))

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(origin_slots, avg_score_by_origin.values, alpha=0.7)
ax.set_title('Average FINAL SCORE by Origin')
ax.set_xlabel('Origin')
ax.set_ylabel('Average FINAL SCORE')
ax.set_xticks(origin_slots)
ax.set_xticklabels(avg_score_by_origin.index, rotation=45)

# Label every bar with its average, rounded to two decimals
for rect, mean_score in zip(bars, avg_score_by_origin.values):
    label_x = rect.get_x() + rect.get_width()/2
    ax.text(label_x, rect.get_height(), f'{mean_score:.2f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

## 7. Text Analysis - Workflow Categories

In [None]:
# Tag workflows with coarse categories by looking for keywords in their titles.
# A title can fall into several categories; within one category it counts once.
print("\n🔍 Analyzing Workflow Categories from Titles:")

# Common AI/automation categories
categories = {
    'AI Agent': ['ai agent', 'agent', 'assistant'],
    'Content Generation': ['content', 'generate', 'writer', 'creator'],
    'Social Media': ['social', 'twitter', 'linkedin', 'instagram', 'tiktok', 'facebook'],
    'Video': ['video', 'reel', 'shorts', 'youtube', 'tiktok'],
    'Email': ['email', 'gmail', 'outlook'],
    'Scraping': ['scrape', 'scraper', 'crawl'],
    'Lead Generation': ['lead', 'prospect', 'outreach'],
    'Sales': ['sales', 'cold', 'crm'],
    'Marketing': ['marketing', 'ad', 'campaign'],
    'Analysis': ['analyze', 'analysis', 'insight', 'report']
}

# Keywords this short are matched as whole words only (optionally pluralised):
# as bare substrings they fire inside unrelated words, e.g. 'ad' in "lead".
SHORT_KEYWORD_MAX_LEN = 3


def _keyword_pattern(keywords):
    """Return a regex alternation matching any of ``keywords`` in a lower-cased title.

    Keywords of up to ``SHORT_KEYWORD_MAX_LEN`` characters must appear as a whole
    word (an optional trailing "s" is accepted); longer keywords keep plain
    substring semantics.
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if len(keyword) <= SHORT_KEYWORD_MAX_LEN:
            alternatives.append(rf'\b{escaped}s?\b')
        else:
            alternatives.append(escaped)
    return '|'.join(alternatives)


# Missing titles become '' so they match nothing instead of raising a TypeError
titles = df['Title'].fillna('').astype(str).str.lower()

# Count the titles that hit at least one keyword of each category
category_counts = {
    category: int(titles.str.contains(_keyword_pattern(keywords), regex=True).sum())
    for category, keywords in categories.items()
}

# Plot category distribution, largest category first
sorted_categories = dict(sorted(category_counts.items(), key=lambda item: item[1], reverse=True))
category_slots = range(len(sorted_categories))

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(category_slots, list(sorted_categories.values()), alpha=0.7)
ax.set_title('Workflow Categories (Based on Title Keywords)')
ax.set_xlabel('Number of Workflows')
ax.set_yticks(category_slots)
ax.set_yticklabels(list(sorted_categories.keys()))

# Add value labels just past the end of each bar
for bar, count in zip(bars, sorted_categories.values()):
    ax.text(bar.get_width() + 0.5, bar.get_y() + bar.get_height()/2,
            str(count), ha='left', va='center')

fig.tight_layout()
plt.show()

## 8. Data Quality Check

In [None]:
# Basic data-quality indicators: duplicate rows, blank text cells, Setup Support values
print("\n🔍 Data Quality Check:")
print(f"Number of duplicate rows: {df.duplicated().sum()}")


def _count_cells(frame, predicate):
    """Count the cells of ``frame`` for which ``predicate(value)`` is true.

    Iterates column by column instead of using ``DataFrame.applymap``, which is
    deprecated in recent pandas releases (renamed to ``DataFrame.map``).
    """
    return sum(1 for _, column in frame.items() for value in column if predicate(value))


# Check for empty strings
empty_strings = _count_cells(df, lambda x: isinstance(x, str) and x == '')
print(f"Number of empty strings: {empty_strings}")

# Check for whitespace-only strings; truly empty strings are excluded here so
# they are not reported twice (they are already counted above)
whitespace_only = _count_cells(
    df, lambda x: isinstance(x, str) and x != '' and x.strip() == ''
)
print(f"Number of whitespace-only strings: {whitespace_only}")

# Check Setup Support column (missing values are shown as their own bucket)
setup_support_stats = df['Setup Support'].value_counts(dropna=False)
print(f"\n📞 Setup Support distribution:")
print(setup_support_stats)

## 9. Top Rated and Most Recent Workflows

In [None]:
# List every workflow whose FINAL SCORE equals 5, with its identifying columns
print("\n🏆 Top Rated Workflows (FINAL SCORE = 5):")
is_top_rated = df['FINAL SCORE'] == 5
top_workflows = df.loc[is_top_rated, ['Title', 'Author', 'Origin', 'Date']]
display(top_workflows)

In [None]:
# Ten rows with the latest Date values, newest first
print("\n🆕 Most Recent Workflows:")
recent_columns = ['Title', 'Author', 'Origin', 'Date', 'FINAL SCORE']
newest_first = df.sort_values(by='Date', ascending=False)
recent_workflows = newest_first.head(10).loc[:, recent_columns]
display(recent_workflows)

## 10. Export Cleaned Data (Optional)

In [None]:
# Optionally export the data: asks for confirmation, then writes two timestamped
# CSV files next to the notebook (the full cleaned frame and the score >= 4 subset).
try:
    save_cleaned = input("\n💾 Do you want to save a cleaned version of the data? (y/n): ")
except (EOFError, NotImplementedError):
    # No interactive stdin is available (e.g. the notebook is executed headlessly
    # via "Run All" tooling); treat that as "no" so the run can finish.
    save_cleaned = 'n'

# Tolerate surrounding whitespace and the spelled-out answer
if save_cleaned.strip().lower() in ('y', 'yes'):
    # Create cleaned dataframe
    cleaned_df = df.copy()

    # Remove completely empty columns if any
    cleaned_df = cleaned_df.dropna(axis=1, how='all')

    # Save to new CSV; the timestamp keeps earlier exports from being overwritten
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'N8N_Workflows_cleaned_{timestamp}.csv'
    cleaned_df.to_csv(filename, index=False)
    print(f"✅ Cleaned data saved as: {filename}")

    # Also save top workflows separately. This selects from `df`, not `cleaned_df`,
    # so the listed columns exist even if one of them was dropped as all-empty.
    top_workflows_filename = f'N8N_Top_Workflows_{timestamp}.csv'
    top_columns = ['Title', 'Description', 'Author', 'Origin', 'FINAL SCORE', 'Date', 'Link']
    df.loc[df['FINAL SCORE'] >= 4, top_columns].to_csv(top_workflows_filename, index=False)
    print(f"✅ Top workflows (score >=4) saved as: {top_workflows_filename}")
else:
    print("❌ Data export cancelled.")

## 11. Summary Statistics

In [None]:
# Closing summary: headline figures, most prolific authors, share per platform.
final_scores = df['FINAL SCORE']

# Missing origins are dropped and the rest cast to str: str.join() raises a
# TypeError as soon as one element is a float NaN.
origin_names = df['Origin'].dropna().astype(str).unique()

# Only populated dates take part in the range
known_dates = df['Date'].dropna()
if known_dates.empty:
    date_range = "N/A"
else:
    date_range = f"{known_dates.min()} to {known_dates.max()}"

print("\n" + "="*60)
print("📋 N8N WORKFLOWS ANALYSIS SUMMARY")
print("="*60)
print(f"📊 Total Workflows: {df.shape[0]:,}")
print(f"📝 Total Columns: {df.shape[1]}")
print(f"⭐ Average FINAL SCORE: {final_scores.mean():.2f}/5")
print(f"🏆 Top Rated Workflows (5/5): {(final_scores == 5).sum()}")
print(f"👥 Unique Authors: {df['Author'].nunique()}")
print(f"🌐 Platforms: {', '.join(origin_names)}")
print(f"📅 Date Range: {date_range}")
print(f"⚠️  Missing Values: {df.isnull().sum().sum()}")
print(f"🔍 Duplicate Rows: {df.duplicated().sum()}")
print("="*60)

# Top 3 authors
top_authors_summary = df['Author'].value_counts().head(3)
print("\n👑 Top 3 Most Prolific Authors:")
for author, count in top_authors_summary.items():
    print(f"   {author}: {count} workflows")

# Platform distribution; percentages are relative to all rows, including rows
# whose Origin is missing
total_workflows = len(df)
print("\n🌐 Workflow Distribution by Platform:")
for origin, count in df['Origin'].value_counts().items():
    percentage = (count / total_workflows * 100)
    print(f"   {origin}: {count} workflows ({percentage:.1f}%)")