# n8n Templates CSV Analysis

This notebook reads and analyzes the `n8n_Templates.csv` file to explore the available workflow templates.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries used in later cells.
warnings.filterwarnings('ignore')

# Set plotting style
# The matplotlib 'default' style is applied first, then the seaborn "husl" palette.
# NOTE(review): presumably applying the style afterwards would reset the palette,
# so keep this order — confirm before rearranging.
plt.style.use('default')
sns.set_palette("husl")

## 2. Load the CSV File

In [None]:
# Read the CSV file into `df`, the frame every later cell works on.
# A failed load is reported with a friendly message and then re-raised: swallowing
# the error would leave `df` undefined and only surface later as a confusing
# NameError in an unrelated cell.
try:
    df = pd.read_csv('n8n_Templates.csv')
except FileNotFoundError:
    print("❌ File 'n8n_Templates.csv' not found. Please ensure it's in the same directory as this notebook.")
    raise
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist here
    print("✅ File loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")

## 3. Basic Data Exploration

In [None]:
# Preview the top of the dataset (first five records)
print("\n📋 First 5 rows of the dataset:")
first_rows = df.head(5)
display(first_rows)

In [None]:
# Report the dataset dimensions, then list the columns with 1-based numbering
row_count, column_count = df.shape
print("\n📈 Dataset Info:")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

print("\n📝 Column names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position}. {column_name}")

In [None]:
# Per-column overview: dtype, populated cells, missing cells and missing share (%)
print("\n🔍 Data Types and Missing Values:")

null_counts = df.isnull().sum()
null_share = (null_counts / len(df) * 100).round(2)

df_info = pd.DataFrame({
    'Data Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Null Count': null_counts,
    'Null Percentage': null_share,
})
display(df_info)

## 4. Statistical Summary

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
print("\n📊 Statistical Summary (Numerical Columns):")
numeric_summary = df.describe()
display(numeric_summary)

In [None]:
# Summary for categorical columns
# `categorical_cols` (the object-dtype column labels) is reused by later cells.
print("\n📊 Summary for Categorical Columns:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    unique_count = df[col].nunique()  # computed once and reused for the check below
    print(f"\n📌 {col}:")
    print(f"   Unique values: {unique_count}")
    if unique_count <= 10:  # Only show value counts if not too many unique values
        print("   Value counts:")
        # Print every category: the guard already caps the list at 10 entries,
        # whereas a bare .head() would silently cut it down to 5.
        print(df[col].value_counts())

## 5. Data Visualization

In [None]:
# Histogram grid: one panel per numeric column, at most three panels per row
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_count = len(numerical_cols)

if numeric_count == 0:
    print("No numerical columns found in the dataset.")
else:
    print("\n📈 Distribution of Numerical Columns:")

    # Grid geometry: up to 3 columns, and as many rows as needed (ceiling division)
    grid_cols = min(3, numeric_count)
    grid_rows = -(-numeric_count // grid_cols)

    # squeeze=False always returns a 2-D array of Axes, so a single flatten()
    # handles the 1x1, 1xN and MxN layouts alike
    fig, axes_grid = plt.subplots(
        grid_rows, grid_cols, figsize=(15, 5 * grid_rows), squeeze=False
    )
    axes = axes_grid.flatten()

    # Pair each numeric column with its panel
    for ax, col in zip(axes, numerical_cols):
        df[col].hist(bins=30, ax=ax, alpha=0.7)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused trailing panels of the grid
    for spare_ax in axes[numeric_count:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Bar charts of the most frequent values, one figure per categorical column
# that has between 2 and 20 distinct values
print("\n📊 Top Categories in Categorical Columns:")

for col in categorical_cols:
    distinct_values = df[col].nunique()
    # Skip columns with a single/no value or with too many categories to plot
    if distinct_values <= 1 or distinct_values > 20:
        continue

    plt.figure(figsize=(10, 6))

    # Keep at most the ten leading entries of the value counts
    top_counts = df[col].value_counts().head(10)
    if len(top_counts) == 0:
        continue

    positions = range(len(top_counts))
    bars = plt.bar(positions, top_counts.values, alpha=0.7)
    plt.title(f'Top {len(top_counts)} Categories in {col}')
    plt.xlabel(col)
    plt.ylabel('Count')

    # Write each count centred just above its bar
    for rect, count in zip(bars, top_counts.values):
        label_x = rect.get_x() + rect.get_width() / 2
        plt.text(label_x, rect.get_height(), str(count), ha='center', va='bottom')

    plt.xticks(positions, top_counts.index, rotation=45)
    plt.tight_layout()
    plt.show()

## 6. Data Quality Check

In [None]:
# Check for duplicates
print("\n🔍 Data Quality Check:")
print(f"Number of duplicate rows: {df.duplicated().sum()}")


def _count_cells(frame, predicate):
    """Return how many cells of `frame` satisfy `predicate`.

    `predicate` is called on every individual cell value and must return a bool.
    Works column by column with Series.map, which replaces the deprecated
    DataFrame.applymap and behaves the same on older and newer pandas releases.
    """
    return int(sum(series.map(predicate).sum() for _, series in frame.items()))


# Check for empty strings (non-string cells such as numbers or NaN never match)
empty_strings = _count_cells(df, lambda x: isinstance(x, str) and x == '')
print(f"Number of empty strings: {empty_strings}")

# Check for whitespace-only strings.
# Empty strings are excluded here because they are already reported above;
# counting them in both metrics would overstate the whitespace-only figure.
whitespace_only = _count_cells(
    df, lambda x: isinstance(x, str) and x != '' and x.strip() == ''
)
print(f"Number of whitespace-only strings: {whitespace_only}")

## 7. Sample Data Exploration

In [None]:
# Display random sample of the data
print("\n🎲 Random Sample of 5 Rows:")
# Cap the sample size at the number of rows: sampling 5 rows without replacement
# from a frame that has fewer than 5 raises ValueError.
sample_size = min(5, len(df))
# Fixed random_state keeps the sample identical across re-runs
display(df.sample(sample_size, random_state=42))

In [None]:
# Preview the bottom of the dataset (last five records)
print("\n📋 Last 5 rows of the dataset:")
last_rows = df.tail(5)
display(last_rows)

## 8. Export Cleaned Data (Optional)

In [None]:
# Option to save cleaned data
try:
    save_cleaned = input("\n💾 Do you want to save a cleaned version of the data? (y/n): ")
except (EOFError, NotImplementedError):
    # No interactive stdin is available (for example a headless "Run All"):
    # fall back to "no" so the rest of the notebook still executes.
    # NOTE(review): Jupyter kernels are expected to signal this with a
    # NotImplementedError subclass — confirm for the frontend in use.
    save_cleaned = 'n'

# Tolerate surrounding whitespace and a spelled-out "yes"
if save_cleaned.strip().lower() in ('y', 'yes'):
    # Remove completely empty columns
    cleaned_df = df.dropna(axis=1, how='all')

    # Fill or drop other missing values based on your needs
    # For example: cleaned_df = cleaned_df.fillna('Unknown')

    # Save to new CSV; the timestamp keeps earlier exports from being overwritten
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'n8n_Templates_cleaned_{timestamp}.csv'
    cleaned_df.to_csv(filename, index=False)
    print(f"✅ Cleaned data saved as: {filename}")
else:
    print("❌ Data export cancelled.")

## 9. Summary

In [None]:
# Closing recap: headline counts for the loaded dataset, framed by separator rules
separator = "=" * 50
summary_lines = [
    "\n" + separator,
    "📋 ANALYSIS SUMMARY",
    separator,
    f"📊 Total Records: {df.shape[0]:,}",
    f"📝 Total Columns: {df.shape[1]}",
    f"🔢 Numerical Columns: {len(numerical_cols)}",
    f"📝 Categorical Columns: {len(categorical_cols)}",
    f"⚠️  Missing Values: {df.isnull().sum().sum()}",
    f"🔍 Duplicate Rows: {df.duplicated().sum()}",
    separator,
]
for line in summary_lines:
    print(line)