---
## 1. Import Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Utilities
import glob
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
%matplotlib inline

# Plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Libraries imported successfully!")

---
## 2. Load Dataset

Load all UNSW-NB15 CSV files and combine them into a single DataFrame.

In [None]:
# TODO: Load all CSV files from data/ directory
# The path is relative to the kernel's working directory; sorting keeps the
# concatenation order (and therefore the row order) stable between runs.
# data_files = sorted(glob.glob('../data/UNSW-NB15_*.csv'))
# # pd.concat raises an opaque "No objects to concatenate" on an empty list,
# # so fail early with a message that names the real problem.
# if not data_files:
#     raise FileNotFoundError("No files match ../data/UNSW-NB15_*.csv")
# df = pd.concat([pd.read_csv(file) for file in data_files], ignore_index=True)
# NOTE(review): the wildcard picks up every CSV that shares the UNSW-NB15_ prefix,
# including any non-data files stored next to the parts -- check data_files first.
# NOTE(review): pd.read_csv treats the first row as a header; verify the part files
# actually have one, otherwise pass header=None and names=[...] -- TODO confirm.

# TODO: Display basic info
# print(f"Total records: {len(df):,}")
# print(f"Total features: {df.shape[1]}")
# df.head()

---
## 3. Initial Data Inspection

In [None]:
# TODO: Check data types
# Watch for numeric-looking columns reported as `object`: that usually means the
# column holds placeholder strings or mixed types and needs cleaning first.
# df.info()

In [None]:
# TODO: Check for missing values
# missing = df.isnull().sum()
# # Keep only the columns that contain at least one null, worst offenders first.
# missing[missing > 0].sort_values(ascending=False)
# NOTE(review): isnull() only detects NaN/None. If the raw files mark missing
# values with placeholder strings (e.g. '-' or blanks), those must be counted
# separately -- TODO confirm against the data.

In [None]:
# TODO: Summary statistics for numerical features
# Transposed so that each feature is a row, which is easier to scan when the
# frame has many columns.
# df.describe().T

In [None]:
# TODO: Check categorical features
# categorical_cols = df.select_dtypes(include=['object']).columns
# print(f"Categorical features: {list(categorical_cols)}")

# # For each categorical feature: its cardinality and its five most frequent values.
# # value_counts() leaves NaN out by default; pass dropna=False to see missing values too.
# for col in categorical_cols:
#     print(f"\n{col}: {df[col].nunique()} unique values")
#     print(df[col].value_counts().head())

---
## 4. Univariate Analysis & Outlier Detection

Analyze individual features and detect outliers using the IQR method. (Z-score detection is also planned, but this section has no cell for it yet — TODO.)

In [None]:
# TODO: Select numerical columns for analysis
# numerical_cols is reused by the outlier, distribution, bivariate and
# correlation sections below, so define it once here.
# numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# # Remove id and label columns if present: 'label' is the target (0=Normal, 1=Attack)
# # and 'id' is presumably a row identifier, so neither is a feature -- TODO confirm 'id'.
# numerical_cols = [col for col in numerical_cols if col not in ['id', 'label']]

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(data, column, k=1.5):
    """Count the outliers of one column using the interquartile-range rule.

    A value is an outlier when it lies below ``Q1 - k * IQR`` or above
    ``Q3 + k * IQR``, where Q1/Q3 are the 25th/75th percentiles of the column
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of a numeric column in ``data``.
    k : float, default 1.5
        Fence multiplier; 1.5 is the conventional box-plot whisker length.

    Returns
    -------
    tuple
        ``(outlier_count, lower_bound, upper_bound)``. Missing values are never
        counted because comparisons against NaN evaluate to False. For an
        all-NaN or empty column both bounds are NaN and the count is 0.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Count through a boolean mask instead of materialising a filtered copy of
    # the whole frame just to take its length.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum()), lower_bound, upper_bound

# # Apply to the first 10 numerical columns as an example; drop the [:10] slice
# # to cover every numerical column.
# # Needs df, numerical_cols and the detect_outliers_iqr definition above.
# outlier_summary = {}
# for col in numerical_cols[:10]:
#     count, lb, ub = detect_outliers_iqr(df, col)
#     # Keep the fences too, so the table shows what counted as an outlier.
#     outlier_summary[col] = {'count': count, 'percentage': (count/len(df))*100,
#                             'lower_bound': lb, 'upper_bound': ub}

# # Transposed: one row per feature, one column per statistic.
# pd.DataFrame(outlier_summary).T

In [None]:
# TODO: Visualize distributions with box plots
# fig, axes = plt.subplots(3, 3, figsize=(15, 12))
# axes = axes.ravel()  # flatten the 3x3 grid so one index addresses each panel

# # First nine numerical features, one box plot per panel.
# for idx, col in enumerate(numerical_cols[:9]):
#     sns.boxplot(data=df, y=col, ax=axes[idx])
#     axes[idx].set_title(f'{col} - Outlier Detection')
# # Hide the panels left over when there are fewer than nine numerical features.
# for unused_ax in axes[len(numerical_cols[:9]):]:
#     unused_ax.set_visible(False)

# plt.tight_layout()
# # savefig does not create missing folders: make sure ../outputs/ exists first,
# # e.g. os.makedirs('../outputs', exist_ok=True) (needs `import os`).
# plt.savefig('../outputs/univariate_boxplots.png', dpi=300, bbox_inches='tight')
# plt.show()

In [None]:
# TODO: Distribution plots (histograms)
# fig, axes = plt.subplots(3, 3, figsize=(15, 12))
# axes = axes.ravel()  # flatten the 3x3 grid so one index addresses each panel

# # First nine numerical features; NaN values are dropped before binning.
# for idx, col in enumerate(numerical_cols[:9]):
#     axes[idx].hist(df[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
#     axes[idx].set_title(f'{col} Distribution')
#     axes[idx].set_xlabel(col)
#     axes[idx].set_ylabel('Frequency')
# NOTE(review): heavily skewed features may collapse into a single bar; consider
# passing log=True to hist() for those columns.

# plt.tight_layout()
# # savefig does not create missing folders: make sure ../outputs/ exists first,
# # e.g. os.makedirs('../outputs', exist_ok=True) (needs `import os`).
# plt.savefig('../outputs/univariate_histograms.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 5. Bivariate Analysis

Explore relationships between features and the target variable.

In [None]:
# TODO: Compare feature distributions by label (Normal vs Attack)
# sample_features = numerical_cols[:6]
# fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# axes = axes.ravel()

# for idx, col in enumerate(sample_features):
#     # Shared bin edges: with bins=30 each class would get its own edges computed
#     # from its own value range, so the overlaid bars would not line up.
#     bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=30)
#     for label in df['label'].unique():
#         subset = df[df['label'] == label][col].dropna()
#         # density=True compares the shape of each class; raw counts would be
#         # dominated by whichever class has more rows.
#         axes[idx].hist(subset, alpha=0.6, label=f'Label {label}', bins=bin_edges, density=True)
#     axes[idx].set_title(f'{col} by Label')
#     axes[idx].legend()

# plt.tight_layout()
# # savefig does not create missing folders: make sure ../outputs/ exists first,
# # e.g. os.makedirs('../outputs', exist_ok=True) (needs `import os`).
# plt.savefig('../outputs/bivariate_analysis.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 6. Correlation Analysis

Generate a correlation heatmap to identify multicollinearity.

In [None]:
# TODO: Compute correlation matrix
# # DataFrame.corr() uses Pearson correlation by default; a constant column has
# # zero variance, so its row and column come out as NaN.
# correlation_matrix = df[numerical_cols].corr()

# # Plot heatmap
# # center=0 puts the neutral colour of the diverging palette at zero correlation;
# # annot=False keeps the cells unlabelled.
# plt.figure(figsize=(20, 16))
# sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0, 
#             square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
# plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
# plt.tight_layout()
# # savefig does not create missing folders: make sure ../outputs/ exists first,
# # e.g. os.makedirs('../outputs', exist_ok=True) (needs `import os`).
# plt.savefig('../outputs/correlation_heatmap.png', dpi=300, bbox_inches='tight')
# plt.show()

In [None]:
# TODO: Find highly correlated feature pairs (|correlation| > 0.8)
# high_corr_pairs = []
# # j starts at i+1, so only the upper triangle is visited: each pair is reported
# # once and a feature is never compared with itself.
# for i in range(len(correlation_matrix.columns)):
#     for j in range(i+1, len(correlation_matrix.columns)):
#         # NaN correlations (e.g. constant columns) fail this test and are skipped.
#         if abs(correlation_matrix.iloc[i, j]) > 0.8:
#             high_corr_pairs.append({
#                 'Feature 1': correlation_matrix.columns[i],
#                 'Feature 2': correlation_matrix.columns[j],
#                 'Correlation': correlation_matrix.iloc[i, j]
#             })

# # Name the columns explicitly: if no pair exceeds the threshold the list is empty,
# # pd.DataFrame([]) has no 'Correlation' column, and sort_values raises KeyError.
# # Note this sorts by signed value, so strong negative correlations end up last;
# # add key=abs to rank by magnitude instead.
# pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation']).sort_values('Correlation', ascending=False)

---
## 7. Target Variable Distribution

In [None]:
# TODO: Check class balance
# # sort_index() pins the order to label 0, then label 1. A bare value_counts()
# # orders by frequency, which would swap the hard-coded 'Normal'/'Attack' tick
# # labels, pie labels and colours whenever label 1 is the larger class.
# label_counts = df['label'].value_counts().sort_index()
# print("Target Variable Distribution:")
# print(label_counts)
# # Attack rows per normal row, as a percentage (100% means perfectly balanced).
# # Raises KeyError if either class is absent from the data.
# print(f"\nClass Balance: {label_counts[1]/label_counts[0]:.2%} (Attack/Normal)")

# # Visualization
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# # Count plot
# label_counts.plot(kind='bar', ax=ax1, color=['#2ecc71', '#e74c3c'])
# ax1.set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
# ax1.set_xlabel('Label (0=Normal, 1=Attack)')
# ax1.set_ylabel('Count')
# ax1.set_xticklabels(['Normal', 'Attack'], rotation=0)

# # Pie chart
# ax2.pie(label_counts, labels=['Normal', 'Attack'], autopct='%1.1f%%', 
#         colors=['#2ecc71', '#e74c3c'], startangle=90)
# ax2.set_title('Class Distribution Percentage', fontsize=14, fontweight='bold')

# plt.tight_layout()
# # savefig does not create missing folders: make sure ../outputs/ exists first,
# # e.g. os.makedirs('../outputs', exist_ok=True) (needs `import os`).
# plt.savefig('../outputs/target_distribution.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 8. Key Insights & Next Steps

### Summary of Findings:
- **Dataset Size:** [Fill in after loading]
- **Missing Values:** [Summarize]
- **Outliers:** [Summarize percentage]
- **Class Balance:** [Indicate if imbalanced]
- **Multicollinearity:** [List highly correlated features]

### Action Items for Next Notebook:
1. Handle missing values (imputation or removal)
2. Address outliers (capping, transformation, or removal)
3. Feature engineering (new features, interactions)
4. Encode categorical variables
5. Feature scaling/normalization
6. Handle class imbalance if needed

---
**Proceed to:** `02_preprocessing_feature_engineering.ipynb`