# Getting Started with Data Analytics

This notebook demonstrates basic usage of the data analytics modules in this repository.

## 1. Loading Data

First, let's load a sample dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the sample employee data.
# The path is relative, so it is resolved against the kernel's current
# working directory.
DATA_PATH = '../datasets/employee_data.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast if a column that later cells rely on is absent; otherwise the
# problem would only surface as a KeyError several cells further down.
REQUIRED_COLUMNS = ['age', 'salary', 'years_experience',
                    'performance_rating', 'department']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing expected columns: {missing_columns}")

print("Dataset shape:", df.shape)
# Last expression of the cell: rendered as a rich table preview.
df.head()

## 2. Data Exploration

Let's explore the dataset: summary statistics, missing values, and column data types.

In [None]:
# Summary statistics as produced by DataFrame.describe()
print("\nBasic Statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count of missing entries in each column
print("\nMissing Values:")
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# dtype of each column
print("\nData Types:")
column_types = df.dtypes
print(column_types)

## 3. Data Visualization

Create some visualizations to understand the data better.

In [None]:
# Histogram of the salary column (20 bins)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['salary'], bins=20, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Salaries', fontsize=16)
ax.set_xlabel('Salary')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Mean salary per department, sorted from highest to lowest
dept_salary = (
    df.groupby('department')['salary']
    .mean()
    .sort_values(ascending=False)
)

# Draw the bars on an explicit Axes instead of relying on pyplot's current axes
fig, ax = plt.subplots(figsize=(10, 6))
dept_salary.plot(kind='bar', color='skyblue', edgecolor='navy', ax=ax)
ax.set_title('Average Salary by Department', fontsize=16)
ax.set_xlabel('Department')
ax.set_ylabel('Average Salary')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Scatter of salary against years of experience
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['years_experience'], df['salary'], alpha=0.6, s=50)
ax.set_title('Years of Experience vs Salary', fontsize=16)
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Salary')
ax.grid(True, alpha=0.3)
plt.show()

## 4. Statistical Analysis

Perform some basic statistical analysis.

In [None]:
# Pairwise correlations between the selected numerical columns
numerical_cols = ['age', 'salary', 'years_experience', 'performance_rating']
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap with the colormap centered on zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Per-department salary summary: mean, median, std, min and max
print("\nSalary Statistics by Department:")
salary_by_department = df.groupby('department')['salary']
salary_by_department.agg(['mean', 'median', 'std', 'min', 'max'])

## 5. Data Filtering and Aggregation

Filter and aggregate data to answer specific questions.

In [None]:
# Find high performers (rating >= HIGH_PERFORMER_MIN_RATING).
# The cut-off is named so it is easy to find and adjust.
HIGH_PERFORMER_MIN_RATING = 4

high_performers = df[df['performance_rating'] >= HIGH_PERFORMER_MIN_RATING]
print(f"\nNumber of high performers: {len(high_performers)}")

# mean() yields NaN when the selection is empty or every salary is missing;
# report that explicitly instead of printing "$nan".
high_performer_avg_salary = high_performers['salary'].mean()
if pd.isna(high_performer_avg_salary):
    print("Average salary of high performers: n/a (no salary data)")
else:
    print(f"Average salary of high performers: ${high_performer_avg_salary:.2f}")

In [None]:
# Employees with high experience and high salary.
# Both comparisons are strict (>), matching the thresholds named here so they
# can be tuned in one place.
EXPERIENCE_THRESHOLD_YEARS = 10
SALARY_THRESHOLD = 80_000

is_experienced = df['years_experience'] > EXPERIENCE_THRESHOLD_YEARS
is_high_earner = df['salary'] > SALARY_THRESHOLD
experienced_high_earners = df[is_experienced & is_high_earner]
print(f"\nExperienced high earners: {len(experienced_high_earners)}")
# Last expression of the cell: preview of the matching rows.
experienced_high_earners.head()

## 6. Summary

This notebook demonstrated:
- Loading and exploring data
- Creating visualizations
- Performing statistical analysis
- Filtering and aggregating data

For more examples, explore the other modules in this repository!