In [None]:
# 📊 Exploratory Data Analysis (EDA) — Step-by-Step Guide
# EDA is the process of understanding, cleaning, visualizing, and preparing your data for modeling or decision-making. Here's how to do it:

In [2]:
# ✅ 1. Load and Understand the Data

import pandas as pd

# Location of the input CSV; replace the placeholder with the path to a real
# file, otherwise read_csv raises FileNotFoundError.
DATA_PATH = 'your_dataset.csv'

# Read the whole file into the DataFrame that every later cell works on.
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

In [None]:
# 🔍 Inspect the Data
#
# A notebook cell only renders the value of its final expression, so every
# view is handed to display() explicitly; written as bare expressions, all
# lines except the last would be evaluated and silently thrown away.

display(df.head())    # First 5 rows
display(df.tail())    # Last 5 rows
display(df.shape)     # (rows, columns)
display(df.columns)   # Column names
display(df.dtypes)    # Data types of each column
df.info()             # Summary of dataset (prints directly, returns None)

In [None]:
# ✅ 2. Check for Missing (Null/NaN) Values
# Why?
# Missing values can distort analysis or cause errors in modeling.
#
# Each result is passed to display() because a notebook cell renders only its
# final expression; the earlier ones would otherwise be computed and discarded.

display(df.isnull().sum())     # Count of nulls per column
display(df.isnull().mean())    # Proportion of nulls per column

# View rows with a missing value in one column.
# 'column' is a placeholder: substitute a real column name from df.columns.
df[df['column'].isnull()]


In [1]:
# ✅ 3. Handle Missing Values
# 🔹 Numerical Columns

# Fill with mean or median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['col']: that form mutates a temporary
# Series, emits a chained-assignment FutureWarning on recent pandas and has
# no effect on df once Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].mean())
df['score'] = df['score'].fillna(df['score'].median())

# Drop rows/columns
# Minimum number of non-null values a column needs in order to be kept.
# This is an absolute count: on a frame with fewer rows than this, every
# column is dropped, so tune it to the size of the dataset.
MIN_NON_NULL = 100

# Columns are pruned BEFORE rows. Once every row containing a NaN has been
# removed, no column has any NaN left, so a later column-wise threshold could
# never single out a sparse column.
df.dropna(axis=1, thresh=MIN_NON_NULL, inplace=True)  # Drop columns with too many NaNs

# Removes every remaining incomplete row, including rows whose only gap is in
# a column that has not been filled yet.
df.dropna(axis=0, inplace=True)      # Drop rows with any NaN


NameError: name 'df' is not defined

In [None]:
# 🔹 Categorical Columns

# Fill with mode (most frequent)
# mode() returns a Series that is empty when the column holds no non-null
# values, so the fill is skipped in that case instead of failing on [0].
gender_modes = df['gender'].mode()
if not gender_modes.empty:
    # Assign back rather than using fillna(..., inplace=True) on df['gender']:
    # the in-place form operates on a temporary Series, triggers a
    # chained-assignment FutureWarning on recent pandas and leaves df
    # unchanged under Copy-on-Write.
    df['gender'] = df['gender'].fillna(gender_modes.iloc[0])


In [None]:
# ✅ 4. Data Type Conversion
# Why?
# Appropriate dtypes use less memory and make type-specific analysis
# (e.g., datetime plots) more efficient.

# Parse the string column into datetime values, then store it back
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Re-encode the labels as a categorical (saves memory), then store it back
gender_as_category = df['gender'].astype('category')
df['gender'] = gender_as_category


In [None]:
# ✅ 5. Understand Categorical Variables
#
# display() is used for each result because a notebook cell shows only its
# last expression; the two value_counts() calls would otherwise be discarded.

display(df['gender'].value_counts())                 # Frequency count
display(df['country'].value_counts(normalize=True))  # Proportion

# How many unique values, and which ones?
display(df['gender'].nunique())
display(df['gender'].unique())


In [None]:
# 🔹 Encoding Categorical Variables

# Label Encoding (for ordinal)
# Any value that is not a key of the mapping becomes NaN.
GRADE_ORDER = {'Low': 1, 'Medium': 2, 'High': 3}
df['grade'] = df['grade'].map(GRADE_ORDER)

# One-Hot Encoding (for nominal)
# The indicator columns are appended next to the original 'country' and
# 'gender' columns instead of replacing them: later cells still group, count
# and colour by those columns, and get_dummies(df, columns=[...]) would remove
# them from df and make those cells fail with KeyError.
# drop_first=True omits the first level of each variable.
dummy_columns = pd.get_dummies(
    df[['country', 'gender']],
    columns=['country', 'gender'],
    drop_first=True,
)
df = pd.concat([df, dummy_columns], axis=1)


In [None]:
# ✅ 6. Outlier Detection

# display() is needed here: a bare expression that is not the last line of a
# notebook cell produces no output.
display(df['salary'].describe())
df['salary'].plot.box()

# Using IQR
# Width of the fences in units of the interquartile range.
IQR_MULTIPLIER = 1.5

Q1 = df['salary'].quantile(0.25)
Q3 = df['salary'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - IQR_MULTIPLIER * IQR
upper_fence = Q3 + IQR_MULTIPLIER * IQR

# between() is inclusive on both ends, matching >= lower and <= upper.
# Rows whose salary is NaN do not satisfy the comparison and are removed too.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning.
df = df[df['salary'].between(lower_fence, upper_fence)].copy()

In [None]:
# ✅ 7. Feature Engineering

# Derive a new column: weight divided by the square of (height / 100)
# NOTE(review): the /100 suggests height is in centimetres and weight in
# kilograms - confirm against the dataset.
height_scaled = df['height'] / 100
df['bmi'] = df['weight'] / height_scaled ** 2

# Split the datetime column into one column per calendar component
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(df['date'].dt, date_part)


In [None]:
# ✅ 8. Grouping and Aggregation
#
# The grouped results are passed to display() because only the final
# expression of a notebook cell is rendered automatically.

display(df.groupby('gender')['salary'].mean())               # Mean salary per gender
display(df.groupby(['country', 'gender'])['age'].median())   # Median age per country and gender

# Pivot Table: total sales with one row per region and one column per product
df.pivot_table(values='sales', index='region', columns='product', aggfunc='sum')

In [None]:
# ✅ 9. Correlation Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations, restricted to the numeric columns
corr_matrix = df.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on a 10x6 inch canvas
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)


In [None]:
# ✅ 10. Visualizations for EDA
#
# Each plot gets its own axes. Without an explicit ax, all three seaborn calls
# draw onto the same current axes and end up overlaid in a single chart.
fig, (ax_age, ax_salary, ax_gender) = plt.subplots(1, 3, figsize=(18, 5))

# Numerical
sns.histplot(df['age'], kde=True, ax=ax_age)
ax_age.set_title('Age distribution')

sns.boxplot(x=df['salary'], ax=ax_salary)
ax_salary.set_title('Salary spread')

# Categorical
sns.countplot(x='gender', data=df, ax=ax_gender)
ax_gender.set_title('Rows per gender')

fig.tight_layout()
plt.show()

In [None]:
# 🔹 Bivariate/Multivariate Analysis
#
# The axes-level plots (scatterplot, boxplot) are given their own figure.
# Relying on the implicit current axes would put the boxplot inside one of the
# pairplot panels, because pairplot leaves its own figure as the current one.

# Scatterplot
scatter_fig, scatter_ax = plt.subplots()
sns.scatterplot(x='age', y='salary', hue='gender', data=df, ax=scatter_ax)
scatter_ax.set_title('Salary vs. age by gender')

# Pairplot (figure-level function: it creates and manages its own figure)
sns.pairplot(df[['age', 'salary', 'experience']])

# Grouped Boxplots
box_fig, box_ax = plt.subplots()
sns.boxplot(x='education', y='salary', data=df, ax=box_ax)
box_ax.set_title('Salary by education')

plt.show()

In [None]:
# ✅ 11. Reshape or Clean Specific Columns
# Example: Clean duration = "90 min" or "2 Seasons"

# First run of digits in the text, as a float (NaN when there is none).
# - The pattern is a raw string: '\d' in a normal string literal is an invalid
#   escape sequence that newer Python versions warn about.
# - expand=False returns a Series rather than a one-column DataFrame.
# NOTE: the number is taken regardless of unit, so for "2 Seasons" this column
# holds 2 (a season count, not minutes); use is_season to tell them apart.
df['duration_mins'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)

# na=False maps missing durations to False so the flag is a clean boolean
# column instead of an object column mixing True/False with NaN.
df['is_season'] = df['duration'].str.contains('Season', na=False)


In [None]:
# ✅ 12. Export the Cleaned Data
# Destination file for the processed frame; index=False leaves the row index
# out of the written CSV.
CLEANED_DATA_PATH = 'cleaned_data.csv'
df.to_csv(CLEANED_DATA_PATH, index=False)

In [None]:
# ✅ Summary: EDA Checklist

# - Load and understand the data
# - Check for missing values
# - Handle missing values
# - Convert data types
# - Understand categorical variables
# - Detect and handle outliers 
# - Feature engineering
# - Grouping and aggregation
# - Correlation analysis
# - Visualizations for EDA
# - Reshape or clean specific columns
# - Export the cleaned data

# | Task           | Tools/Methods                          |
# | -------------- | -------------------------------------- |
# | Load data      | `pd.read_csv()`                        |
# | View structure | `.info()`, `.head()`                   |
# | Missing values | `.isnull().sum()`, `.fillna()`         |
# | Data types     | `.dtypes`, `astype()`, `to_datetime()` |
# | Categorical    | `.value_counts()`, `get_dummies()`     |
# | Outliers       | `.describe()`, `boxplot()`, IQR        |
# | New features   | Create/transform columns               |
# | Aggregation    | `.groupby()`, `.pivot_table()`         |
# | Correlation    | `.corr()`, `sns.heatmap()`             |
# | Visualization  | `sns`, `plt`                           |
# | Save results   | `to_csv()`                             |
