# In [None]:
# EDA for Regression Task
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load your dataset
# For demonstration we use seaborn's bundled 'tips' sample dataset; replace
# this line with your own loader (e.g. pd.read_csv) for real data.
dataset = sns.load_dataset('tips')

# Display basic information about the dataset
print("Dataset Head:")
print(dataset.head())

# Display dataset info to check for missing values, data types, etc.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
dataset.info()

# Check for missing values in the dataset
print("\nMissing Values:")
print(dataset.isnull().sum())

# Handle missing values if necessary (imputation)
# Mean imputation is only meaningful for numeric columns, so select them once
# and reuse the selection for both the transform and the write-back.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
dataset_imputed = pd.DataFrame(
    imputer.fit_transform(dataset[numeric_cols]),
    columns=numeric_cols,
    # Keep the original row labels: the assignment below aligns on index, so a
    # fresh RangeIndex would misalign rows whenever `dataset` is not 0..n-1.
    index=dataset.index,
)

# Merge imputed numerical columns back with categorical data
dataset[numeric_cols] = dataset_imputed

# Check if missing values are handled
# (non-numeric columns are not imputed, so they may still report missing values)
print("\nMissing Values After Imputation:")
print(dataset.isnull().sum())

# Check for duplicate rows
print("\nDuplicate Rows:")
print(dataset.duplicated().sum())

# Drop duplicate rows if any (row labels of the surviving rows are kept as-is)
dataset = dataset.drop_duplicates()

# EDA Visualizations
# 1. Distribution of Target Variable ('total_bill' is the target variable)
sns.histplot(dataset['total_bill'], kde=True)
plt.title('Distribution of Total Bill')
plt.xlabel('Total Bill')
plt.ylabel('Frequency')
plt.show()

# 2. Pairplot to understand relationships between the numerical features
sns.pairplot(dataset[['total_bill', 'tip', 'size']])
plt.show()

# 3. Correlation heatmap to analyze relationships between numerical features
# Correlation is only defined for numeric data. The frame still contains
# categorical columns at this point, and DataFrame.corr() raises on them in
# pandas >= 2.0, so restrict to numeric columns explicitly (works on every
# pandas version, unlike the numeric_only keyword).
corr_matrix = dataset.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

# 4. Boxplot to check for outliers in numerical features
sns.boxplot(x=dataset['total_bill'])
plt.title('Boxplot for Total Bill')
plt.show()

# 5. Scatter plot to visualize relationships between numerical features (e.g., 'total_bill' vs 'tip')
sns.scatterplot(x=dataset['total_bill'], y=dataset['tip'])
plt.title('Scatter Plot: Total Bill vs Tip')
plt.xlabel('Total Bill')
plt.ylabel('Tip')
plt.show()

# 6. Histogram of numerical features (e.g., 'tip')
sns.histplot(dataset['tip'], kde=True)
plt.title('Distribution of Tip')
plt.xlabel('Tip')
plt.ylabel('Frequency')
plt.show()

# 7. Pairplot to explore relationships between multiple features, split by day
# 'day' is categorical, so it cannot be a grid variable; without hue it would
# be ignored and this plot would simply repeat plot 2. Mapping it to colour
# makes the per-day structure visible.
sns.pairplot(dataset[['total_bill', 'tip', 'size', 'day']], hue='day')
plt.show()

# Categorical Features (One-Hot Encoding)
# Every non-numeric column must be encoded, not just 'day': any categorical or
# string column left in the feature matrix makes StandardScaler fail with
# "could not convert string to float".
categorical_cols = dataset.select_dtypes(include=['category', 'object']).columns

# drop_first=True drops one level per feature to avoid perfectly collinear
# dummies; dtype=float keeps the feature matrix purely numeric. Passing a
# DataFrame makes pandas prefix each dummy with its source column
# (e.g. 'day_Sat'), which prevents name clashes between features.
encoder = pd.get_dummies(dataset[categorical_cols], drop_first=True, dtype=float)

# Add encoded columns back to the dataset
dataset = pd.concat([dataset, encoder], axis=1)

# Drop the original categorical columns after encoding
dataset = dataset.drop(columns=categorical_cols)

# Display the updated dataset after One-Hot Encoding
print("\nDataset after One-Hot Encoding:")
print(dataset.head())

# Train-Test Split
# 'total_bill' is the target variable for regression.
# NOTE(review): 'tip' is also excluded from the features, as in the original
# script — presumably to avoid leaking bill-derived information; confirm intent.
X = dataset.drop(columns=['total_bill', 'tip'])
y = dataset['total_bill']

# Split the dataset into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standard Scaling for regression models
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test data influences the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Final dataset ready for modeling
print("\nTraining Set Shape:", X_train_scaled.shape)
print("Test Set Shape:", X_test_scaled.shape)
