In [None]:
# EDA for Classification Task
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset.
# The bundled 'iris' sample is used for demonstration; replace it with your own data.
dataset = sns.load_dataset('iris')

# Preview the first rows
print("Dataset Head:")
print(dataset.head())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
dataset.info()

# Number of missing values per column
print("\nMissing Values:")
print(dataset.isnull().sum())

# Handle missing values in the numerical columns (mean imputation).
# NOTE: the imputer is fitted on the full dataset here, i.e. before the
# train/test split further down. For a leakage-free modelling pipeline, fit it
# on the training split only.
numeric_columns = dataset.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
dataset_imputed = pd.DataFrame(
    imputer.fit_transform(dataset[numeric_columns]),
    columns=numeric_columns,
    # Reuse the original row labels: column assignment aligns on the index, so
    # a fresh 0..n-1 index would misplace values (or introduce NaN) whenever
    # `dataset` does not have a default RangeIndex.
    index=dataset.index,
)

# Write the imputed numerical columns back; non-numerical columns are untouched
dataset[numeric_columns] = dataset_imputed

# Check if missing values are handled
print("\nMissing Values After Imputation:")
print(dataset.isnull().sum())

# Report how many rows are exact repeats of an earlier row
print("\nDuplicate Rows:")
duplicate_count = dataset.duplicated().sum()
print(duplicate_count)

# Keep only the first occurrence of every row
dataset = dataset.drop_duplicates(keep='first')

# EDA Visualizations
# 1. Class balance: number of rows per 'species' (the target variable)
sns.countplot(data=dataset, x='species')
plt.title('Distribution of Target Variable')
plt.xlabel('Species')
plt.ylabel('Count')
plt.show()

# 2. Pairwise feature relationships, coloured by target class
sns.pairplot(data=dataset, hue='species')
plt.show()

# 3. Correlation heatmap of the feature columns (target column excluded)
feature_frame = dataset.drop(columns=['species'])
corr_matrix = feature_frame.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,       # print the coefficient inside every cell
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

# 4. Boxplot of 'sepal_length' to spot outliers
sns.boxplot(data=dataset, x='sepal_length')
plt.title('Boxplot for Sepal Length')
plt.show()

# 5. Scatter plot of 'sepal_length' against 'sepal_width', coloured by class
sns.scatterplot(data=dataset, x='sepal_length', y='sepal_width', hue='species')
plt.title('Scatter Plot: Sepal Length vs Sepal Width')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()

# 6. Histogram of 'petal_length' with a kernel density estimate overlaid
sns.histplot(data=dataset, x='petal_length', kde=True)
plt.title('Distribution of Petal Length')
plt.xlabel('Petal Length')
plt.ylabel('Frequency')
plt.show()

# 7. Pairwise feature relationships are already shown by the pairplot in
#    step 2; rendering the identical figure a second time adds no information,
#    so it is not repeated here.

# 8. Encoding a categorical column (if applicable)
# 'species' is a non-numeric label column in the iris demo data; if a later
# step needs numeric inputs it can be one-hot encoded, for example:
# encoder = pd.get_dummies(dataset['species'], drop_first=True)
# dataset = pd.concat([dataset, encoder], axis=1)
# dataset = dataset.drop(columns=['species'])

# Distribution of each numerical feature per target class ('species'):
# one violin plot per numerical column, each drawn on its own figure.
for col in dataset.select_dtypes(include=[np.number]).columns:
    sns.violinplot(data=dataset, x='species', y=col)
    plt.title(f'Violin Plot: {col} vs Species')
    plt.show()

# Train-Test Split
# 'species' is the classification target; every other column is a feature.
X = dataset.drop(columns=['species'])
y = dataset['species']

# Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions the same in both splits; a purely
# random split of a small dataset can leave a class under-represented in the
# test set. (Stratification requires at least two rows per class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standard scaling: fit on the training split only, then apply the same
# transformation to the test split so no test-set statistics leak into it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Final dataset ready for modeling
print("\nTraining Set Shape:", X_train_scaled.shape)
print("Test Set Shape:", X_test_scaled.shape)


## Correlation Matrix

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder

# Load the dataset (the bundled 'iris' sample is used for demonstration;
# replace it with your own data).
dataset = sns.load_dataset('iris')

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
dataset.info()

# Step 1: Pearson correlation between the numerical variables
# Restrict the frame to its numeric columns before correlating
numerical_cols = dataset.select_dtypes(include=[np.number]).columns
correlation_matrix_numerical = dataset.loc[:, numerical_cols].corr(method='pearson')

# Annotated heatmap of the numerical correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix_numerical,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix for Numerical Variables')
plt.show()

# Step 2: Association between categorical variables (Cramér's V, derived from the chi-squared statistic)
# Columns with object or pandas 'category' dtype are treated as categorical
categorical_frame = dataset.select_dtypes(include=[object, 'category'])
categorical_cols = categorical_frame.columns

# Cramér’s V function (for measuring association between categorical variables)
def cramers_v(x, y):
    """Return Cramér's V, a 0-to-1 measure of association between two categorical variables.

    V = sqrt(chi2 / (n * (min(r, c) - 1))), where chi2 is the Pearson
    chi-squared statistic of the r x c contingency table of ``x`` against
    ``y`` and n is the total number of observations.

    Args:
        x: First categorical variable (anything ``pd.crosstab`` accepts,
            e.g. a Series).
        y: Second categorical variable, same length as ``x``.

    Returns:
        Cramér's V. 0.0 is returned when the contingency table has fewer than
        two rows or columns, because the statistic's denominator would be zero.
    """
    # Create contingency table
    contingency_table = pd.crosstab(x, y)

    # min(r, c) - 1; the parentheses matter: the denominator is
    # n * (min(r, c) - 1), not n * min(r, c) - 1.
    min_dim = min(contingency_table.shape) - 1
    if min_dim < 1:
        return 0.0

    # correction=False: Yates' continuity correction (applied by default to
    # 2x2 tables) would keep V below 1 even for perfectly associated binary
    # variables.
    chi2, _p_value, _dof, _expected = chi2_contingency(contingency_table, correction=False)
    n = contingency_table.to_numpy().sum()  # total observations
    return np.sqrt(chi2 / (n * min_dim))

# Square matrix of Cramér's V values, indexed by categorical column on both
# axes. The initial fill value is only a placeholder: every cell is
# overwritten by the loop below.
cramers_v_matrix = pd.DataFrame(np.ones((len(categorical_cols), len(categorical_cols))),
                                index=categorical_cols, columns=categorical_cols)

# Cramér's V is symmetric, so each unordered pair is computed once and the
# value is written to both [var1, var2] and [var2, var1]. Writing only the
# upper triangle would leave the lower triangle at the placeholder 1.0 and
# show false "perfect association" in the heatmap.
for i, var1 in enumerate(categorical_cols):
    for var2 in categorical_cols[i:]:
        association = cramers_v(dataset[var1], dataset[var2])
        cramers_v_matrix.loc[var1, var2] = association
        cramers_v_matrix.loc[var2, var1] = association

# Annotated heatmap of the categorical association matrix
plt.figure(figsize=(8, 6))
cramers_v_values = cramers_v_matrix.astype(float)
sns.heatmap(
    cramers_v_values,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    linewidths=0.5,
)
plt.title('Cramér’s V Correlation Matrix for Categorical Variables')
plt.show()

# Step 3: Pearson correlation after label-encoding the categorical columns
# Each categorical column in `dataset` is overwritten in place with integer
# codes. The single encoder is re-fitted per column, so afterwards it only
# remembers the classes of the last column it processed.
label_encoder = LabelEncoder()
for column_name in categorical_cols:
    encoded_values = label_encoder.fit_transform(dataset[column_name])
    dataset[column_name] = encoded_values

# With every column numeric, correlate the whole frame
correlation_matrix_with_encoded_objects = dataset.corr()

# Annotated heatmap including the encoded columns
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix_with_encoded_objects,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix with Encoded Object Variables')
plt.show()

# Final notes:
# - Numerical correlation (Pearson's) is used for continuous variables.
# - Categorical correlation (Cramér’s V) is used for categorical variables.
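# - For object variables, we can encode them using LabelEncoder and then apply Pearson correlation.
#   Caveat: the integer codes follow the sorted label order, which is arbitrary for nominal
#   categories, so interpret these coefficients with caution (prefer Cramér’s V for such columns).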
