In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# Read the CSV file into the DataFrame that the rest of the notebook works on
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')


# Statistical feature analysis 

## Correlation between features with heatmaps

In [None]:
# Compute the pairwise correlation matrix of the numeric columns only.
# DataFrame.corr() raises on string/object columns in pandas >= 2.0, so the
# numeric selection is made explicit instead of relying on the default.
corr = df.select_dtypes(include='number').corr()

# Plot the heatmap. Correlations live in [-1, 1]; pinning the colour scale to
# that range keeps the diverging palette centred on zero correlation.
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()


## Analyze feature scales

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Select the features you want to analyze
features = df[['feature1', 'feature2', 'feature3']]

# Compute the mean and standard deviation of each raw feature
mean = features.mean()
std = features.std()

# Print the mean and standard deviation of each feature
print('Mean:')
print(mean)
print('Standard Deviation:')
print(std)

# Scale the features using StandardScaler from scikit-learn
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Convert the scaled array back to a DataFrame. The original row index is
# carried over so that scaled_df stays aligned with df; without it the frame
# would get a fresh 0..n-1 index.
scaled_df = pd.DataFrame(
    scaled_features,
    columns=features.columns,
    index=features.index,
)

# Compute the mean and standard deviation of the scaled features.
# StandardScaler divides by the population standard deviation (ddof=0),
# whereas pandas defaults to the sample estimate (ddof=1); ddof=0 is used
# here so the check reports 1 for correctly scaled columns.
scaled_mean = scaled_df.mean()
scaled_std = scaled_df.std(ddof=0)

# Print the mean and standard deviation of the scaled features
print('Scaled Mean:')
print(scaled_mean)
print('Scaled Standard Deviation:')
print(scaled_std)


## Analyze feature importance

### Gini importance for trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Split the DataFrame into the feature matrix and the target variable
X = df.drop('target', axis=1)
y = df['target']

# Initialize a Random Forest Classifier. The seed is fixed so that the
# importances (and their ranking) are the same on every run of the notebook.
rf = RandomForestClassifier(random_state=42)

# Fit the model to the data
rf.fit(X, y)

# Get the impurity-based (Gini) feature importances of the fitted forest
importances = rf.feature_importances_

# Positions of the features, ordered from most to least important
sorted_idx = importances.argsort()[::-1]

# Print the feature importances
for i in sorted_idx:
    print(f'Feature {X.columns[i]}: {importances[i]}')


### Permutation feature importance

In [None]:
from sklearn.inspection import permutation_importance

# Rebuild the feature matrix / target pair from the DataFrame
X, y = df.drop(columns='target'), df['target']

# Permutation importances of the already fitted forest `rf`
# (10 repeats per feature, fixed seed)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0)

# Mean importance of each feature over the repeats
importances = result.importances_mean

# Feature positions ordered from the largest to the smallest importance
sorted_idx = np.argsort(importances)[::-1]

# Report the features in that order
for i in sorted_idx:
    print(f'Feature {X.columns[i]}: {importances[i]}')


## Analyze feature generalization

### Analyze with bias-variance tradeoff

In [None]:
from sklearn.model_selection import learning_curve
import numpy as np

# Split the DataFrame into the feature matrix and the target variable
X = df.drop('target', axis=1)
y = df['target']

# Define the model you want to analyze. The seed is fixed so that the
# learning curves are reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)

# Compute the learning curves: 5-fold cross-validation at ten training-set
# sizes between 10% and 100% of the available training data
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)

# Aggregate over the folds (axis 1): mean and spread per training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot the learning curves with a +/- one standard deviation band
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Validation Score')
ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.1, color='r')
ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.1, color='g')
ax.set_xlabel('Number of Training Samples')
ax.set_ylabel('Score')
ax.set_title('Learning Curves')
ax.legend(loc='best')
plt.show()



## Analyze feature outliers

### Z-score

In [None]:
# Calculate the Z-score for each numeric feature. Non-numeric columns have no
# mean/std, so they are left out of the computation.
numeric_df = df.select_dtypes(include='number')
z_scores = (numeric_df - numeric_df.mean()) / numeric_df.std()

# Keep every row in which at least one feature lies more than 3 standard
# deviations from its mean. (Indexing df with the boolean frame and calling
# dropna() would only keep rows in which *every* column is an outlier.)
outliers = df[(z_scores.abs() > 3).any(axis=1)]

# Print the outliers
print(outliers)

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Specify the features you want to analyze
X = df[['feature1', 'feature2', 'feature3']]

# Initialize and fit DBSCAN.
# NOTE(review): eps is a distance in the features' own units, so eps=0.5 is
# probably only meaningful on comparably scaled features -- confirm for your data.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X)

# Get the cluster label assigned to each data point; DBSCAN labels the
# points it considers noise (i.e. the outliers) with -1
labels = dbscan.labels_
labels

# Label Analysis

## Analyze data imbalance

In [None]:
from sklearn.utils import resample

# Extract the feature matrix X and the target vector y
X = df.drop('target', axis=1)
y = df['target']

# Count the occurrences of each class in the target vector
class_counts = y.value_counts()

# Print the class counts
print('Class Counts:')
print(class_counts)

# Compute the class imbalance ratio (rarest class / most frequent class)
imbalance_ratio = class_counts.min() / class_counts.max()

# Print the class imbalance ratio
print('Class Imbalance Ratio:')
print(imbalance_ratio)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the most and least frequent class within the training split,
# since that is the set being balanced. value_counts() sorts by descending
# frequency, so the first label is the majority and the last the minority;
# taking both ends also yields two distinct labels when the counts are tied.
train_class_counts = y_train.value_counts()
majority_class = train_class_counts.index[0]
minority_class = train_class_counts.index[-1]

# Upsample the minority class (sampling with replacement) until it matches
# the majority class. Only these two classes end up in df_train_balanced;
# with more than two classes the remaining ones are not included.
df_train = pd.concat([X_train, y_train], axis=1)
df_minority = df_train[df_train['target'] == minority_class]
df_majority = df_train[df_train['target'] == majority_class]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_train_balanced = pd.concat([df_majority, df_minority_upsampled])

# Count the occurrences of each class in the balanced training set
class_counts_balanced = df_train_balanced['target'].value_counts()

# Print the class counts in the balanced training set
print('Class Counts (Balanced Training Set):')
print(class_counts_balanced)
