# 🚀 Machine Learning Analysis
## Data Mining - Product Sales Analysis
### Preprocessing, Clustering & Regression

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from preprocessing import DataPreprocessor
from kmeans import KMeans, ElbowAnalyzer
from regression import RegressionAnalyzer
from visualization import Visualizer

## Load Data

In [None]:
# Load the raw product sales table (path is relative to the working directory).
df = pd.read_csv('product_sales.csv')
print(f'Dataset shape: {df.shape}')

# Show a preview of the rows and the inferred column dtypes, each under its
# own heading preceded by a blank line.
for heading, preview in (('First 5 rows:', df.head()), ('Data types:', df.dtypes)):
    print(f'\n{heading}')
    print(preview)

## Data Preprocessing (25%)

In [None]:
# Hand the raw frame to the project's DataPreprocessor and run its pipeline.
# The result is bound to a separate name so the next cell can print the
# original and preprocessed shapes side by side.
# NOTE(review): whether preprocess() also mutates `df` in place is not visible
# from this notebook — confirm before treating `df` as the untouched original.
preprocessor = DataPreprocessor(df)
preprocessed_df = preprocessor.preprocess()

In [None]:
# Confirm completion and show how preprocessing changed the table dimensions.
print('\n✓ Preprocessing Complete!')
shape_report = (
    ('Original shape', df.shape),
    ('Preprocessed shape', preprocessed_df.shape),
)
for label, shape in shape_report:
    print(f'{label}: {shape}')

## K-Means Clustering (40%)

In [None]:
# Columns fed to K-means, in the order later cells index them
# (columns 0 and 1 are price and cost).
clustering_columns = ['price', 'cost', 'units_sold', 'promotion_frequency']
X = preprocessed_df[clustering_columns].values
print(f'Features for clustering: {X.shape}')

In [None]:
# Candidate cluster counts to evaluate: k = 2 through 8 inclusive.
candidate_ks = range(2, 9)

# Run the elbow analysis over every candidate k, then ask the analyzer which
# k it selected. The second return value exposes `.labels` and `.centroids`
# in later cells — presumably the fitted KMeans for the selected k.
analyzer = ElbowAnalyzer(X, k_range=candidate_ks)
analyzer.analyze()
optimal_k, best_kmeans = analyzer.get_optimal_k()

In [None]:
# Plot WCSS and silhouette score against each evaluated k, marking the chosen k.
k_values = list(analyzer.k_range)
fig = Visualizer.plot_elbow_curve(k_values, analyzer.wcss_values, analyzer.silhouette_scores, optimal_k)
plt.show()

In [None]:
# Work on a copy so the cluster assignment column does not leak into
# `preprocessed_df`, which the regression cells reuse.
cluster_df = preprocessed_df.copy()
cluster_df['Cluster'] = best_kmeans.labels

banner = '=' * 60
print('\n' + banner)
print('CLUSTER STATISTICS')
print(banner)

# Per-cluster size and mean price / units sold / profit.
for cluster_id in range(optimal_k):
    in_cluster = cluster_df['Cluster'] == cluster_id
    members = cluster_df[in_cluster]
    avg_price = members["price"].mean()
    avg_units = members["units_sold"].mean()
    avg_profit = members["profit"].mean()

    print(f'\nCluster {cluster_id}:')
    print(f'  Size: {len(members)} products')
    print(f'  Avg Price: ${avg_price:.2f}')
    print(f'  Avg Units Sold: {avg_units:.0f}')
    print(f'  Avg Profit: ${avg_profit:.2f}')

In [None]:
# Scatter the clusters using only the first two feature columns (price, cost),
# slicing the centroids the same way so they land in the same 2-D plane.
points_2d = X[:, :2]
centroids_2d = best_kmeans.centroids[:, :2]
axis_labels = ['Price', 'Cost']

fig = Visualizer.plot_clusters(points_2d, best_kmeans.labels, centroids_2d, axis_labels)
plt.show()

## Regression Analysis (35%)

In [None]:
# Predictor columns and the target column for the regression models.
regression_columns = ['price', 'cost', 'units_sold', 'promotion_frequency']
target_column = 'profit'

X_reg = preprocessed_df[regression_columns].values
y_reg = preprocessed_df[target_column].values

print(f'Regression features shape: {X_reg.shape}')
print(f'Target shape: {y_reg.shape}')

In [None]:
# Build the project's RegressionAnalyzer on the feature matrix and target,
# then fit a linear model followed by a degree-2 polynomial model.
# NOTE(review): the summary cell reads regressor.results["Linear"] and
# regressor.results["Polynomial_2"]; the "Polynomial_2" key presumably derives
# from degree=2 here — confirm, and keep the two in sync if the degree changes.
regressor = RegressionAnalyzer(X_reg, y_reg)
regressor.train_linear_regression()
regressor.train_polynomial_regression(degree=2)

In [None]:
# Ask the analyzer for its preferred model. `best_metrics` is indexed by
# 'y_test_pred' and 'test_r2' in later cells.
# NOTE(review): the selection criterion lives in RegressionAnalyzer and is not
# visible here — confirm what "best" is ranked on.
best_model_name, best_metrics = regressor.get_best_model()

In [None]:
# Compare the trained models side by side from the analyzer's results mapping.
model_results = regressor.results
fig = Visualizer.plot_regression_comparison(model_results)
plt.show()

In [None]:
# Diagnostic plot for the best model: held-out targets, its predictions on
# them, and the residuals (actual minus predicted).
y_true = regressor.y_test
y_pred = best_metrics['y_test_pred']
residuals = y_true - y_pred

fig = Visualizer.plot_actual_vs_predicted(y_true, y_pred, residuals)
plt.show()

## Summary & Results

In [None]:
# Final report: recap each project stage using the objects built by the
# earlier cells (preprocessed_df, analyzer, optimal_k, regressor,
# best_model_name, best_metrics).
banner = '=' * 70
print('\n' + banner)
print('PROJECT COMPLETION SUMMARY')
print(banner)

print('\n✅ DATA PREPROCESSING (25%)')
print('  • Missing values handled')
print('  • Outliers detected and capped (IQR method)')
print('  • Features normalized using Min-Max scaling')
print(f'  • Final dataset shape: {preprocessed_df.shape}')

# Find the silhouette score by locating optimal_k among the k values that were
# actually searched, rather than assuming the search started at k=2. A
# hard-coded offset reports the wrong score (or wraps to a negative index) as
# soon as k_range changes; .index() raises ValueError if optimal_k is missing.
optimal_k_position = list(analyzer.k_range).index(optimal_k)
best_silhouette = analyzer.silhouette_scores[optimal_k_position]

print('\n✅ K-MEANS CLUSTERING (40%)')
print('  • Algorithm: K-means with K-means++ initialization')
print(f'  • Optimal k: {optimal_k}')
print(f'  • Silhouette Score: {best_silhouette:.3f}')
print(f'  • {optimal_k} distinct product clusters identified')

print('\n✅ REGRESSION ANALYSIS (35%)')
print(f'  • Linear Regression R²: {regressor.results["Linear"]["test_r2"]:.4f}')
print(f'  • Polynomial Regression R²: {regressor.results["Polynomial_2"]["test_r2"]:.4f}')
print(f'  • Best Model: {best_model_name}')
print(f'  • Best Model R²: {best_metrics["test_r2"]:.4f}')

print('\n🎉 PROJECT COMPLETE!')