In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [20]:
# Load dataset
df = pd.read_csv('sales_data.csv')  # Replace with your actual filename

# Quick look at data
print(df.head())
print(df.info())
print(df.describe())

# Check for missing values
print("Missing values:\n", df.isnull().sum())


  Order_ID Product_ID   Category  Price  Quantity  Order_Date Customer_ID Country  Total_Sales
0  0   A0001     P123       Shoes    50.00       ...                                          
1  1   A0002     P456     Apparel    30.00       ...                                          
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column                                                                                          Non-Null Count  Dtype 
---  ------                                                                                          --------------  ----- 
 0     Order_ID Product_ID   Category  Price  Quantity  Order_Date Customer_ID Country  Total_Sales  2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes
None
       Order_ID Product_ID   Category  Price  Quantity  Order_Date Customer_ID Country  Total_Sales
count                                                   2                                    

In [None]:
# Fill missing values if needed
df.fillna(df.median(numeric_only=True), inplace=True)

# Convert date column if present
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek

# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

 
# Define features and target
X = df.drop('Price', axis=1)  # Assume 'sales' is the target variable
y = df['Price']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
model = RandomForestRegressor(random_state=42)

cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print("Cross-validation R² scores:", cv_scores)
print("Mean R²:", cv_scores.mean())


In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2'
)
grid_search.fit(X_train, y_train)

print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best CV R² Score:", grid_search.best_score_)


In [None]:
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15]
}

random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='r2',
    random_state=42
)
random_search.fit(X_train, y_train)

print("Best parameters (Randomized Search):", random_search.best_params_)
print("Best CV R² Score:", random_search.best_score_)


In [None]:
best_model = grid_search.best_estimator_  # Or use random_search.best_estimator_
best_model.fit(X_train, y_train)

feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_importances.index)
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()


In [None]:
y_pred = best_model.predict(X_test)

print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


In [None]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
print("Ridge R² Score:", ridge.score(X_test, y_test))

# Lasso Regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
print("Lasso R² Score:", lasso.score(X_test, y_test))
