# In [12]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Train a linear regression that predicts 'Stock' from the categorical product
# attributes in the sale report, then pickle the fitted pipeline.

# Load the dataset
file_name = r'D:\python\Data Science\Python\E-Commerce Sales Analysis\data\datasets\Sale Report.csv'
data = pd.read_csv(file_name)
# Rows without a 'Stock' value have no regression target, so drop them.
data = data.dropna(subset=['Stock'])

# Feature columns, defined once so the selection below and the encoder in the
# ColumnTransformer always refer to the same set.
feature_columns = ['SKU Code', 'Design No.', 'Category', 'Size', 'Color']

# Define features and target variable ('Stock')
X = data[feature_columns]
y = data['Stock']  # Target column

# Preprocess categorical features using one-hot encoding.
# handle_unknown='ignore': after the random split below, held-out rows (and any
# later input to the saved model) can contain category values that never
# appeared in the training rows. With the default handle_unknown='error' the
# encoder raises ValueError on such rows at predict time; with 'ignore' they
# are encoded as all zeros for that feature instead.
# NOTE: combining drop='first' with handle_unknown='ignore' requires
# scikit-learn >= 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            feature_columns,
        ),
    ],
    # Every selected column is one-hot encoded above, so nothing is left over
    # to pass through; kept for parity with the original configuration.
    remainder='passthrough',
)

# Create a pipeline for preprocessing and modeling
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# Split the data: 20% is held out, with a fixed seed so the split is
# reproducible. X_test / y_test are not used in this cell; they are kept
# available for evaluation elsewhere.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (encoder + regressor) so it can be reloaded
# without retraining.
with open('sales_analysis_model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)