In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

# Fixed seed so the four forests (and the pickles written below) come out
# the same on every Restart & Run All; an unseeded RandomForestRegressor
# draws fresh bootstrap samples on each run.
SEED = 42

# Columns that must be non-null for a row to be used for training.
REQUIRED_COLUMNS = ['State', 'District', 'Category', 'Quantity Sold',
                    'Unit Price (₹)', 'Profit Margin', 'Profit (₹)']

# Load the Excel file
df = pd.read_excel('professional_sales_dataset.xlsx')

# Drop rows with missing values in required columns.
# Non-inplace + .copy() gives an independent frame, so the column
# assignments below never write into a slice of the loaded data.
df = df.dropna(subset=REQUIRED_COLUMNS).copy()

# Label Encoding for categorical variables (one encoder per column so each
# can be reused later to transform / inverse-transform that column).
le_state = LabelEncoder()
le_district = LabelEncoder()
le_category = LabelEncoder()

df['State_enc'] = le_state.fit_transform(df['State'])
df['District_enc'] = le_district.fit_transform(df['District'])
df['Category_enc'] = le_category.fit_transform(df['Category'])

# Features: the three integer-encoded categorical columns
X = df[['State_enc', 'District_enc', 'Category_enc']]

# Targets: one regression target per model
y_quantity = df['Quantity Sold']
y_unit_price = df['Unit Price (₹)']
y_profit_margin = df['Profit Margin']
y_profit = df['Profit (₹)']

# Train separate models, all seeded identically for reproducibility
model_quantity = RandomForestRegressor(random_state=SEED)
model_price = RandomForestRegressor(random_state=SEED)
model_margin = RandomForestRegressor(random_state=SEED)
model_profit = RandomForestRegressor(random_state=SEED)

# (model, target, output file) — single table driving both fit and save
training_plan = [
    (model_quantity, y_quantity, 'model_quantity.pkl'),
    (model_price, y_unit_price, 'model_price.pkl'),
    (model_margin, y_profit_margin, 'model_margin.pkl'),
    (model_profit, y_profit, 'model_profit.pkl'),
]

for forest, target, _ in training_plan:
    forest.fit(X, target)

# Save models (only after every fit has succeeded)
for forest, _, model_path in training_plan:
    joblib.dump(forest, model_path)

# Save label encoders, keyed by the original column name.
# NOTE(review): another cell in this notebook also writes 'label_encoders.pkl';
# whichever cell runs last determines the file's contents.
encoders = {
    'State': le_state,
    'District': le_district,
    'Category': le_category
}
joblib.dump(encoders, 'label_encoders.pkl')

print("✅ Models and encoders saved successfully.")


✅ Models and encoders saved successfully.


In [2]:
pip install pandas scikit-learn joblib openpyxl


Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.6.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# Metric helpers are imported in this cell because no other visible cell
# imports mean_absolute_error; without this the cell fails on a fresh kernel.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train predictions and performance
# NOTE(review): rf_model, X_train and y_train are not defined in any cell
# visible in this notebook — they must come from an earlier cell; confirm
# that cell exists and runs before this one.
# NOTE(review): if rf_model was fit on X_train, these scores measure fit on
# the training rows, not generalization — TODO confirm.
y_train_pred_rf = rf_model.predict(X_train)
print("\nTraining Performance (Random Forest):")
print("R² Score:", r2_score(y_train, y_train_pred_rf))
print("MAE:", mean_absolute_error(y_train, y_train_pred_rf))
print("MSE:", mean_squared_error(y_train, y_train_pred_rf))



Training Performance (Random Forest):
R² Score: 0.9999385978554158
MAE: 109.47370059569099
MSE: 28532.52004992278


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# Read the workbook that carries the Month and Year columns
df = pd.read_excel("final_professional_sales_dataset_with_month_year.xlsx")

# Predictors are the two calendar fields only; the target is the sales amount
calendar_features = ["Month", "Year"]
X = df[calendar_features]
y = df["Sales (₹)"]

# Hold out 20% of the rows for evaluation (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded 100-tree random forest on the training portion;
# .fit() returns the estimator itself, so it can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out rows.
# The stored output for this cell shows a negative R2, i.e. on that run
# Month/Year alone did no better than predicting the mean.
y_pred = model.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")

# Persist the fitted estimator to disk
joblib.dump(model, "month_year_sales_model.pkl")
print("✅ Model saved as 'month_year_sales_model.pkl'")


R2 Score: -0.0362
MSE: 464525855.96
✅ Model saved as 'month_year_sales_model.pkl'


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# Read the workbook that includes the Month and Year columns
df = pd.read_excel("final_professional_sales_dataset_with_month_year.xlsx")

# One LabelEncoder per categorical column, keyed by column name
categorical_columns = ['State', 'District', 'Category']
label_encoders = {col: LabelEncoder() for col in categorical_columns}

# Fit each encoder on its column and overwrite the column with integer codes
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Predictors: encoded categoricals plus the calendar fields.
# Target: quantity sold.
X = df[['State', 'District', 'Category', 'Month', 'Year']]
y = df['Quantity Sold']

# Seeded 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded 100-tree random forest; .fit() returns the estimator itself
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report held-out performance
y_pred = model.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")

# Persist the estimator together with the encoders needed to prepare its inputs
joblib.dump(model, "sales_prediction_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Model and encoders saved successfully.")


R2 Score: -0.1124
MSE: 760.07
✅ Model and encoders saved successfully.
