# Data Cleaning

In [26]:
import pandas as pd

# Load the dataset
file_path = "./sales_data/Sales_Data.csv"  # Change this to the actual file path
df = pd.read_csv(file_path)

# Step 1: Strip spaces from column names
df.columns = df.columns.str.strip()

# Step 2: Convert 'Order Date' to datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"], errors="coerce")

# Step 3: Drop rows with missing values
df.dropna(inplace=True)

# Step 4: Convert 'Quantity Ordered' and 'Price Each' to numeric types
df["Quantity Ordered"] = pd.to_numeric(df["Quantity Ordered"], errors="coerce")
df["Price Each"] = pd.to_numeric(df["Price Each"], errors="coerce")

# Step 5: Verify Sales calculation
df["Sales"] = df["Quantity Ordered"] * df["Price Each"]

# Step 6: Extract city names from 'Purchase Address'
df["City"] = df["Purchase Address"].apply(lambda x: x.split(",")[1].strip())

# Step 7: Extract hour from 'Order Date' column
df["Hour"] = df["Order Date"].dt.hour

# Save the cleaned data
cleaned_file_path = "sales_data_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)

print("Data cleaning completed. Cleaned file saved as:", cleaned_file_path)


Data cleaning completed. Cleaned file saved as: sales_data_cleaned.csv


In [27]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# Load cleaned data
df = pd.read_csv("./sales_data/sales_data_cleaned.csv")

# Feature selection: Using 'Quantity Ordered', 'Price Each', 'Hour' to predict 'Sales'
X = df[["Quantity Ordered", "Price Each", "Hour"]]
y = df["Sales"]

# Splitting data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save processed data
with open("processed_data.pkl", "wb") as data_file:
    pickle.dump((X_train, X_test, y_train, y_test), data_file)

print("Data preparation complete. Processed data saved successfully.")


Data preparation complete. Processed data saved successfully.


# Linear Regression

In [44]:
import os
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Create a folder if it doesn't exist
folder_name = "Trained_models"
os.makedirs(folder_name, exist_ok=True)

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Save the model and scaler in the created folder
with open(os.path.join(folder_name, "lr_model.pkl"), "wb") as model_file:
    pickle.dump(lr_model, model_file)

with open(os.path.join(folder_name, "scaler.pkl"), "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print(f"Model training complete. Model and scaler saved in '{folder_name}' folder successfully.")


Model training complete. Model and scaler saved in 'Trained_models' folder successfully.


In [45]:
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    _, X_test, _, y_test = pickle.load(data_file)

# Load the trained model and scaler
with open("./Trained_models/lr_model.pkl", "rb") as model_file:
    lr_model = pickle.load(model_file)

with open("./Trained_models/scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Standardize the test data
X_test_scaled = scaler.transform(X_test)

# Predictions
y_pred_lr = lr_model.predict(X_test_scaled)

# Convert sales into binary classes (high = 1, low = 0) using the median as a threshold
threshold = y_test.median()
y_test_class = (y_test >= threshold).astype(int)
y_pred_class = (y_pred_lr >= threshold).astype(int)

# Compute classification metrics
accuracy = accuracy_score(y_test_class, y_pred_class) * 100
precision = precision_score(y_test_class, y_pred_class) * 100
recall = recall_score(y_test_class, y_pred_class) * 100
f1 = f1_score(y_test_class, y_pred_class) * 100

# Print only Accuracy, Precision, Recall, and F1-score
print(f"Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1-score: {f1:.2f}%")


Accuracy: 98.26%
Precision: 97.09%
Recall: 100.00%
F1-score: 98.52%


# Random Forest model

In [46]:
import pickle
import os
from sklearn.ensemble import RandomForestRegressor

folder_name = "Trained_models"
os.makedirs(folder_name, exist_ok=True)

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    X_train, _, y_train, _ = pickle.load(data_file)

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model
model_path = os.path.join(folder_name, "rf_model.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(rf_model, model_file)
print(f"Random Forest model trained and saved successfully to {model_path}.")

Random Forest model trained and saved successfully to Trained_models/rf_model.pkl.


In [47]:
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    _, X_test, _, y_test = pickle.load(data_file)

# Load the trained Random Forest model
with open("./Trained_models/rf_model.pkl", "rb") as model_file:
    rf_model = pickle.load(model_file)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Convert sales into binary classes (high = 1, low = 0) using the median as a threshold
threshold = y_test.median()
y_test_class = (y_test >= threshold).astype(int)
y_pred_class = (y_pred_rf >= threshold).astype(int)

# Compute classification metrics
accuracy = accuracy_score(y_test_class, y_pred_class) * 100
precision = precision_score(y_test_class, y_pred_class) * 100
recall = recall_score(y_test_class, y_pred_class) * 100
f1 = f1_score(y_test_class, y_pred_class) * 100

# Print only Accuracy, Precision, Recall, and F1-score
print(f"Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1-score: {f1:.2f}%")


Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%


# Decision Tree model

In [48]:
import pickle
import os
from sklearn.tree import DecisionTreeRegressor

folder_name = "Trained_models"
os.makedirs(folder_name, exist_ok=True)

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    X_train, _, y_train, _ = pickle.load(data_file)

# Train Decision Tree Model
dt_model = DecisionTreeRegressor(random_state=42)  # You can add hyperparameters here
dt_model.fit(X_train, y_train)

# Save the trained model
model_path = os.path.join(folder_name, "dt_model.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(dt_model, model_file)
print(f"Decision Tree model trained and saved successfully to {model_path}.")

Decision Tree model trained and saved successfully to Trained_models/dt_model.pkl.


In [49]:
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load preprocessed data
with open("processed_data.pkl", "rb") as data_file:
    _, X_test, _, y_test = pickle.load(data_file)

# Load the trained Decision Tree model
with open("./Trained_models/dt_model.pkl", "rb") as model_file:
    dt_model = pickle.load(model_file)

# Predictions
y_pred_dt = dt_model.predict(X_test)

# Convert sales into binary classes (high = 1, low = 0) using the median as a threshold
threshold = y_test.median()
y_test_class = (y_test >= threshold).astype(int)
y_pred_class = (y_pred_dt >= threshold).astype(int)  # Use Decision Tree predictions

# Compute classification metrics
accuracy = accuracy_score(y_test_class, y_pred_class) * 100
precision = precision_score(y_test_class, y_pred_class) * 100
recall = recall_score(y_test_class, y_pred_class) * 100
f1 = f1_score(y_test_class, y_pred_class) * 100

# Print only Accuracy, Precision, Recall, and F1-score
print(f"Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1-score: {f1:.2f}%")

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%


In [65]:
import pickle
import pandas as pd

# Example DataFrame, replace with your actual data
file_path = "./sales_data/Sales_Data.csv" 
df = pd.read_csv(file_path)  # Replace with the correct data loading

# Split the data into features (X) and labels (y)
X = df[["Quantity Ordered", "Price Each", "Hour"]]  # Example features
y = df["Sales"]  # Example label, replace with your actual label column

# Save the preprocessed data
with open("processed_data.pkl", "wb") as data_file:
    pickle.dump((X, None, y, None), data_file)

# Now, this processed_data.pkl can be loaded again later and will contain X_test and y_test properly


In [68]:
import pickle
import os
import pandas as pd
import requests
import json

# Load the trained Decision Tree model directly from the 'Trained_models' folder
folder_name = "Trained_models"
model_path = os.path.join(folder_name, "lr_model.pkl")
with open(model_path, "rb") as model_file:
    dt_model = pickle.load(model_file)

# Load your preprocessed data for prediction
with open("processed_data.pkl", "rb") as data_file:
    X_test, _, y_test, _ = pickle.load(data_file)

# Check if the data was loaded correctly
print(f"X_test shape: {getattr(X_test, 'shape', 'No shape')}")
print(f"y_test length: {len(y_test)}")

# Set up the Gemini API URL and API key
GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent"
API_KEY = "AIzaSyDs_6CZdiVLoBaZZltYrQC5FiGgf9gXGfU"  # Replace with your actual API key

# Function to get the response from Gemini API
def chatbot_response(user_text, model, X_test, y_test):
    headers = {
        "Content-Type": "application/json"
    }

    try:
        # Here, we're assuming the user_input correlates to features in X_test
        # You might need to map user input to features before using it for prediction
        user_features = extract_features_from_text(user_text)  # Implement this function based on your needs
        
        # Predict using the decision tree model
        prediction = model.predict([user_features])[0]

        # Prepare context for the chatbot to generate response
        context = f"Based on the analysis, the predicted result is: {prediction}"

        # Prepare the data for the Gemini API
        modified_data = {
            "contents": [{
                "parts": [{"text": f"{user_text}. Context: {context}"}]
            }]
        }

        # Make the POST request to the Gemini API
        response = requests.post(f"{GEMINI_API_URL}?key={API_KEY}", headers=headers, data=json.dumps(modified_data))

        if response.status_code == 200:
            try:
                answer = response.json().get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "Sorry, I couldn't find an answer.")
                return answer
            except (KeyError, IndexError, TypeError):
                return "Invalid response format from Gemini API."
        else:
            return f"Error with Gemini API: {response.text}"

    except Exception as e:
        return f"Error processing input: {e}"

# Placeholder function to extract features from text (replace with actual feature extraction logic)
def extract_features_from_text(user_text):
    # Here, you'd convert the user input into numeric features that match the model's input (X_test).
    # For now, this just returns a random feature (you'll need to replace this).
    return [0] * X_test.shape[1]  # Placeholder (should match the number of features in X_test)

# Example usage
if __name__ == '__main__':
    user_input = "what is this text about"  # Replace with any user input
    response = chatbot_response(user_input, dt_model, X_test, y_test)
    print(f"Response: {response}")


X_test shape: (185950, 3)
y_test length: 185950
Response: The text indicates the result of an analysis, specifically a prediction yielding a numerical value of 185.21274596665765.  Without more context (e.g., what was being analyzed, what method was used for prediction), it's impossible to say what the number *means*.  It could be anything from a predicted stock price to the estimated weight of something to the calculated outcome of a scientific experiment.

