In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Load the transactions CSV into `df`, the working DataFrame the cells below read and modify.
DATA_PATH = "/content/lastfinal.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# @title
import numpy as np

# Replace `customer_id` with synthetic IDs so that some customers occur on
# several rows. All randomness goes through a single seeded generator so that
# Restart & Run All reproduces the same assignment; the unseeded global
# `np.random` calls produced a different customer mapping on every run.
CUSTOMER_SEED = 42
rng = np.random.default_rng(CUSTOMER_SEED)

# Size of the synthetic customer pool
num_customers = 30000

# IDs are zero-padded to six digits: C000001 ... C030000
customer_ids = [f'C{i:06d}' for i in range(1, num_customers + 1)]

# Per-customer sampling weight (1..10), skewed towards small values
visit_probabilities = [0.5, 0.2, 0.1, 0.08, 0.05, 0.03, 0.02, 0.01, 0.005, 0.005]
customer_visits = rng.choice(np.arange(1, 11), size=num_customers, p=visit_probabilities)

# Repeat every ID by its weight to build the sampling pool
expanded_customers = np.repeat(customer_ids, customer_visits)

# Draw one ID per row WITH replacement. The weights only bias the draw: the
# number of rows per customer is random, and IDs that are never drawn do not
# appear in `df` at all (so the unique count can be below `num_customers`).
df['customer_id'] = rng.choice(expanded_customers, size=len(df), replace=True)

# Check new distribution
print(df['customer_id'].value_counts().head(10))  # Print top 10 most frequent customers
print(f"Total unique customers: {df['customer_id'].nunique()}")


customer_id
C004007    25
C007970    24
C019252    23
C004750    22
C005585    22
C004876    22
C021488    21
C028965    21
C005119    21
C018479    21
Name: count, dtype: int64
Total unique customers: 26153


In [None]:
# Ensure no extra spaces in column names
df.columns = df.columns.str.strip()


def _most_frequent(values):
    """Return the most frequent value of a Series, or NaN when it has no mode.

    `Series.mode()` is empty when every value is missing, so indexing it with
    `[0]` would raise. On ties, the first entry returned by `mode()` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Per-customer aggregate columns produced by this cell.
AGGREGATE_COLUMNS = ['most_frequent_category', 'total_purchases', 'total_spending',
                     'avg_spending', 'avg_quantity', 'most_frequent_mall']

# Named aggregation yields flat, explicitly named columns, so there is no
# positional rename of a MultiIndex that could silently mislabel a column.
customer_features = df.groupby('customer_id').agg(
    most_frequent_category=('category', _most_frequent),   # Most frequent product category
    total_purchases=('invoice_no', 'count'),                # Number of non-null invoices
    total_spending=('price', 'sum'),                        # Sum of `price`
    avg_spending=('price', 'mean'),                         # Mean of `price`
    avg_quantity=('quantity', 'mean'),                      # Average quantity per purchase
    most_frequent_mall=('shopping_mall', _most_frequent),   # Most visited mall
).reset_index()

# Merge back with the main dataset. Aggregates left over from an earlier run of
# this cell are dropped first; otherwise the merge would emit `_x`/`_y`
# suffixed duplicates and the plain column names would disappear.
df = df.drop(columns=AGGREGATE_COLUMNS, errors='ignore')
df = df.merge(customer_features, on='customer_id', how='left')
print("Aggregation successful!")


Aggregation successful!


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical features.
# Every column gets its own LabelEncoder, stored in `encoders`, so each mapping
# can be inverted later. Refitting one shared encoder column after column keeps
# only the mapping of the last column that was fitted.
# NOTE: run this cell once per fresh `df`; running it again fits the encoders
# on the already-encoded integers and the original strings are lost.
categorical_columns = [
    'most_frequent_category',
    'most_frequent_mall',
    'payment_method',
    'gender',
    'shopping_mall',
    'category',
    'product_name',
]

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Encoder for the prediction target; used to turn predicted codes back into names.
product_encoder = encoders['product_name']

# Backward compatibility: `label_enc` previously ended up fitted on
# `product_name` (the last column encoded), so keep it pointing at that encoder.
label_enc = product_encoder

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Features are every column except the target and the identifier/date columns.
excluded_columns = ['product_name', 'invoice_date', 'invoice_no', 'customer_id']
X = df.drop(columns=excluded_columns)
y = df['product_name']  # Target: label-encoded product name

# 80/20 hold-out split, stratified on the target so class proportions match
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Random forest with 50 trees and a fixed seed
model = RandomForestClassifier(n_estimators=50, random_state=1)
model.fit(X_train, y_train)

# Predict the product code for each held-out row
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.22      0.24      0.23       399
           1       0.22      0.25      0.24       416
           2       0.19      0.22      0.21       603
           3       0.18      0.19      0.19       572
           4       0.19      0.17      0.18       600
           5       0.25      0.25      0.25       404
           6       0.18      0.20      0.19      1376
           7       0.21      0.19      0.20       197
           8       0.19      0.17      0.18       582
           9       0.19      0.19      0.19       581
          10       0.21      0.24      0.22       608
          11       0.26      0.30      0.28       246
          12       0.25      0.25      0.25       495
          13       0.20      0.20      0.20       607
          14       0.20      0.21      0.20       199
          15       0.22      0.22      0.22       246
          16       0.20      0.20      0.20      1373
          17       0.19    

In [None]:
# Show the held-out target values (encoded product labels, indexed by original row)
print(y_test)

21506    35
83670     6
73336     6
52115    18
86010    10
         ..
35102     2
12526    10
17439     6
88082    18
65839    33
Name: product_name, Length: 19892, dtype: int64


In [None]:
# Show the model's predicted labels for the held-out rows (same order as y_test)
print(y_pred)

[35 33 35 ...  6  2 16]


In [None]:
# Convert predictions back to original product names.
# `product_encoder` is not defined in any cell above, so on a fresh kernel this
# cell raised NameError. `label_enc` was last fitted on `product_name` in the
# encoding cell, so it is the encoder that holds the product-name mapping.
y_pred_labels = label_enc.inverse_transform(y_pred)
y_test_labels = label_enc.inverse_transform(y_test)

# Extract corresponding customer IDs from the test dataset.
# `y_test` keeps the row labels of `df`, so `.loc` returns them in y_test order.
customer_ids_test = df.loc[y_test.index, 'customer_id'].values  # Get actual customer IDs

# Create a DataFrame with customer_id, actual product, and predicted product
results_df = pd.DataFrame({
    'Customer_ID': customer_ids_test,
    'Actual_Product': y_test_labels,
    'Predicted_Product': y_pred_labels
})

# Display the first few rows
print(results_df.head(10))  # Show first 10 rows

# Evaluate performance
print(classification_report(y_test, y_pred))

  Customer_ID Actual_Product Predicted_Product
0     C018563        T-shirt           T-shirt
1     C013578          Dress           Sweater
2     C022117          Dress           T-shirt
3     C024087          Juice             Chips
4     C025193     Face Cream          Lipstick
5     C026012         Jacket             Jeans
6     C015901     Snow Globe          Keychain
7     C027232          Chips             Juice
8     C012771   Formal Shoes          Sneakers
9     C005783         Jacket           T-shirt
              precision    recall  f1-score   support

           0       0.19      0.20      0.19       399
           1       0.20      0.20      0.20       416
           2       0.22      0.21      0.22       603
           3       0.19      0.19      0.19       572
           4       0.18      0.18      0.18       600
           5       0.21      0.20      0.21       404
           6       0.20      0.21      0.21      1376
           7       0.21      0.19      0.20       