In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from datetime import datetime

In [2]:
df = pd.read_excel('recomendation.xlsx')

# First we need to remove some features 
- because these features are noise and incorrect we need to first drop these features and create same features using feature engineering on the basis of available itemID for better analysis

In [3]:
df=df.drop(columns=['PurchaseHistory','CartActivity','WishlistActivity','AbandonedCartData','BrowsingHistory'])

In [4]:
pd.set_option("display.max_columns",None)
df.sample(2)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,PurchaseDate,SessionDuration,DeviceType,Age,Gender,Location,Income,Occupation,SignUpDate,MembershipLevel,Device,TimeOfInteraction,SearchQueries,ProductName,Category,Price,Discount,Brand,Description,Tags,Color,Size,Stock,Ratings,Reviews,ReleaseDate,PopularityScore
2749,U02814,Item00923,2,4,30,165.28,NaT,53.13,Desktop,69,Male,"West Johnmouth, CG",Medium,Chief Technology Officer,2023-09-11,Regular,Desktop,Evening,"['our', 'happen', 'late', 'hard', 'various']",Suddenly,Sports,21.14,5.36,Stokes LLC,Particular son include food form firm back sum...,"['give', 'lay', 'we', 'describe']",Red,S,109,4.8,Section together responsibility type career wh...,2020-01-17,29.86
2728,U02598,Item00786,4,9,11,20.44,NaT,64.57,Mobile,44,Male,"West Tylerside, IS",Medium,Armed forces operational officer,2021-09-12,Regular,Desktop,Morning,"['affect', 'American', 'rich']",Style,Sports,271.11,27.37,Edwards LLC,Matter player have heavy society chance role p...,"['lawyer', 'across', 'safe', 'gun']",Green,S,118,2.4,Should wish account point decide we Congress u...,2024-04-27,4.54


# Now we can create new feature

In [5]:
# Extract unique ItemIDs for random sampling
unique_item_ids = df['ItemID'].unique()

# Function to assign a random list of ItemIDs
def assign_random_items(item_list, n=3):
    return list(np.random.choice(item_list, size=n, replace=False))

# Create new columns with random ItemIDs
df['PurchaseHistory'] = df['UserID'].apply(lambda _: assign_random_items(unique_item_ids))
df['CartActivity'] = df['UserID'].apply(lambda _: assign_random_items(unique_item_ids))
df['WishlistActivity'] = df['UserID'].apply(lambda _: assign_random_items(unique_item_ids))
df['AbandonedCartData'] = df['UserID'].apply(lambda _: assign_random_items(unique_item_ids))
df['BrowsingHistory'] = df['UserID'].apply(lambda _: assign_random_items(unique_item_ids))

# Display the first few rows to verify
df[['UserID', 'PurchaseHistory', 'CartActivity', 'WishlistActivity', 'AbandonedCartData', 'BrowsingHistory']].head()

Unnamed: 0,UserID,PurchaseHistory,CartActivity,WishlistActivity,AbandonedCartData,BrowsingHistory
0,U03449,"[Item00756, Item00003, Item00767]","[Item00090, Item00523, Item00730]","[Item00399, Item00617, Item00345]","[Item00937, Item00481, Item00540]","[Item00517, Item00191, Item00468]"
1,U01475,"[Item00597, Item00812, Item00737]","[Item00498, Item00780, Item00792]","[Item00423, Item00534, Item00211]","[Item00730, Item00082, Item00551]","[Item00591, Item00687, Item00731]"
2,U02136,"[Item00299, Item00079, Item00402]","[Item00153, Item00314, Item00576]","[Item00897, Item00695, Item00207]","[Item00328, Item00929, Item00038]","[Item00472, Item00281, Item00245]"
3,U00332,"[Item00985, Item00651, Item00617]","[Item00238, Item00972, Item00769]","[Item00774, Item00355, Item00446]","[Item00424, Item00034, Item00449]","[Item00662, Item00311, Item00185]"
4,U04194,"[Item00386, Item00991, Item00888]","[Item00036, Item00879, Item00424]","[Item00017, Item00917, Item00640]","[Item00820, Item00187, Item00901]","[Item00923, Item00195, Item00528]"


# Handle Missing Values
- first we can handle the missing values in some features like ProductName ,Size,PurchaseDate

In [6]:
# first we can fill null values in productname and size
df['ProductName'].fillna('Unknown Product',inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ProductName'].fillna('Unknown Product',inplace=True)


In [7]:
# According to our Literature Review we do not need of Purchase date so we can decidede to drop this feature
df.drop('PurchaseDate', axis=1, inplace=True)


- We have decided to drop the Purchase Date feature as it does not contribute significantly to the objectives of this analysis. Based on the literature review, the focus is on customer behavior and product preferences, and temporal details like the purchase date are not relevant for the current model. Removing this feature helps streamline the dataset and improve analysis efficiency.

In [8]:
df.isnull().sum()

UserID                 0
ItemID                 0
Rating                 0
Clicks                 0
Views                  0
TimeSpentOnItem        0
SessionDuration        0
DeviceType             0
Age                    0
Gender                 0
Location               0
Income                 0
Occupation             0
SignUpDate             0
MembershipLevel        0
Device                 0
TimeOfInteraction      0
SearchQueries          0
ProductName            0
Category               0
Price                  0
Discount               0
Brand                  0
Description            0
Tags                   0
Color                  0
Size                 946
Stock                  0
Ratings                0
Reviews                0
ReleaseDate            0
PopularityScore        0
PurchaseHistory        0
CartActivity           0
WishlistActivity       0
AbandonedCartData      0
BrowsingHistory        0
dtype: int64

In [9]:
# we can fill the nan values
df['Size'] = df['Size'].apply(lambda x: np.random.choice(df['Size'].dropna()) if pd.isna(x) else x)


In [10]:
df.isnull().sum()

UserID               0
ItemID               0
Rating               0
Clicks               0
Views                0
TimeSpentOnItem      0
SessionDuration      0
DeviceType           0
Age                  0
Gender               0
Location             0
Income               0
Occupation           0
SignUpDate           0
MembershipLevel      0
Device               0
TimeOfInteraction    0
SearchQueries        0
ProductName          0
Category             0
Price                0
Discount             0
Brand                0
Description          0
Tags                 0
Color                0
Size                 0
Stock                0
Ratings              0
Reviews              0
ReleaseDate          0
PopularityScore      0
PurchaseHistory      0
CartActivity         0
WishlistActivity     0
AbandonedCartData    0
BrowsingHistory      0
dtype: int64

# Encode Categorical Variables
- For categorical features like Gender, Location, and MembershipLevel, we can use LabelEncoder is used for simplicity.

In [11]:
label_encoders = {}
for col in ['Gender','Location','MembershipLevel','Occupation','DeviceType','Category','Brand','Color','Size']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Convert Date Columns to Numeric
- We can Extract relevant features from PurchaseDate, SignUpDate, and ReleaseDate, like the year, month, day, or day of the week, which may affect purchasing behavior.

In [12]:
date_columns = ['SignUpDate','ReleaseDate']

for col in date_columns:
    df[col + '_Year'] = df[col].dt.year
    df[col + '_Month'] = df[col].dt.month
    df[col + '_Day'] = df[col].dt.day
    df[col + '_DayOfWeek'] = df[col].dt.dayofweek

df.drop(columns = date_columns,inplace = True)

In [15]:
df.head(1)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,Location,Income,Occupation,MembershipLevel,Device,TimeOfInteraction,SearchQueries,ProductName,Category,Price,Discount,Brand,Description,Tags,Color,Size,Stock,Ratings,Reviews,PopularityScore,PurchaseHistory,CartActivity,WishlistActivity,AbandonedCartData,BrowsingHistory,SignUpDate_Year,SignUpDate_Month,SignUpDate_Day,SignUpDate_DayOfWeek,ReleaseDate_Year,ReleaseDate_Month,ReleaseDate_Day,ReleaseDate_DayOfWeek
0,U03449,Item00047,2,3,42,280.46,49.98,0,20,0,1148,Medium,184,0,Desktop,Afternoon,"['model', 'recent', 'training', 'through', 'co...",Everything,2,504.25,8.25,693,Find Mrs family guess camera image decide able...,['event'],4,1,24,3.9,Happen so late space nothing fill mouth health...,38.13,"[Item00756, Item00003, Item00767]","[Item00090, Item00523, Item00730]","[Item00399, Item00617, Item00345]","[Item00937, Item00481, Item00540]","[Item00517, Item00191, Item00468]",2023,7,11,1,2020,9,22,1


# Scale Numerical Columns
- we can Scale numerical columns to a similar range using MinMaxScaler

In [None]:
scaler = MinMaxScaler()
numerical_columns = ['Clicks', 'Views', 'TimeSpentOnItem', 'SessionDuration', 'Age', 'Price', 'Discount', 'Stock', 'Ratings', 'PopularityScore']

df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


In [16]:
scaler = MinMaxScaler()
numerical_columns = ['Clicks','Views','TimeSpentOnItem','SessionDuration','Age','Price','Discount','Stock','Ratings','PopularityScore']

df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [17]:
df.sample()

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,Location,Income,Occupation,MembershipLevel,Device,TimeOfInteraction,SearchQueries,ProductName,Category,Price,Discount,Brand,Description,Tags,Color,Size,Stock,Ratings,Reviews,PopularityScore,PurchaseHistory,CartActivity,WishlistActivity,AbandonedCartData,BrowsingHistory,SignUpDate_Year,SignUpDate_Month,SignUpDate_Day,SignUpDate_DayOfWeek,ReleaseDate_Year,ReleaseDate_Month,ReleaseDate_Day,ReleaseDate_DayOfWeek
2524,U02669,Item00781,4,0.263158,0.612245,0.861145,0.798637,0,0.686275,1,396,Medium,181,2,Mobile,Morning,['south'],Ten,3,0.111956,0.94865,8,Notice church child long second poor particula...,"['human', 'station', 'include']",0,3,0.035176,0.35,Safe ball service something bad dark term imag...,0.222334,"[Item00413, Item00913, Item00521]","[Item00829, Item00443, Item00416]","[Item00566, Item00676, Item00728]","[Item00343, Item00683, Item00494]","[Item00999, Item00368, Item00644]",2023,3,13,0,2022,9,6,1
