In [25]:
# Mount Google Drive at /content/drive (Colab-only helper) so that the CSV
# files read and written below under "/content/drive/My Drive/..." are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
pip install sentence-transformers



In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [28]:
# Read the four raw tables from the shared Drive folder
data_dir = "/content/drive/My Drive/SuperTeam_Write"

customers = pd.read_csv(data_dir + "/customer_details.csv")
purchases = pd.read_csv(data_dir + "/purchases.csv")
sessions = pd.read_csv(data_dir + "/sessions.csv")
reviews = pd.read_csv(data_dir + "/reviews.csv")

In [29]:
# Preview the first rows of the reviews table
reviews.head()

Unnamed: 0,review_id,customer_id,product_id,review_text,review_rating
0,REV0001,CUST001,PROD017,Film enjoy ground.,4
1,REV0002,CUST001,PROD038,Have message million issue detail wait research.,1
2,REV0003,CUST001,PROD010,Stand health face from into side.,4
3,REV0004,CUST002,PROD014,Pull second central deep catch why join.,4
4,REV0005,CUST002,PROD002,Purpose by seem them during between.,4


In [30]:
# Snapshot the untouched frames before the cells below encode/scale them in place
customer_org, sessions_org, purchases_org, reviews_org = (
    frame.copy() for frame in (customers, sessions, purchases, reviews)
)

In [31]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Use one transformer per column. Re-fitting a single shared encoder/scaler
# overwrites its learned classes/ranges, so earlier columns could no longer be
# mapped back with inverse_transform.
gender_encoder = LabelEncoder()
page_encoder = LabelEncoder()
age_scaler = MinMaxScaler()
duration_scaler = MinMaxScaler()
amount_scaler = MinMaxScaler()

# Encode categorical features as integer codes
customers['gender'] = gender_encoder.fit_transform(customers['gender'])
sessions['page_viewed'] = page_encoder.fit_transform(sessions['page_viewed'])

# Rescale numeric features to the [0, 1] range
customers[['age']] = age_scaler.fit_transform(customers[['age']])
sessions[['view_duration']] = duration_scaler.fit_transform(sessions[['view_duration']])
purchases[['purchase_amount']] = amount_scaler.fit_transform(purchases[['purchase_amount']])

# Keep the original variable names bound to the last-fitted transformers,
# matching the state the previous version of this cell left behind.
label_encoder = page_encoder
scaler = amount_scaler

In [32]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the free-text reviews
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read the reviews file into its own frame (adjust the path if the file lives elsewhere)
file_path = '/content/drive/My Drive/SuperTeam_Write/reviews.csv'
reviews_df = pd.read_csv(file_path)

# Encode every review text in one call
review_texts = reviews_df['review_text'].tolist()
embeddings = model.encode(review_texts)

# Store one embedding (as a plain Python list) per review row
reviews_df = reviews_df.assign(review_embedding=embeddings.tolist())

In [33]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all review texts in a single batched call rather than one
# model.encode() call per row through Series.apply, then store each
# embedding as a plain Python list in a new column.
reviews_df['review_embedding'] = model.encode(reviews_df['review_text'].tolist()).tolist()

In [34]:
# Save the reviews together with the new review_embedding column to Drive
# (index=False keeps the row index out of the file).
reviews_df.to_csv("/content/drive/My Drive/SuperTeam_Write/reviews_embed.csv", index=False)

In [35]:
# Display the reviews frame, now including the review_embedding column
reviews_df

Unnamed: 0,review_id,customer_id,product_id,review_text,review_rating,review_embedding
0,REV0001,CUST001,PROD017,Film enjoy ground.,4,"[0.0039257993921637535, -0.06895976513624191, ..."
1,REV0002,CUST001,PROD038,Have message million issue detail wait research.,1,"[0.008132816292345524, -0.021024635061621666, ..."
2,REV0003,CUST001,PROD010,Stand health face from into side.,4,"[-0.017464205622673035, 0.1284857541322708, -0..."
3,REV0004,CUST002,PROD014,Pull second central deep catch why join.,4,"[-0.004855050239712, -0.000586566689889878, 0...."
4,REV0005,CUST002,PROD002,Purpose by seem them during between.,4,"[-0.018379051238298416, 0.04018454626202583, 0..."
...,...,...,...,...,...,...
297,REV0298,CUST099,PROD038,Official still together management approach say.,3,"[-0.016435258090496063, 0.007368630263954401, ..."
298,REV0299,CUST099,PROD006,Teacher alone ago hard edge.,4,"[-0.021841278299689293, -0.04481922462582588, ..."
299,REV0300,CUST099,PROD040,Let still beautiful again business.,5,"[0.04072361811995506, 0.0028997601475566626, 0..."
300,REV0301,CUST100,PROD031,Green shake significant sure much teacher but.,4,"[-0.0031900713220238686, -0.01505244616419077,..."


In [36]:
# Summarise columns, non-null counts and dtypes of the reviews frame
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         302 non-null    object
 1   customer_id       302 non-null    object
 2   product_id        302 non-null    object
 3   review_text       302 non-null    object
 4   review_rating     302 non-null    int64 
 5   review_embedding  302 non-null    object
dtypes: int64(1), object(5)
memory usage: 14.3+ KB


In [39]:
from sklearn.decomposition import PCA
import numpy as np


def _as_embedding_array(value):
    """Return an embedding as a NumPy array.

    Lists are converted directly; anything else is treated as a
    bracketed, comma-separated string and parsed.
    """
    if isinstance(value, list):
        return np.array(value)
    return np.fromstring(value.strip("[]"), sep=',')


# Normalise every stored embedding to a NumPy array
reviews_df['review_embedding'] = reviews_df['review_embedding'].apply(_as_embedding_array)

# One embedding per customer: the mean of that customer's review embeddings
embeddings_by_customer = reviews_df.groupby('customer_id')['review_embedding']
customer_embeddings = embeddings_by_customer.apply(np.mean).reset_index()

def _numeric_means_by_customer(frame):
    """Return (numeric columns of frame, per-customer mean of those columns)."""
    numeric_part = frame.select_dtypes(include=[np.number])
    keyed = frame[['customer_id']].join(numeric_part)
    return numeric_part, keyed.groupby('customer_id').mean().reset_index()


# Average every numeric column per customer, for sessions and for purchases
numeric_sessions_df, sessions_agg_df = _numeric_means_by_customer(sessions)
numeric_purchases_df, purchases_agg_df = _numeric_means_by_customer(purchases)

# Merge customer embeddings with other customer features
merged_df = customers.merge(customer_embeddings, on='customer_id', how='left')
merged_df = merged_df.merge(sessions_agg_df, on='customer_id', how='left')
merged_df = merged_df.merge(purchases_agg_df, on='customer_id', how='left')

# Customers without any review come out of the left merge with a missing
# embedding. Give them a zero vector of the same length as the real
# embeddings: a scalar 0 from fillna would break the vector concatenation
# and list conversion performed further down.
embedding_dim = (
    len(customer_embeddings['review_embedding'].iloc[0]) if len(customer_embeddings) else 0
)
merged_df['review_embedding'] = merged_df['review_embedding'].apply(
    lambda emb: emb if isinstance(emb, np.ndarray) else np.zeros(embedding_dim)
)

# Fill the remaining (scalar) missing values that resulted from the merge
merged_df = merged_df.fillna(0)

# Combine all features into a single vector for each customer:
# the review embedding first, followed by the scalar features in the
# order they are listed in feature_columns.
feature_columns = ['age', 'gender', 'view_duration', 'purchase_amount', 'review_embedding']
scalar_columns = [col for col in feature_columns if col != 'review_embedding']
merged_df['combined_vector'] = merged_df.apply(
    lambda row: np.concatenate([row['review_embedding'], [row[col] for col in scalar_columns]]),
    axis=1
)

# Convert the combined_vector and review_embedding back to string format.
# .tolist() yields plain Python floats; list(ndarray) keeps NumPy scalars,
# which NumPy >= 2 prints as "np.float64(...)" and would corrupt the text.
merged_df['combined_vector'] = merged_df['combined_vector'].apply(lambda x: str(np.asarray(x).tolist()))
merged_df['review_embedding'] = merged_df['review_embedding'].apply(lambda x: str(np.asarray(x).tolist()))

# Display the first few rows after combining vectors
print(merged_df.head())


  customer_id       age  gender            location signup_date  \
0     CUST001  0.723404       0         Robertsbury  2023-11-21   
1     CUST002  0.468085       1     Lake Heidimouth  2023-08-10   
2     CUST003  0.212766       1  South Stephenville  2024-05-25   
3     CUST004  0.957447       1         Brandyshire  2023-11-05   
4     CUST005  0.106383       0          Wrightberg  2023-03-10   

                                    review_embedding  page_viewed  \
0  [-0.0018018633127212524, 0.012833784644802412,...     5.800000   
1  [0.01261108291024963, 0.026716904966936756, 0....     7.454545   
2  [-0.01616765884682536, 0.018640945293009283, 0...    17.500000   
3  [-0.012293482199311256, 0.05395197914913297, 0...     2.125000   
4  [0.046963881701231, 0.04533408582210541, 0.069...    14.428571   

   view_duration  purchase_amount  \
0       0.443051         0.356144   
1       0.536518         0.438716   
2       0.611582         0.663063   
3       0.520551         0.425212 

In [40]:
import ast
import re


def _parse_vector(text):
    """Parse a stringified list of numbers without executing it as code."""
    # NumPy >= 2 prints scalars as "np.float64(0.1)"; unwrap those so that
    # literal_eval (which only accepts plain literals) can read the list.
    plain = re.sub(r'np\.float\d+\(([^()]*)\)', r'\1', text)
    return ast.literal_eval(plain)


# Apply PCA to reduce the dimensionality of the combined vectors
pca = PCA(n_components=5)  # Set n_components to a feasible value, in this case 5
combined_matrix = np.stack(merged_df['combined_vector'].apply(_parse_vector))
reduced_vectors = pca.fit_transform(combined_matrix)

# Store each reduced vector in a new column as a JSON-like string
merged_df['reduced_vector'] = [str(vec) for vec in reduced_vectors.tolist()]

# Display the first few rows after adding the reduced vectors
print(merged_df.head())

  customer_id       age  gender            location signup_date  \
0     CUST001  0.723404       0         Robertsbury  2023-11-21   
1     CUST002  0.468085       1     Lake Heidimouth  2023-08-10   
2     CUST003  0.212766       1  South Stephenville  2024-05-25   
3     CUST004  0.957447       1         Brandyshire  2023-11-05   
4     CUST005  0.106383       0          Wrightberg  2023-03-10   

                                    review_embedding  page_viewed  \
0  [-0.0018018633127212524, 0.012833784644802412,...     5.800000   
1  [0.01261108291024963, 0.026716904966936756, 0....     7.454545   
2  [-0.01616765884682536, 0.018640945293009283, 0...    17.500000   
3  [-0.012293482199311256, 0.05395197914913297, 0...     2.125000   
4  [0.046963881701231, 0.04533408582210541, 0.069...    14.428571   

   view_duration  purchase_amount  \
0       0.443051         0.356144   
1       0.536518         0.438716   
2       0.611582         0.663063   
3       0.520551         0.425212 

In [42]:
# Merge the original (un-encoded, un-scaled) customer information with the
# engineered features for the customer similarity search. Each feature column
# is listed once; listing 'review_embedding' twice duplicated that column in
# the result and in the CSV written from it.
similarity_columns = [
    'customer_id', 'page_viewed', 'view_duration', 'purchase_amount',
    'review_embedding', 'combined_vector', 'reduced_vector',
]
df = customer_org.merge(merged_df[similarity_columns], on='customer_id', how='left')

In [43]:
# Write the merged customer table (original details plus engineered features)
# to Drive; index=False keeps the row index out of the file.
df.to_csv("/content/drive/My Drive/SuperTeam_Write/customers_segment.csv", index=False)