In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv("sales_pos.csv")
df.head(2)

Unnamed: 0,user,prod,gender,age_group,job,city,marital,prod_cat1,prod_cat2,prod_cat3,purchase
0,1,P00069042,F,0-17,10,A,0,3,,,8370
1,1,P00248942,F,0-17,10,A,0,1,6.0,14.0,15200


### Q1.

In [3]:
df_q1 = df[["prod", "job", "purchase"]].copy()
df_q1.head(2)

Unnamed: 0,prod,job,purchase
0,P00069042,10,8370
1,P00248942,10,15200


In [4]:
df_q1_agg = df_q1.groupby("prod")["purchase"].sum()
df_q1_agg.head()

prod
P00000142    12837476
P00000242     3967496
P00000342     1296475
P00000442      441173
P00000542      807212
Name: purchase, dtype: int64

In [5]:
df_q1_agg.max()

27995166

In [6]:
top_prod = df_q1.groupby("prod")["purchase"].sum().idxmax()
top_prod

'P00025442'

In [7]:
df_q1.loc[df_q1["prod"] == top_prod, "job"].value_counts().idxmax()

4

### Q2.

In [8]:
df_u1 = df.loc[df["user"] == 1, ["prod_cat1", "prod_cat2", "prod_cat3"]]
df_u1 = df_u1.reset_index(drop = True)
df_u1 = df_u1.fillna(0)

In [9]:
len(df_u1)

35

In [10]:
df_u1.head(2)

Unnamed: 0,prod_cat1,prod_cat2,prod_cat3
0,3,0.0,0.0
1,1,6.0,14.0


In [11]:
df_u1.drop_duplicates().shape

(21, 3)

In [12]:
df_u1["prod_cat1"] = df_u1["prod_cat1"].astype("int").astype("str")
df_u1["prod_cat2"] = df_u1["prod_cat2"].astype("int").astype("str")
df_u1["prod_cat3"] = df_u1["prod_cat3"].astype("int").astype("str")
df_u1["prod_cat" ] = df_u1["prod_cat1"] + "-" + df_u1["prod_cat2"] + "-" + df_u1["prod_cat3"]

In [13]:
df_u1["prod_cat"].nunique()

21

In [14]:
df_u1.head()

Unnamed: 0,prod_cat1,prod_cat2,prod_cat3,prod_cat
0,3,0,0,3-0-0
1,1,6,14,1-6-14
2,12,0,0,12-0-0
3,12,14,0,12-14-0
4,2,4,8,2-4-8


In [15]:
df["age_group"].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [16]:
col_q2 = ["user", "marital", "prod_cat1", "prod_cat2", "prod_cat3"]
df_q2 = df.loc[df["age_group"] == "26-35", col_q2].reset_index(drop = True)

In [17]:
df_q2 = df_q2.fillna(0)
df_q2["prod_cat1"] = df_q2["prod_cat1"].astype("int").astype("str")
df_q2["prod_cat2"] = df_q2["prod_cat2"].astype("int").astype("str")
df_q2["prod_cat3"] = df_q2["prod_cat3"].astype("int").astype("str")
df_q2["prod_cat" ] = df_q2["prod_cat1"] + "-" + df_q2["prod_cat2"] + "-" + df_q2["prod_cat3"]

In [18]:
df_q2.head(2)

Unnamed: 0,user,marital,prod_cat1,prod_cat2,prod_cat3,prod_cat
0,3,0,1,2,0,1-2-0
1,5,1,8,0,0,8-0-0


In [19]:
# df_q2_agg = df_q2.groupby("user")["prod_cat"].nunique()
df_q2_agg = df_q2.groupby(["user", "marital"])["prod_cat"].nunique()
df_q2_agg = df_q2_agg.reset_index()
df_q2_agg.head()

Unnamed: 0,user,marital,prod_cat
0,3,0,18
1,5,1,43
2,8,1,32
3,9,0,31
4,11,0,34


In [20]:
df_q2_agg = df_q2_agg.reset_index()
df_q2_agg.head()

Unnamed: 0,index,user,marital,prod_cat
0,0,3,0,18
1,1,5,1,43
2,2,8,1,32
3,3,9,0,31
4,4,11,0,34


In [21]:
stat_m0 = df_q2_agg.loc[df_q2_agg["marital"] == 0, "prod_cat"].mean()
stat_m1 = df_q2_agg.loc[df_q2_agg["marital"] == 1, "prod_cat"].mean()
round(abs(stat_m0 - stat_m1), 2)

0.13

### Q3.

In [22]:
df_user = df[["user", "gender", "age_group", "job", "city", "marital"]]
df_user = df_user.drop_duplicates().reset_index(drop = True)
df_user.head(2)

Unnamed: 0,user,gender,age_group,job,city,marital
0,1,F,0-17,10,A,0
1,2,M,55+,16,C,0


In [23]:
df["user"].nunique(), len(df_user)

(5891, 5891)

In [24]:
df_agg = df.groupby("user")[["prod", "purchase"]].agg({"prod": "nunique",
                                                       "purchase": "sum"})
df_agg = df_agg.reset_index()
df_agg.head(2)

Unnamed: 0,user,prod,purchase
0,1,35,334093
1,2,77,810472


In [25]:
df_q3 = df_user.merge(df_agg, on = "user")
df_q3.head(3)

Unnamed: 0,user,gender,age_group,job,city,marital,prod,purchase
0,1,F,0-17,10,A,0,35,334093
1,2,M,55+,16,C,0,77,810472
2,3,M,26-35,15,A,0,29,341635


In [26]:
df_q3["gender"] = df_q3["gender"].replace({"M": 1, "F": 0})

In [27]:
arr_u = df_q3["age_group"].unique()
arr_u.sort()
arr_u

array(['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+'],
      dtype=object)

In [28]:
ser_repl = pd.Series(range(len(arr_u)), index = arr_u)
ser_repl

0-17     0
18-25    1
26-35    2
36-45    3
46-50    4
51-55    5
55+      6
dtype: int64

In [29]:
pd.crosstab(df_q3["age_group"], df_q3["age_group"].replace(ser_repl))

age_group,0,1,2,3,4,5,6
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-17,218,0,0,0,0,0,0
18-25,0,1069,0,0,0,0,0
26-35,0,0,2053,0,0,0,0
36-45,0,0,0,1167,0,0,0
46-50,0,0,0,0,531,0,0
51-55,0,0,0,0,0,481,0
55+,0,0,0,0,0,0,372


In [30]:
df_q3["age_group"] = df_q3["age_group"].replace(ser_repl)

In [31]:
df_q3_dum = pd.get_dummies(df_q3, columns = ["job", "city"],
                           dtype = "int")
df_q3_dum.head(2)

Unnamed: 0,user,gender,age_group,marital,prod,purchase,job_0,job_1,job_2,job_3,...,job_14,job_15,job_16,job_17,job_18,job_19,job_20,city_A,city_B,city_C
0,1,0,0,0,35,334093,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,1,6,0,77,810472,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [32]:
arr_q3_nor = MinMaxScaler().fit_transform(df_q3_dum.drop(columns = "user"))
arr_q3_nor[:1, ]

array([[0.        , 0.        , 0.        , 0.02843137, 0.02739807,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        ]])

In [33]:
arr_q3_nor2 = df_q3_dum.copy()
arr_q3_nor2.loc[:,'gender':'city_C'] =  MinMaxScaler().fit_transform(df_q3_dum.loc[:,'gender':'city_C'])
arr_q3_nor2

Unnamed: 0,user,gender,age_group,marital,prod,purchase,job_0,job_1,job_2,job_3,...,job_14,job_15,job_16,job_17,job_18,job_19,job_20,city_A,city_B,city_C
0,1,0,0.000000,0,0.028431,0.027398,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,1,1.000000,0,0.069608,0.072810,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,3,1,0.333333,0,0.022549,0.028117,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,4,1,0.666667,1,0.007843,0.015232,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,1,0.333333,1,0.098039,0.073813,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5886,4588,0,0.333333,0,0.013725,0.008990,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5887,4871,1,0.166667,0,0.005882,0.005897,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5888,4113,1,0.500000,0,0.013725,0.015907,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5889,5391,1,0.333333,0,0.000980,0.001287,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
model_kmeans = KMeans(n_clusters = 7, random_state = 123)
model_kmeans.fit(arr_q3_nor2.loc[:,'gender':'city_C'])



In [38]:
val_sil = silhouette_score(arr_q3_nor, 
                           labels = model_kmeans.labels_)
val_sil

0.17924689113287728

In [36]:
round(val_sil, 2)

0.18