In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Final_Df.csv")

In [3]:
df.groupby("CustomerID")["Invoice_Value"].sum().quantile([0.33,0.66])

0.33    1089.502190
0.66    3380.198748
Name: Invoice_Value, dtype: float64

In [4]:
def value(row):
    if row["Invoice_Value"]<=1089:
        row["Value"]="Low"
    elif (row["Invoice_Value"]>=1089) & (row["Invoice_Value"]<=3380):
        row["Value"] = "Medium"
    else:
        row["Value"] = "High"
    return row

In [5]:
df.groupby("CustomerID").sum().apply(value,axis=1)["Value"]

CustomerID
12346       Low
12347      High
12348    Medium
12350    Medium
12356    Medium
          ...  
18259       Low
18260    Medium
18269       Low
18277       Low
18283      High
Name: Value, Length: 1468, dtype: object

In [6]:
data = df.drop(["Transaction_ID","Avg_Price","Product_SKU","Product_Description","Product_Category","Month","Coupon_Code","Discount_pct","Transaction_Date","Tenure_Months"],axis=1)

In [7]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,CustomerID,Gender,Location,Quantity,Delivery_Charges,Coupon_Status,GST,Invoice_Value
0,0,12346,F,New York,1,75.0,Used,0.18,91.51174
1,1,12346,F,New York,2,75.0,Used,0.1,83.47


In [8]:
data_final = data.groupby("CustomerID").agg({"Invoice_Value":mean,"Delivery_Charges":mean,"Quantity":mean,"Location":max,"Gender":max,"Coupon_Status":max,"Tenure_Months":max})

In [9]:
data_final["Value"] = df.groupby("CustomerID").sum().apply(value,axis=1)["Value"]

In [10]:
data_final.head()

Unnamed: 0_level_0,Invoice_Value,Delivery_Charges,Quantity,Location,Gender,Coupon_Status,Value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,174.98174,150.0,3,New York,F,Used,Low
12347,15686.84396,665.14,342,New York,M,Used,High
12348,1689.55594,197.15,209,California,M,Used,Medium
12350,1467.43528,127.88,21,California,M,Used,Medium
12356,2007.429,637.49,56,Chicago,F,Used,Medium


In [11]:
data_final = pd.get_dummies(data_final,columns = ["Location","Gender","Coupon_Status"])

In [12]:
data_final.columns

Index(['Invoice_Value', 'Delivery_Charges', 'Quantity', 'Value',
       'Location_California', 'Location_Chicago', 'Location_New Jersey',
       'Location_New York', 'Location_Washington DC', 'Gender_F', 'Gender_M',
       'Coupon_Status_Clicked', 'Coupon_Status_Not Used',
       'Coupon_Status_Used'],
      dtype='object')

In [13]:
X = data_final[['Invoice_Value', 'Delivery_Charges', 'Quantity',
       'Location_California', 'Location_Chicago', 'Location_New Jersey',
       'Location_New York', 'Location_Washington DC', 'Gender_F', 'Gender_M',
       'Coupon_Status_Clicked', 'Coupon_Status_Not Used',
       'Coupon_Status_Used']]
Y = data_final["Value"]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=123)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

log_reg = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
yhat = log_reg.predict(X_test)
print("Train set Accuracy: ", metrics.accuracy_score(y_train, log_reg.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))    

Train set Accuracy:  0.8816013628620102
Test set Accuracy:  0.8231292517006803


In [18]:
print(metrics.classification_report(y_test, yhat))   

              precision    recall  f1-score   support

        High       0.71      1.00      0.83        92
         Low       0.97      0.89      0.93       108
      Medium       0.82      0.57      0.68        94

    accuracy                           0.82       294
   macro avg       0.83      0.82      0.81       294
weighted avg       0.84      0.82      0.82       294

