In [1]:
pip install imbalanced-learn



In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
#Online Retail Dataset from UCI Machine Learning Repository
df= pd.read_csv('Online Retail.csv', encoding='latin1')

In [4]:
#drop missing values
df = df.dropna()
df_online_retail = df.copy()

# Calculate the total amount
df_online_retail['Total'] = df_online_retail['Quantity'] * df_online_retail['UnitPrice']
df_online_retail

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/2010 08:26,2.55,17850.0,United Kingdom,15.30
1,536365,71053,WHITE METAL LANTERN,6,01/12/2010 08:26,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/2010 08:26,2.75,17850.0,United Kingdom,22.00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/2010 08:26,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/2010 08:26,3.39,17850.0,United Kingdom,20.34
...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,09/12/2011 12:50,0.85,12680.0,France,10.20
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,09/12/2011 12:50,2.10,12680.0,France,12.60
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,09/12/2011 12:50,4.15,12680.0,France,16.60
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,09/12/2011 12:50,4.15,12680.0,France,16.60


In [5]:
# Handle inconsistent datetime
df_online_retail['InvoiceDate'] = pd.to_datetime(df_online_retail['InvoiceDate'], errors='coerce')

# Group by customer ID
online_customer_summary = df_online_retail.groupby('CustomerID').agg({
    'InvoiceNo': 'count',
    'Total': 'sum',
    'InvoiceDate': lambda x: (x.max() - x.min()).days
}).reset_index()

online_customer_summary

Unnamed: 0,CustomerID,InvoiceNo,Total,InvoiceDate
0,12346.0,2,0.00,
1,12347.0,182,4310.00,420.0
2,12348.0,31,1797.24,0.0
3,12349.0,73,1757.55,
4,12350.0,17,334.40,0.0
...,...,...,...,...
4367,18280.0,10,180.60,0.0
4368,18281.0,7,80.82,0.0
4369,18282.0,13,176.60,208.0
4370,18283.0,756,2094.88,155.0


In [6]:
# Filter valid entries and set the threshold for frequent buyers
online_customer_summary = online_customer_summary[online_customer_summary['InvoiceDate'] > 0]
online_customer_summary

Unnamed: 0,CustomerID,InvoiceNo,Total,InvoiceDate
1,12347.0,182,4310.00,420.0
5,12352.0,95,1545.41,66.0
11,12358.0,19,1168.06,116.0
12,12359.0,254,6245.53,292.0
15,12362.0,274,5154.58,213.0
...,...,...,...,...
4356,18263.0,62,1211.08,183.0
4361,18272.0,170,3064.78,156.0
4362,18273.0,3,204.00,64.0
4369,18282.0,13,176.60,208.0


In [7]:
#copy of the dataframe
online_retail_summary = online_customer_summary.copy()

threshold =100
online_retail_summary["Frequent_Byer"] = (online_retail_summary['InvoiceNo']>threshold).astype(int)

#Scale invoice and total using minmax scaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(online_retail_summary[['InvoiceNo', 'Total']])

#Train-test split
x_train,x_test,y_train,y_test = train_test_split(scaled_data,online_retail_summary['Frequent_Byer'],test_size=0.2,random_state=42)


In [8]:
#SMOTE for resampling
smote = SMOTE(random_state=42)
x_res,y_res = smote.fit_resample(x_train,y_train)
x_res.shape[0]

1356

In [9]:
#Parameters for GridSearchCV
param = {
    'C':[0.1,1,10,100],
    'kernel':['linear','rbf'],
    'gamma':['scale','auto']
}

svc=SVC()
grid_search=GridSearchCV(estimator=svc,param_grid=param,scoring='accuracy',cv=5)
grid_search.fit(x_res,y_res)

#Print the best parameters
print(f"best parameters{grid_search.best_params_}")
print(f"Best cross validation score{grid_search.best_score_}")

#Test with the new model
new_model = grid_search.best_estimator_
y_pred = new_model.predict(x_test)

#Test accuracy
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy {accuracy}")
print(f"Confusion Matrix: \n {confusion_matrix(y_test,y_pred)}")


best parameters{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross validation score1.0
Accuracy 0.9969879518072289
Confusion Matrix: 
 [[166   1]
 [  0 165]]
