In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
import openpyxl
dfs = pd.read_excel('./online_retail_II.xlsx', sheet_name=None)

In [3]:
print(dfs.keys())

dict_keys(['Year 2009-2010', 'Year 2010-2011'])


In [4]:
df1 = dfs['Year 2010-2011']

In [5]:
df1.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [6]:
df1.describe()

Unnamed: 0,Quantity,InvoiceDate,Price,Customer ID
count,541910.0,541910,541910.0,406830.0
mean,9.552234,2011-07-04 13:35:22.342307584,4.611138,15287.68416
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.080957,,96.759765,1713.603074


In [7]:
df1.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Quantity,541910.0,9.552234,-80995.0,1.0,3.0,10.0,80995.0,218.080957
InvoiceDate,541910.0,2011-07-04 13:35:22.342307584,2010-12-01 08:26:00,2011-03-28 11:34:00,2011-07-19 17:17:00,2011-10-19 11:27:00,2011-12-09 12:50:00,
Price,541910.0,4.611138,-11062.06,1.25,2.08,4.13,38970.0,96.759765
Customer ID,406830.0,15287.68416,12346.0,13953.0,15152.0,16791.0,18287.0,1713.603074


In [8]:
correlation_matrix = df1.select_dtypes(include=['number']).corr()  
print(correlation_matrix)

             Quantity     Price  Customer ID
Quantity     1.000000 -0.001235     -0.00360
Price       -0.001235  1.000000     -0.00456
Customer ID -0.003600 -0.004560      1.00000


# Classification

In [9]:
label_encoder = LabelEncoder()

# Encode non-numeric columns for demonstration
for col in ['Invoice', 'StockCode', 'Description', 'Country', 'InvoiceDate']:
    df1[col] = label_encoder.fit_transform(df1[col].astype(str))

In [10]:
df1 = df1.dropna()

In [11]:
# Raw
X = df1.drop('Quantity', axis=1)  
y = df1['Quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(284781, 7) (122049, 7) (284781,) (122049,)


In [13]:
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

In [14]:
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5125154651000827


## Parameter tunning

In [15]:
#Define parameter grid
param_grid = {
    'n_estimators': [10],
    'max_depth': [5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

#Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)

#Use GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

#Fit the model
grid_search.fit(X_train, y_train)

#Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}
Best Score: 0.29461236876888985


## With SMOTE

In [19]:
print(y_train.value_counts())

Quantity
 1      51297
 12     41978
 2      40578
 6      26376
 4      22529
        ...  
 109        1
-45         1
 348        1
-960        1
 87         1
Name: count, Length: 381, dtype: int64


In [29]:
smote = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)  

# Train Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

# Predictions
y_pred = model.predict(X_test)

# # Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1

## With SMOTE and param tuning

In [27]:
#Define parameter grid
param_grid = {
    'n_estimators': [10],
    'max_depth': [5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

#Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)

#Use GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

#Fit the model
grid_search.fit(X_resampled, y_resampled)

#Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

NameError: name 'X_resampled' is not defined