### IMPORT 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np

#### Note
The segmentation of this dataset is called "rank", its based on RFM score (R * 100 + F * 10 + M):
- **Platinum Customer**     - rank 1 - rfm_score >= 444
- **Very Loyal**            - rank 2 - rfm_score >=433 and rfm_score < 444
- **Becoming Loyal**        - rank 3 - rfm_score >=421 and rfm_score < 433
- **Recent Customer**       - rank 4 - rfm_score >=344 and rfm_score < 421
- **Potential Customer**    - rank 5 - rfm_score >=323 and rfm_score < 344
- **Late Bloomer**          - rank 6 - rfm_score >=311 and rfm_score < 323
- **Loosing Customer**      - rank 7 - rfm_score >=224 and rfm_score < 311
- **High Risk Customer**    - rank 8 - rfm_score >=212 and rfm_score < 224
- **Almost Lost Customer**  - rank 9 - rfm_score >=124 and rfm_score < 212
- **Evasive Customer**      - rank 10 - rfm_score >=112 and rfm_score < 124
- **Lost Customer**         - rank 11 - rfm_score < 112

In [2]:
dataset_filename = "data/Customer_Trans_RFM_Analysis.csv"

In [3]:
dataset = pd.read_csv(dataset_filename)

### DATA EXPLORATION?

In [4]:
dataset.shape

(19354, 36)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19354 entries, 0 to 19353
Data columns (total 36 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   transaction_id                       19354 non-null  int64  
 1   product_id                           19354 non-null  int64  
 2   customer_id                          19354 non-null  int64  
 3   transaction_date                     19354 non-null  object 
 4   online_order                         19354 non-null  float64
 5   order_status                         19354 non-null  object 
 6   brand                                19354 non-null  object 
 7   product_line                         19354 non-null  object 
 8   product_class                        19354 non-null  object 
 9   product_size                         19354 non-null  object 
 10  list_price                           19354 non-null  float64
 11  standard_cost               

In [7]:
dataset.keys()

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost',
       'product_first_sold_date', 'Profit', 'first_name', 'last_name',
       'gender', 'past_3_years_bike_related_purchases', 'DOB', 'job_title',
       'job_industry_category', 'wealth_segment', 'deceased_indicator',
       'owns_car', 'tenure', 'Age', 'recency', 'frequency', 'monetary',
       'r_quartile', 'f_quartile', 'm_quartile', 'rfm_score',
       'detail_cust_title', 'rank', 'Age_Group'],
      dtype='object')

In [6]:
dataset.describe()

Unnamed: 0,transaction_id,product_id,customer_id,online_order,list_price,standard_cost,product_first_sold_date,Profit,past_3_years_bike_related_purchases,tenure,Age,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,rank,Age_Group
count,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0,19354.0
mean,9990.831353,45.800558,1738.379043,0.509559,1106.496326,555.726808,38203.330836,550.769518,48.952723,10.681823,44.4164,52.595381,6.607936,3636.377479,2.65821,2.632427,2.804743,294.950036,5.790018,49.870828
std,5773.155488,30.558698,1011.093965,0.499922,582.808381,405.72699,2873.922638,492.941763,28.632358,5.674004,12.610238,49.840399,2.321842,1839.247231,1.089212,1.116134,1.079133,112.660997,2.884116,12.656456
min,1.0,0.0,1.0,0.0,12.01,7.21,33259.0,4.8,0.0,1.0,20.0,0.0,1.0,15.08,1.0,1.0,1.0,111.0,1.0,30.0
25%,4996.25,18.0,856.25,0.0,575.27,215.14,35667.0,133.78,24.0,6.0,35.0,16.0,5.0,2304.6125,2.0,2.0,2.0,222.0,3.0,40.0
50%,9988.5,45.0,1735.0,1.0,1163.89,507.58,38216.0,445.21,48.0,11.0,44.0,38.0,6.0,3420.13,3.0,2.0,3.0,322.0,6.0,50.0
75%,14988.75,72.0,2614.0,1.0,1635.3,795.1,40672.0,827.16,73.0,15.0,54.0,74.0,8.0,4757.96,4.0,4.0,4.0,421.0,8.0,60.0
max,20000.0,100.0,3500.0,1.0,2091.47,1759.85,42710.0,1702.55,99.0,22.0,90.0,353.0,14.0,11668.95,4.0,4.0,4.0,444.0,11.0,100.0


### PREPROCESSING
- brand
- product line
- product class
- product size
- gender
- own cars

In [17]:
dataset['brand'].value_counts()

brand
Solex             4175
Giant Bicycles    3240
WeareA2B          3208
OHM Cycles        2979
Trek Bicycles     2921
Norco Bicycles    2831
Name: count, dtype: int64

In [18]:
dataset['product_line'].value_counts()

product_line
Standard    13870
Road         3876
Touring      1193
Mountain      415
Name: count, dtype: int64

In [20]:
dataset['product_class'].value_counts()

product_class
medium    13498
high       2946
low        2910
Name: count, dtype: int64

In [21]:
dataset['product_size'].value_counts()

product_size
medium    12703
large      3886
small      2765
Name: count, dtype: int64

In [27]:
# mapping features to numbers

# mapping brand names to numbers
dataset['brand'] = dataset['brand'].map({'Solex': 1, 'Giant Bicycles': 2, 'WeareA2B': 3, 
                                    'OHM Cycles': 4, 'Trek Bicycles':5, 'Norco Bicycles':6}).astype(int)

# mapping product line names to numbers
dataset['product_line'] = dataset['product_line'].map({'Mountain': 1, 'Road': 2, 'Touring': 3, 'Standard':4}).astype(int)

# mapping product class names to numbers
dataset['product_class'] = dataset['product_class'].map({'low': 1, 'medium': 2, 'high':3}).astype(int)

# mapping product size names to numbers
dataset['product_size'] = dataset['product_size'].map({'small': 1, 'medium': 2, 'large':3}).astype(int)

# mapping gender to numbers
dataset['gender'] = dataset['gender'].map({'Male': 0, 'Female': 1}).astype(int)

# mapping owns_car to numbers
dataset['owns_car'] = dataset['owns_car'].map({'Yes': 1, 'No': 0}).astype(int)

    

### TRAINSET / TESTSET

In [None]:
# labels are ranks
drop_elements = ['transaction_id','customer_id','transaction_date','oder_status', 'wealth_segment',
                 'first_name','last_name','DOB','job_title','job_industry','deceased','detai_cust_title']
# drop non-numeric columns
x = dataset.drop(drop_elements, axis=1)
y = dataset['rank']

In [12]:
# split 80/20 training set and testing set
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, y_train.size)
print(x_test.shape, y_test.size)


(15483, 33) 15483
(3871, 33) 3871


### RANDOM FOREEST

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Evaluate Random Forest with different numbers of trees
tree_counts = [10, 50, 100, 200, 300, 400, 500]
accuracies = []

for n_trees in tree_counts:
    rf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Find the optimal number of trees
optimal_trees = tree_counts[accuracies.index(max(accuracies))]
print(f"Optimal number of trees: {optimal_trees}")

# Plot the results
import matplotlib.pyplot as plt

plt.plot(tree_counts, accuracies, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Trees in Random Forest')
plt.show()

ValueError: could not convert string to float: 'Approved'

In [None]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()