In [864]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from lazypredict.Supervised import LazyClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [865]:
data = pd.read_csv('data/train.csv')

In [866]:
data['Product_ID'].value_counts()

Product_ID
104    79887
101    59638
105    35597
99     25816
102    14151
73     10847
74      5898
100     5833
72      5225
127     1509
128     1178
69      1177
129      935
68       830
71       685
126       47
70         9
Name: count, dtype: int64

## Data Preprocessing

In [867]:

# Creating conditional columns based on usage_Type for the 'Volume_KB_SC_Nbr'
data['Volume_KB_Data'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] == 'DATA' else 0, axis=1)
data['Volume_SC_Voice'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] == 'VOICE' else 0, axis=1)
data['Nb_Usage_Others'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] not in ['VOICE', 'DATA'] else 0, axis=1)

In [868]:
# One-hot encoding for 'usage_Type' and 'Destination'
usage_type_dummies = pd.get_dummies(data['usage_Type'], prefix='Type')
destination_dummies = pd.get_dummies(data['Destination'], prefix='Dest')

In [869]:
# Concatenate these new dummy variables with the main dataframe
data = pd.concat([data, usage_type_dummies, destination_dummies], axis=1)

In [870]:
data.columns

Index(['SUBSCRIPTION_DATE', 'subscribers', 'Product_ID', 'NB_SUBSCRIPTION',
       'USAGE_DATE', 'usage_Type', 'Destination', 'Amount_DZD',
       'Amount_data_DZD', 'Volume_Data_KB', 'Volume_KB_SC_Nbr', 'Nb_USAGE',
       'Ines', 'Volume_KB_Data', 'Volume_SC_Voice', 'Nb_Usage_Others',
       'Type_DATA', 'Type_SMS', 'Type_Transfert credit', 'Type_VAS',
       'Type_VOICE', 'Dest_DATA', 'Dest_FIX', 'Dest_Internationnal',
       'Dest_OFF-NET', 'Dest_ON-NET', 'Dest_Others', 'Dest_Transfert credit'],
      dtype='object')

In [871]:
aggregated_data = data.groupby(['Product_ID', 'subscribers', 'SUBSCRIPTION_DATE']).agg({
    'NB_SUBSCRIPTION': 'sum',
    'Amount_DZD': 'sum',
    'Amount_data_DZD': 'sum',
    'Volume_Data_KB': 'sum',
    'Volume_KB_Data': 'sum',
    'Volume_SC_Voice': 'sum',
    'Nb_Usage_Others': 'sum',
    'Nb_USAGE': 'sum',
    **{col: 'sum' for col in usage_type_dummies.columns},
    **{col: 'sum' for col in destination_dummies.columns}
}).reset_index()

In [872]:
aggregated_data['SUBSCRIPTION_DATE'] = pd.to_datetime(
    aggregated_data['SUBSCRIPTION_DATE'])
aggregated_data['Year'] = aggregated_data['SUBSCRIPTION_DATE'].dt.year
aggregated_data['Month'] = aggregated_data['SUBSCRIPTION_DATE'].dt.month
aggregated_data['Day'] = aggregated_data['SUBSCRIPTION_DATE'].dt.day
aggregated_data['DayOfWeek'] = aggregated_data['SUBSCRIPTION_DATE'].dt.dayofweek

# Optionally, create cyclical features for month and day of week
aggregated_data['Month_sin'] = np.sin(2 * np.pi * aggregated_data['Month']/12)
aggregated_data['Month_cos'] = np.cos(2 * np.pi * aggregated_data['Month']/12)
aggregated_data['DayOfWeek_sin'] = np.sin(2 * np.pi * aggregated_data['DayOfWeek']/7)
aggregated_data['DayOfWeek_cos'] = np.cos(2 * np.pi * aggregated_data['DayOfWeek']/7)

aggregated_data.drop(columns=['SUBSCRIPTION_DATE', 'subscribers', 'Month','Day'],inplace=True)

In [873]:
year_dummies = pd.get_dummies(aggregated_data['Year'], prefix='Year')
aggregated_data = pd.concat(
    [aggregated_data, year_dummies], axis=1)
aggregated_data.drop(columns=['Year'],inplace=True)

In [874]:
# aggregated_data.drop(columns=['Amount_data_DZD'], inplace=True)

## Train Test

In [875]:
data = aggregated_data

In [876]:
data.shape

(50775, 28)

In [877]:
X = data.drop('Product_ID', axis=1)
y = data['Product_ID']

In [878]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [879]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [880]:
# Initialize and train the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42  )
classifier.fit(X_train, y_train)

### train on all the dataset

In [881]:
# param_grid = {
#     'n_estimators': [100],  # Number of trees in the forest
#     # Number of features to consider at every split
#     'max_features': ['auto', 'sqrt'],
#     'max_depth': [None, 10, 20, 30],   # Maximum number of levels in tree
#     # Minimum number of samples required to split a node
#     'min_samples_split': [2, 5],
#     # Minimum number of samples required at each leaf node
#     'min_samples_leaf': [1, 2, 4]
# }

In [882]:
# # Setup the grid search
# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42, class_weight="balanced"),
#     param_grid=param_grid,
#     cv=3,           # Number of folds in cross-validation
#     verbose=2,      # Controls the verbosity: the higher, the more messages
#     n_jobs=-1
# )

In [883]:
# grid_search.fit(X, y)  # Replace X and y with your features and target variable

In [884]:
# classifier = grid_search

classifier = RandomForestClassifier(
    n_estimators=100, max_features='sqrt', random_state=42, min_samples_leaf=1, min_samples_split=2, class_weight="balanced")

In [885]:
# Initialize and train the classifier
classifier.fit(X, y)

In [887]:
y_pred = classifier.predict(X_test)

In [888]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9351058591826686
Classification Report:
               precision    recall  f1-score   support

          68       0.78      1.00      0.88        25
          69       0.82      0.92      0.87        61
          71       0.83      0.97      0.90        31
          72       0.71      0.65      0.68       220
          73       0.87      0.82      0.84       477
          74       0.81      0.77      0.79       226
          99       0.96      0.97      0.96      1146
         100       0.96      1.00      0.98       218
         101       0.99      0.98      0.98      2383
         102       0.95      0.98      0.97       595
         104       0.98      0.92      0.95      3267
         105       0.91      0.95      0.93      1372
         126       0.43      1.00      0.60         3
         127       0.38      0.92      0.54        50
         128       0.42      1.00      0.59        44
         129       0.39      0.92      0.54        37

    accuracy               

In [889]:
data = pd.read_csv('data/TestDJEZZY.csv')

In [890]:

# Creating conditional columns based on usage_Type for the 'Volume_KB_SC_Nbr'
data['Volume_KB_Data'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] == 'DATA' else 0, axis=1)
data['Volume_SC_Voice'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] == 'VOICE' else 0, axis=1)
data['Nb_Usage_Others'] = data.apply(
    lambda x: x['Volume_KB_SC_Nbr'] if x['usage_Type'] not in ['VOICE', 'DATA'] else 0, axis=1)

In [891]:
# One-hot encoding for 'usage_Type' and 'Destination'
usage_type_dummies = pd.get_dummies(data['usage_Type'], prefix='Type')
destination_dummies = pd.get_dummies(data['Destination'], prefix='Dest')

In [892]:
# Concatenate these new dummy variables with the main dataframe
data = pd.concat([data, usage_type_dummies, destination_dummies], axis=1)

In [893]:
aggregated_data = data.groupby(['subscribers', 'SUBSCRIPTION_DATE']).agg({
    'NB_SUBSCRIPTION': 'sum',
    'Amount_DZD': 'sum',
    'Amount_data_DZD': 'sum',
    'Volume_Data_KB': 'sum',
    'Volume_KB_Data': 'sum',
    'Volume_SC_Voice': 'sum',
    'Nb_Usage_Others': 'sum',
    'Nb_USAGE': 'sum',
    **{col: 'sum' for col in usage_type_dummies.columns},
    **{col: 'sum' for col in destination_dummies.columns}
}).reset_index()

In [894]:
aggregated_data['SUBSCRIPTION_DATE'] = pd.to_datetime(
    aggregated_data['SUBSCRIPTION_DATE'])
aggregated_data['Year'] = aggregated_data['SUBSCRIPTION_DATE'].dt.year
aggregated_data['Month'] = aggregated_data['SUBSCRIPTION_DATE'].dt.month
aggregated_data['Day'] = aggregated_data['SUBSCRIPTION_DATE'].dt.day
aggregated_data['DayOfWeek'] = aggregated_data['SUBSCRIPTION_DATE'].dt.dayofweek

# Optionally, create cyclical features for month and day of week
aggregated_data['Month_sin'] = np.sin(2 * np.pi * aggregated_data['Month']/12)
aggregated_data['Month_cos'] = np.cos(2 * np.pi * aggregated_data['Month']/12)
aggregated_data['DayOfWeek_sin'] = np.sin(
    2 * np.pi * aggregated_data['DayOfWeek']/7)
aggregated_data['DayOfWeek_cos'] = np.cos(
    2 * np.pi * aggregated_data['DayOfWeek']/7)

save = aggregated_data.copy()
aggregated_data.drop(columns=['SUBSCRIPTION_DATE', 'subscribers','Month','Day' ], inplace=True)

year_dummies = pd.get_dummies(aggregated_data['Year'], prefix='Year')
aggregated_data = pd.concat(
    [aggregated_data, year_dummies], axis=1)
aggregated_data.drop(columns=['Year'], inplace=True)

In [895]:
X_f = aggregated_data
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_f)



In [896]:
y_pred = classifier.predict(X_scaled)

In [897]:
result_df = pd.DataFrame(
    {'subscribers': save['subscribers'], 'Product_ID': y_pred})

In [898]:
result_df['Product_ID'].value_counts()

Product_ID
101    726
104    508
99     198
105     25
102      9
100      2
73       1
Name: count, dtype: int64

In [832]:
len(result_df['subscribers'].unique())

1469

In [956]:
# Group by 'subscriber_id' and 'Product_ID' and count occurrences
product_counts = data.groupby(
    ['subscribers', 'Product_ID', 'usage_Type', 'SUBSCRIPTION_DATE']).size().reset_index(name='counts')


def get_weight(usage_type):
    if usage_type == 'DATA':
        return 2
    elif usage_type == 'VOICE' :
        return 1
    else:
        return 0
    

product_counts['sort_weight'] = product_counts['usage_Type'].apply(get_weight)
# Sort the data by 'subscriber_id' and 'counts' to put the highest counts at the top for each subscriber
# Sort the data by 'subscribers', 'sort_weight' (descending), and 'counts' (descending)
product_counts['SUBSCRIPTION_DATE'] = pd.to_datetime(
    product_counts['SUBSCRIPTION_DATE'])
product_counts_sorted = product_counts.sort_values(
    by=['subscribers', 'sort_weight', 'SUBSCRIPTION_DATE', 'counts'], ascending=[True, False, False,False])

# Drop duplicates to keep only the top (most frequent product) for each subscriber
most_frequent_products = product_counts_sorted.drop_duplicates(
    subset='subscribers', keep='first')