In [1]:
# Getting data ready
import pandas as pd
import warnings
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

#### LOAD DATA

In [3]:
DATA_URL: str = "./Expresso_churn_dataset.csv"

In [4]:
df = pd.read_csv(DATA_URL)

In [5]:
df.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   REGION          object 
 2   TENURE          object 
 3   MONTANT         float64
 4   FREQUENCE_RECH  float64
 5   REVENUE         float64
 6   ARPU_SEGMENT    float64
 7   FREQUENCE       float64
 8   DATA_VOLUME     float64
 9   ON_NET          float64
 10  ORANGE          float64
 11  TIGO            float64
 12  ZONE1           float64
 13  ZONE2           float64
 14  MRG             object 
 15  REGULARITY      int64  
 16  TOP_PACK        object 
 17  FREQ_TOP_PACK   float64
 18  CHURN           int64  
dtypes: float64(12), int64(2), object(5)
memory usage: 312.2+ MB


In [7]:
df.shape

(2154048, 19)

In [8]:
# Copy data
data = df.copy()

#### REDUCE DATA

In [9]:
# drop unique identifier `user_id`
data.drop("user_id", axis=1, inplace=True)
data.shape

(2154048, 18)

In [10]:
# drop dupliate rows
data.drop_duplicates(inplace=True)
data.shape

(1509941, 18)

#### SPLIT DATA

In [11]:
# Split data into Train and Test Sets
from sklearn.model_selection import train_test_split

# Split into X & y
x = data.drop("CHURN", axis=1)
y = data["CHURN"]

# Split data into train and test
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [12]:
x_train.head()

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK
212196,DAKAR,K > 24 month,200.0,1.0,200.0,67.0,1.0,58.0,,,,,,NO,22,"Data: 200 F=100MB,24H",1.0
2125227,KOLDA,K > 24 month,200.0,1.0,202.0,67.0,2.0,,85.0,8.0,,,,NO,16,On net 200F=Unlimited _call24H,1.0
1241091,DAKAR,K > 24 month,82614.0,13.0,4003.0,1334.0,32.0,,434.0,1089.0,73.0,,2.0,NO,60,,
1170969,LOUGA,K > 24 month,1500.0,3.0,1492.0,497.0,5.0,243.0,29.0,25.0,0.0,,,NO,30,All-net 500F=2000F;5d,2.0
1592172,DAKAR,K > 24 month,1700.0,3.0,1700.0,567.0,4.0,2016.0,0.0,4.0,1.0,,,NO,15,VAS(IVR_Radio_Daily),2.0


#### HANDLE MISSING DATA

In [13]:
# Count Missing Values in Training Data Set
x_train.isna().sum()

REGION             275900
TENURE                  0
MONTANT            105369
FREQUENCE_RECH     105369
REVENUE             80207
ARPU_SEGMENT        80207
FREQUENCE           80207
DATA_VOLUME        391857
ON_NET             152356
ORANGE             203809
TIGO               517004
ZONE1             1073206
ZONE2             1098618
MRG                     0
REGULARITY              0
TOP_PACK           208209
FREQ_TOP_PACK      208209
dtype: int64

In [14]:
num_cols = x.select_dtypes(include=[np.number]).columns
num_cols

Index(['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
       'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
       'REGULARITY', 'FREQ_TOP_PACK'],
      dtype='object')

In [15]:
cat_cols = x.select_dtypes(include=[np.object]).columns
cat_cols

Index(['REGION', 'TENURE', 'MRG', 'TOP_PACK'], dtype='object')

In [16]:
# Fill Nan Using Sklearn
cat_imputer = Pipeline(
    steps = [
                ("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent"))
        ])

num_imputer = Pipeline(
    steps = [
                ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
                ('scaler', StandardScaler())
    ])

In [17]:
# Create Transformer
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_cols),
    ("num_imputer", num_imputer, num_cols)
])

In [18]:
# Fill x_train
filled_x_train = imputer.fit_transform(x_train)
x_train_f = pd.DataFrame(filled_x_train, columns=cat_cols.to_list() + num_cols.to_list())
x_train_f.isna().sum()


REGION            0
TENURE            0
MRG               0
TOP_PACK          0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
REGULARITY        0
FREQ_TOP_PACK     0
dtype: int64

In [19]:
label_enc = LabelEncoder()
x_train_f[cat_cols] = x_train_f[cat_cols].apply(label_enc.fit_transform)
x_train_f

Unnamed: 0,REGION,TENURE,MRG,TOP_PACK,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK
0,0,7,0,30,-0.793511,-0.839728,-0.773368,-0.773229,-0.924492,-0.313604,0.0,0.0,-0.0,0.0,0.0,-0.762226,-0.740806
1,6,7,0,102,-0.793511,-0.839728,-0.773081,-0.773229,-0.854146,-0.0,-0.243953,-0.471296,-0.0,0.0,0.0,-1.064265,-0.740806
2,0,7,0,17,11.309059,0.104389,-0.226731,-0.226878,1.256237,-0.0,0.176853,5.341338,1.04007,0.0,-0.553487,1.150684,0.0
3,7,7,0,17,-0.602605,-0.682375,-0.587658,-0.587806,-0.643108,-0.297255,-0.311475,-0.379886,-0.480876,0.0,0.0,-0.359508,-0.651339
4,0,7,0,119,-0.573235,-0.682375,-0.557761,-0.55762,-0.713454,-0.140563,-0.346442,-0.492804,-0.460042,0.0,0.0,-1.114605,-0.651339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1207947,0,7,0,17,-0.778826,-0.761052,-0.715729,-0.715877,-0.713454,-0.0,-0.346442,-0.492804,-0.0,0.0,0.0,-1.164944,0.0
1207948,2,7,0,17,1.08618,1.048507,1.094944,1.0948,0.904506,-0.31873,-0.123378,0.227726,2.706861,0.0,0.0,1.150684,1.40642
1207949,1,7,0,17,-0.793511,-0.839728,-0.772793,-0.772797,-0.924492,-0.0,0.0,0.0,-0.439207,0.0,0.0,0.596947,0.0
1207950,9,7,0,30,0.30787,1.048507,0.304958,0.304812,1.115545,0.689556,-0.328356,-0.503559,-0.460042,0.0,0.0,1.100345,0.780146


In [20]:
#Fill x_test
filled_x_test = imputer.fit_transform(x_test)
x_test_f = pd.DataFrame(filled_x_test, columns=cat_cols.to_list() + num_cols.to_list())

x_test_f.isna().sum()

REGION            0
TENURE            0
MRG               0
TOP_PACK          0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
REGULARITY        0
FREQ_TOP_PACK     0
dtype: int64

In [21]:
x_test_f[cat_cols] = x_test_f[cat_cols].apply(label_enc.fit_transform)

In [22]:
x_test_f

Unnamed: 0,REGION,TENURE,MRG,TOP_PACK,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK
0,0,7,0,12,0.0,0.0,-0.798427,-0.798574,-0.923931,-0.0,-0.287516,-0.390308,-0.47982,0.0,0.0,-0.308658,0.0
1,11,7,0,21,-0.249253,0.968021,-0.183975,-0.183978,0.691996,-0.00972,-0.337066,-0.468596,-0.0,-0.669814,0.0,0.1945,1.04706
2,2,2,0,21,-0.117666,0.968021,-0.146206,-0.146209,0.973027,-0.068777,-0.346734,-0.458158,-0.0,0.0,0.0,0.697658,0.510048
3,0,7,0,35,-0.746358,-0.838862,-0.72847,-0.728617,-0.923931,-0.256102,0.0,0.0,-0.0,0.0,0.0,-0.962763,-0.74298
4,1,7,0,12,-0.804841,-0.838862,-0.762805,-0.762952,-0.853673,-0.0,0.0,-0.489473,-0.0,0.0,0.0,-1.214342,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301984,0,7,0,81,-0.541668,-0.446061,-0.527324,-0.527328,-0.572642,-0.31979,0.207984,-0.181538,-0.0,0.0,0.0,-0.006763,-0.205968
301985,9,7,0,12,0.0,0.0,-0.770244,-0.770248,-0.853673,-0.0,0.0,-0.489473,-0.0,0.0,0.0,-1.616868,0.0
301986,0,7,0,35,1.285926,1.675063,1.28828,1.288135,1.254057,1.817834,-0.3419,-0.353773,-0.438564,0.0,-0.350289,0.295132,1.315566
301987,0,7,0,12,0.0,0.0,0.0,0.0,0.0,-0.137186,-0.343109,0.0,-0.0,0.0,0.0,-1.566552,0.0


#### OUTLIERS REMOVAL

In [23]:
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

In [25]:
# fig = make_subplots(rows=4, cols=4)
# fig.print_grid()
# row, col = (1, 1)
# for idx, column in enumerate(num_cols.to_list()):
#   print(row, col)
#   fig.add_trace(
#       go.Box(y=data[column], name=column.replace("_", " ").title()),
#       row=row, col=col
#     )
#   col += 1
#   if col == 5:
#     row += 1
#     col = 1
  
# fig.update_layout(title_text="Numerical Features Box Plots")
# fig.show()

In [26]:
x_train_f[num_cols] = x_train_f[num_cols][
    (np.abs(stats.zscore(x_train_f[num_cols])) < 3).all(axis=1)
]

TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

#### TRAIN & TEST MODEL

In [None]:
# Make & fit baseline model
clf = RandomForestClassifier()
clf.fit(x_train_f, y_train)

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_pred labels
    on a classification.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")
    
    return metric_dict

In [None]:
# Make baseline predictions
y_preds = clf.predict(x_test_f)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_test, y_preds)