In [10]:
import pandas as pd

In [11]:
churn_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/churn.csv.txt', parse_dates=['last_trip_date','signup_date'])

### is there any churn in the data
* Many times target is not directly given 
* It has to be derived from the feature columns 
* From the data,we need to identify the data on which the data was downloaded 

In [12]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


In [13]:
churn_data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [14]:
last_date = churn_data.last_trip_date.max()

In [15]:
import datetime

In [16]:
cutoff_date = last_date-datetime.timedelta(30)

* If an user did't come after date he/she is considered as churn 

In [17]:
churn_data['churn'] = churn_data.last_trip_date.map(lambda d: 'Not churn' if d > cutoff_date else 'churn')

In [18]:
churn_data.sample()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
34198,5.22,4.9,4.8,1.04,Winterfell,2014-06-11,iPhone,2014-01-16,3.8,1,True,84.6,Not churn


In [19]:
churn_data.dtypes

avg_dist                         float64
avg_rating_by_driver             float64
avg_rating_of_driver             float64
avg_surge                        float64
city                              object
last_trip_date            datetime64[ns]
phone                             object
signup_date               datetime64[ns]
surge_pct                        float64
trips_in_first_30_days             int64
luxury_car_user                     bool
weekday_pct                      float64
churn                             object
dtype: object

In [20]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   avg_dist                50000 non-null  float64       
 1   avg_rating_by_driver    49799 non-null  float64       
 2   avg_rating_of_driver    41878 non-null  float64       
 3   avg_surge               50000 non-null  float64       
 4   city                    50000 non-null  object        
 5   last_trip_date          50000 non-null  datetime64[ns]
 6   phone                   49604 non-null  object        
 7   signup_date             50000 non-null  datetime64[ns]
 8   surge_pct               50000 non-null  float64       
 9   trips_in_first_30_days  50000 non-null  int64         
 10  luxury_car_user         50000 non-null  bool          
 11  weekday_pct             50000 non-null  float64       
 12  churn                   50000 non-null  object

In [21]:
float_churn_data = churn_data.select_dtypes(include = ['float','bool'])

In [22]:
float_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   avg_dist              50000 non-null  float64
 1   avg_rating_by_driver  49799 non-null  float64
 2   avg_rating_of_driver  41878 non-null  float64
 3   avg_surge             50000 non-null  float64
 4   surge_pct             50000 non-null  float64
 5   luxury_car_user       50000 non-null  bool   
 6   weekday_pct           50000 non-null  float64
dtypes: bool(1), float64(6)
memory usage: 2.3 MB


In [25]:
cat_churn_data = churn_data[['city','phone']]

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

In [27]:
float_pipeline = make_pipeline(SimpleImputer(strategy='median'),MinMaxScaler())

In [28]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),OrdinalEncoder())

In [29]:
churn_data['subscription_days'] = churn_data.last_trip_date - churn_data.signup_date

* timedelta to days conversion

In [31]:
churn_data['subscription_days']

0       143 days
1        96 days
2         1 days
3       170 days
4        47 days
          ...   
49995   131 days
49996     1 days
49997   111 days
49998     1 days
49999    92 days
Name: subscription_days, Length: 50000, dtype: timedelta64[ns]

In [32]:
churn_data['subscription_days'] = churn_data.subscription_days.dt.days

In [33]:
churn_data['subscription_days']

0        143
1         96
2          1
3        170
4         47
        ... 
49995    131
49996      1
49997    111
49998      1
49999     92
Name: subscription_days, Length: 50000, dtype: int64

In [34]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   avg_dist                50000 non-null  float64       
 1   avg_rating_by_driver    49799 non-null  float64       
 2   avg_rating_of_driver    41878 non-null  float64       
 3   avg_surge               50000 non-null  float64       
 4   city                    50000 non-null  object        
 5   last_trip_date          50000 non-null  datetime64[ns]
 6   phone                   49604 non-null  object        
 7   signup_date             50000 non-null  datetime64[ns]
 8   surge_pct               50000 non-null  float64       
 9   trips_in_first_30_days  50000 non-null  int64         
 10  luxury_car_user         50000 non-null  bool          
 11  weekday_pct             50000 non-null  float64       
 12  churn                   50000 non-null  object

In [35]:
int_churn_data = churn_data.select_dtypes(include=['int'])

In [36]:
int_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Empty DataFrame

In [37]:
int_pipeline = make_pipeline(MinMaxScaler())

In [38]:
from sklearn.compose import make_column_transformer

In [39]:
preprocessor = make_column_transformer(
    (int_pipeline,int_churn_data.columns),
    (cat_pipeline,cat_churn_data.columns),
    (float_pipeline,float_churn_data.columns),
)

In [40]:
churn_data.churn.value_counts()

churn        31690
Not churn    18310
Name: churn, dtype: int64

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [43]:
pipeline = make_pipeline(preprocessor,RandomForestClassifier())

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
trainX,testX,trainY,testY = train_test_split(churn_data.drop(columns=['churn']), churn_data.churn)

In [46]:
pipeline.fit(trainX,trainY)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index([], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinalencoder',
                                                                   OrdinalEncoder())]),
                                                  Index(['city', 'phone'], dtype='object')),
                                                 ('pipeline-3',
                                                  Pipeline(steps

In [47]:
pipeline.score(testX,testY)

0.74752

In [48]:
from sklearn.metrics import f1_score,confusion_matrix

In [49]:
y_pred = pipeline.predict(testX)

In [50]:
y_pred

array(['Not churn', 'churn', 'Not churn', ..., 'churn', 'churn',
       'Not churn'], dtype=object)

In [51]:
y_true = testY

In [52]:
y_true

30987    Not churn
19911        churn
2477         churn
35341        churn
3932     Not churn
           ...    
8727         churn
26178    Not churn
41433        churn
3432         churn
49487    Not churn
Name: churn, Length: 12500, dtype: object

In [53]:
confusion_matrix(y_pred=y_pred,y_true=y_true)

array([[2894, 1616],
       [1540, 6450]], dtype=int64)

In [54]:
from sklearn.model_selection import GridSearchCV

In [59]:
gs = GridSearchCV(pipeline,param_grid={'randomforestclassifier__n_estimators':[300,400,500]},cv= 5,n_jobs=4)

In [60]:
gs.fit(trainX,trainY)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('minmaxscaler',
                                                                                          MinMaxScaler())]),
                                                                         Index([], dtype='object')),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ordinalencoder',
                                                                                   

In [61]:
gs.best_score_

0.7436799999999999

In [62]:
gs.best_params_

{'randomforestclassifier__n_estimators': 300}

In [63]:
pipeline = make_pipeline(preprocessor,GaussianNB())

In [64]:
pipeline.fit(trainX, trainY)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index([], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinalencoder',
                                                                   OrdinalEncoder())]),
                                                  Index(['city', 'phone'], dtype='object')),
                                                 ('pipeline-3',
                                                  Pipeline(steps

In [65]:
pipeline.score(testX,testY)

0.64512

In [66]:
for name,estimator in zip(['RandomForest','GaussianNB','LogisticReg'],[RandomForestClassifier(n_estimators=100),GaussianNB(),LogisticRegression()]):
    pipeline = make_pipeline(preprocessor,estimator)
    pipeline.fit(trainX,trainY)
    print(name,pipeline.score(testX,testY))

RandomForest 0.7508
GaussianNB 0.64512
LogisticReg 0.6836


## Further Fine tuning 
* Feature Selection
* Balancing Data 
* More Hyper-parameter tuning 
* Consider month column from date-time cols 