In [1]:
import warnings
# Install a blanket "ignore" filter: with no category/module arguments this
# suppresses every warning raised for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [3]:
# Read the input CSV into the working frame `df`.
# NOTE(review): the location is an absolute path, so the notebook only runs where
# this exact file exists — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer='/data/lens_demo_feature_engineering_2022_01_22_11_25_42_061.csv'
)

In [4]:
# Display the frame's dimensions as (rows, columns).
df.shape

(7043, 20)

In [5]:
# Print a per-column summary: name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   multiplelines     7043 non-null   object 
 7   internetservice   7043 non-null   object 
 8   onlinesecurity    7043 non-null   object 
 9   onlinebackup      7043 non-null   object 
 10  deviceprotection  7043 non-null   object 
 11  techsupport       7043 non-null   object 
 12  streamingtv       7043 non-null   object 
 13  streamingmovies   7043 non-null   object 
 14  contract          7043 non-null   object 
 15  paperlessbilling  7043 non-null   object 
 16  paymentmethod     7043 non-null   object 


In [7]:
# Number of distinct values per column.
df.nunique()

gender                 2
seniorcitizen          2
partner                2
dependents             2
tenure                73
phoneservice           2
multiplelines          3
internetservice        3
onlinesecurity         3
onlinebackup           3
deviceprotection       3
techsupport            3
streamingtv            3
streamingmovies        3
contract               3
paperlessbilling       2
paymentmethod          4
monthlycharges      1585
totalcharges        3846
churn                  2
dtype: int64

In [8]:
# Missing-value count per column.
df.isna().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [12]:
# Class balance of the `churn` column.
df['churn'].value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [15]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# every other column as numeric.
categorical_feature = (df.dtypes == object)   # boolean mask indexed by column name
numeric_feature = ~categorical_feature        # exact complement of the mask above

# Materialise the masks as plain Python lists of column names.
final_categorical_feature = [name for name in df.columns[categorical_feature]]
final_numeric_feature = [name for name in df.columns[numeric_feature]]

In [20]:
# Print the distinct values of each categorical column under a banner line.
for column_name in final_categorical_feature:
    banner = "\n************ {} ************".format(column_name)
    print(banner)
    print(df[column_name].unique())


************ gender ************
['Female' 'Male']

************ partner ************
['Yes' 'No']

************ dependents ************
['No' 'Yes']

************ phoneservice ************
['No' 'Yes']

************ multiplelines ************
['No phone service' 'No' 'Yes']

************ internetservice ************
['DSL' 'Fiber optic' 'No']

************ onlinesecurity ************
['No' 'Yes' 'No internet service']

************ onlinebackup ************
['Yes' 'No' 'No internet service']

************ deviceprotection ************
['No' 'Yes' 'No internet service']

************ techsupport ************
['No' 'Yes' 'No internet service']

************ streamingtv ************
['No' 'Yes' 'No internet service']

************ streamingmovies ************
['No' 'Yes' 'No internet service']

************ contract ************
['Month-to-month' 'One year' 'Two year']

************ paperlessbilling ************
['Yes' 'No']

************ paymentmethod ************
['Electronic check' '

In [22]:
# One-hot encode the two multi-level columns `contract` and `paymentmethod`.
# drop_first=True omits the first level of each column, so k levels yield
# k-1 indicator columns.
df_dummy = pd.get_dummies(
    data=df,
    columns=['contract', 'paymentmethod'],
    prefix=['contract', 'paymentmethod'],
    drop_first=True,
)
df_dummy.head(n=5)

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,...,streamingmovies,paperlessbilling,monthlycharges,totalcharges,churn,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,Yes,29.85,29,No,0,0,0,1,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,56.95,1889,No,1,0,0,0,1
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,Yes,53.85,108,Yes,0,0,0,0,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,No,No,42.3,1840,No,1,0,0,0,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,Yes,70.7,151,Yes,0,0,0,1,0


In [24]:
# `contract` and `paymentmethod` have already been one-hot encoded, so they are
# taken out of the list of columns that will be label-encoded.
# Filtering instead of list.remove() keeps this cell safe to re-run: remove()
# raises ValueError once the names are already gone. Slice assignment updates
# the existing list object in place, just as remove() did.
final_categorical_feature[:] = [
    column_name
    for column_name in final_categorical_feature
    if column_name not in ('contract', 'paymentmethod')
]

In [25]:
# Label-encode every column still listed in `final_categorical_feature`,
# replacing its values with integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`
# (column name -> fitted encoder). Re-fitting a single shared encoder on each
# column would leave it remembering only the classes of the last column, so the
# codes of the other columns could not be inverted or re-applied to new data.
le = LabelEncoder()  # keeps `le` bound even when there is nothing to encode
label_encoders = {}
for column_name in final_categorical_feature:
    le = LabelEncoder()
    df_dummy[column_name] = le.fit_transform(df_dummy[column_name])
    label_encoders[column_name] = le
# `le` ends up bound to the encoder fitted on the last column, as it was before.
df_dummy.head(5)

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,...,streamingmovies,paperlessbilling,monthlycharges,totalcharges,churn,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,0,1,0,1,0,1,0,0,2,...,0,1,29.85,29,0,0,0,0,1,0
1,1,0,0,0,34,1,0,0,2,0,...,0,0,56.95,1889,0,1,0,0,0,1
2,1,0,0,0,2,1,0,0,2,2,...,0,1,53.85,108,1,0,0,0,0,1
3,1,0,0,0,45,0,1,0,2,0,...,0,0,42.3,1840,0,1,0,0,0,0
4,0,0,0,0,2,1,0,1,0,0,...,0,1,70.7,151,1,0,0,0,1,0


In [26]:
# Shape of the fully duplicated rows (every occurrence after the first).
# The mask is taken from `df_dummy` itself rather than from the raw `df`, so the
# check describes the frame being inspected and does not rely on the two frames
# sharing the same index.
df_dummy.loc[df_dummy.duplicated()].shape

(22, 23)

In [27]:
# Remove fully duplicated rows, comparing all columns and keeping the first
# occurrence of each. The surviving rows keep their original index labels.
df_dummy = df_dummy.drop_duplicates(subset=None, keep='first')
df_dummy.shape

(7021, 23)

In [29]:
# Display the column labels of the encoded, de-duplicated frame.
df_dummy.columns

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'paperlessbilling', 'monthlycharges', 'totalcharges',
       'churn', 'contract_One year', 'contract_Two year',
       'paymentmethod_Credit card (automatic)',
       'paymentmethod_Electronic check', 'paymentmethod_Mailed check'],
      dtype='object')

In [30]:
# Write the preprocessed frame to a CSV in the current working directory.
# index=True is the pandas default, spelled out here because the row index is
# written as an extra, unnamed first column.
# NOTE(review): the index has gaps after the duplicate removal; if downstream
# readers do not need it, index=False would avoid an "Unnamed: 0" column on reload
# — confirm with the consumers of this file before changing.
df_dummy.to_csv(path_or_buf='telecom_churn_preprocess_data.csv', index=True)