In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import RandomizedSearchCV
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,PowerTransformer,StandardScaler
from scipy.stats import chi2_contingency
%matplotlib inline
# Show every column and every row when a DataFrame is displayed.
# Use the public ``pd.set_option`` entry point rather than ``pd.pandas.set_option``,
# which only resolves through an incidental attribute of the pandas package.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [None]:
# Load the training and testing CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/churn-risk-rate-hackerearth-ml/train.csv',
        '../input/churn-risk-rate-hackerearth-ml/test.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data
train_data.head()

In [None]:
# Preview the first rows of the raw testing data
test_data.head()

In [None]:
# Displaying total records in training and testing data
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape
print(f"There are {train_rows} number of rows and {train_cols} number of columns in training data")
print(f"There are {test_rows} number of rows and {test_cols} number of columns in testing data")

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the training data
train_data.describe()

In [None]:
# Column dtypes and non-null counts of the training data
train_data.info()

# Data Pre-processing

In [None]:
def check_null(train,test):
  '''Draw the null masks of both datasets as side-by-side heatmaps.

  The left panel shows ``train.isnull()`` and the right panel shows
  ``test.isnull()``; row tick labels are hidden.
  '''
  plt.figure(figsize=(16,9))
  for panel, frame in enumerate((train, test), start=1):
    plt.subplot(1,2,panel)
    sns.heatmap(frame.isnull(),yticklabels=False,cmap='viridis')
  plt.show()

check_null(train_data,test_data)

**From the above heatmap, three columns contain null values: 'region_category', 'preferred_offer_types' and 'points_in_wallet'. 'region_category' has the most null values in both datasets, while 'preferred_offer_types' has the fewest.**

In [None]:
def visualize_null_relationship(train):
    '''Plot the median target value for missing vs. present values of each column.

    For every column of ``train`` that has at least one null, the column is
    replaced (on a copy) by a 1/0 "is null" flag and the median
    'churn_risk_score' of each flag group is drawn as a bar chart.
    '''
    null_counts = train.isnull().sum()
    for feature in null_counts.index[null_counts >= 1]:
        flagged = train.assign(**{feature: train[feature].isnull().astype(int)})
        flagged.groupby(feature)['churn_risk_score'].median().plot.bar()
        plt.title(feature)
        plt.show()

visualize_null_relationship(train_data)

In [None]:
# Numerical variables: every column whose dtype is not object
numerical_features = train_data.columns[train_data.dtypes != 'O'].tolist()
print(f"The number of numerical features in training data is {len(numerical_features)}.")
train_data[numerical_features].head()

In [None]:
# Discrete features: numerical columns (target excluded) with at most 25 distinct values
discrete_features = [
    column
    for column in numerical_features
    if column != 'churn_risk_score' and train_data[column].nunique(dropna=False) <= 25
]
print(f"The number of discrete features are {len(discrete_features)} ")
train_data[discrete_features].head()

In [None]:
# Distribution of every numerical feature (the target included).
# Each plot is a histogram, so the y axis is a frequency count, not the churn score;
# the frame is only read, so no per-iteration copy is needed.
for feature in numerical_features:
  train_data[feature].hist(bins=45)
  plt.xlabel(feature)
  plt.ylabel('Count')
  plt.title(feature)
  plt.show()


**From visualization, it appears that some of the numerical features are skewed. Also, the target variable contains outliers.**

In [None]:
# Detecting outliers: one box plot per numerical feature
for column_name in numerical_features:
  train_data.boxplot(column=column_name)
  plt.title(column_name)
  plt.show()

In [None]:
# Scatter plot of every numerical feature against the target to inspect outliers further
target_values = train_data['churn_risk_score']
for column_name in numerical_features:
    plt.scatter(train_data[column_name], target_values)
    plt.xlabel(column_name)
    plt.ylabel('Churn_Risk')
    plt.title(column_name)
    plt.show()

**From the scatter plots and box plots, it appears that most of the columns contain outliers, including the target variable, which has records with a churn risk score of -1.**

In [None]:
# Pairwise correlation between the numerical features, drawn as an annotated heatmap
numeric_corr = train_data[numerical_features].corr()
plt.figure(figsize=(10,9))
sns.heatmap(numeric_corr, annot=True)

**It appears that none of the numerical columns are strongly correlated with one another.**

In [None]:
# Categorical features: every column that was not classified as numerical
categorical_features = train_data.columns[~train_data.columns.isin(numerical_features)].tolist()
print(f"Total number of categorical features are {len(categorical_features)}")
train_data[categorical_features].head()

In [None]:
# Determining the cardinality of the categorical features (a missing value counts as one level)
for column_name in categorical_features:
  cardinality = train_data[column_name].nunique(dropna=False)
  print(f"The name of the features is {column_name} and its cardinality is {cardinality} ")

**Most of the columns have high cardinality.**

In [None]:
# Chi-square test of independence between each categorical feature and the target
for feature in categorical_features:
    contingency_table = pd.crosstab(train_data[feature], train_data['churn_risk_score'])
    print(feature, chi2_contingency(contingency_table))

**From the chi-square statistics, the features region_category, membership_category, joined_through_referral, preferred_offer_types, medium_of_operation, avg_frequency_login_days, offer_application_preference and feedback are related to the target.**

In [None]:
# Count plots for the categorical columns with at most 10 distinct values.
# The grid height is derived from the number of selected columns so that the
# layout does not break when more (or fewer) than 12 columns qualify.
card_less_than_10 = [feature for feature in categorical_features if len(train_data[feature].unique())<=10]
n_plot_cols = 3
n_plot_rows = max(1, -(-len(card_less_than_10) // n_plot_cols))  # ceiling division
plt.figure(figsize=(30,9))
for i, feature in enumerate(card_less_than_10):
    plt.subplot(n_plot_rows,n_plot_cols,i+1)
    sns.countplot(y=train_data[feature])

From the above, following observations can be deduced:
Gender: Both number of Males and Females are equal.

    There is an 'unknown' category in the column
Joined through referral : Equally people joined/not joined.

    ? represents about status of 4500 unknown people.
Internet Option : Contains equal no. of subscribers.

Past Complaints: Contains equal no. of people.

Region_category :

              Shows about 14000 people living in Town.
              Shows about 4500 people living in Villages.
Preferred offer types: Mostly contains an equal distribution of people.

Used Special Discount :

         Shows 20000+ people applied for discount
         Shows about 16500 people didn't apply for it
Complaint_status:

             Shows most complaints are not applicable
Membership category :

Shows max. count of people with no and basic membership.
Shows least count for people with premium/platinum membership
Mode of Operation:

Shows most people use Desktop and smartphone for operation
Shows ? about 5800 unknown no of people.
Prefers offers: Shows max. count for people who prefer offers.

Feedback:

      Most of the people had given negative comments 

In [None]:
# Frequency of each value of the target column
train_data['churn_risk_score'].value_counts()

# Data Cleaning

In [None]:
# Recode the target on a copy of the raw data: -1 becomes 1 and 5 becomes 0;
# every other score is left unchanged.
training_data = train_data.copy()
training_data['churn_risk_score'] = training_data['churn_risk_score'].replace({-1: 1, 5: 0})
training_data.shape

In [None]:
# Bar chart of the class frequencies of the recoded target
training_data['churn_risk_score'].value_counts().plot.bar()

In [None]:
# Preview the training data after the target recoding
training_data.head()

In [None]:
# Handling missing values in numerical variables
def missing_numerical(train,test,feature):
  '''Impute a numerical column with the training median and flag the imputed rows.

  A ``<feature>_nan`` indicator column (1 where the value was missing, 0
  otherwise) is added to both frames, then the missing values are filled.
  The median is computed on ``train`` only and reused for ``test`` so that
  both frames are imputed with the same statistic and nothing is learned
  from the test set.

  Both frames are modified in place and are also returned as ``(train, test)``.
  '''
  median_value = train[feature].median()
  for frame in (train,test):
    is_missing = frame[feature].isnull()
    frame[feature+'_nan'] = is_missing.astype(int)
    frame[feature] = frame[feature].fillna(median_value)
  return train,test

# Impute 'points_in_wallet' and add its 'points_in_wallet_nan' indicator to both frames
training_data,test_data = missing_numerical(training_data,test_data,'points_in_wallet')

In [None]:
# The customer id does not contribute to the prediction, so it is removed from both
# frames; test_data itself keeps the column.
training_data = training_data.drop(columns=['customer_id'])
testing_data = test_data.drop(columns=['customer_id'])

In [None]:
# Handling categorical variables in the dataset.
# Order matters: categorical_null() picks features from this list by position.
categorical_var = ['region_category', 'preferred_offer_types','joined_through_referral','medium_of_operation']

def calc_mode(data,feature):
  '''Return the mode(s) of ``data[feature]`` as a Series (there may be several on ties).'''
  return data[feature].mode()

def categorical_null(train,test,features):
  '''Impute missing categorical values with the training mode and flag them.

  ``features`` is read by position:

  * ``features[0:2]`` are columns whose missing values are nulls. A
    ``<feature>_nan`` indicator (1/0) is added and nulls are replaced.
  * ``features[2:]`` are columns whose missing values are the literal "?".
    A ``<feature>_missing`` indicator (1/0) is added and "?" is replaced.
    (The previous ``features[3:]`` slice silently skipped ``features[2]``.)

  The fill value is the first mode of the *training* column and is used for
  both frames; for the "?" columns the mode is computed with "?" excluded so
  the placeholder can never be chosen as its own replacement. A column with no
  usable mode gets its indicator but is otherwise left unchanged.

  Both frames are modified in place and are also returned as ``(train, test)``.
  '''
  for feature in features[0:2]:
    modes = calc_mode(train,feature)
    for frame in (train,test):
      is_null = frame[feature].isnull()
      frame[feature+"_nan"] = is_null.astype(int)
      if len(modes) > 0:
        # .iloc[0] takes a single scalar even when several values tie for the mode
        frame[feature] = frame[feature].where(~is_null,modes.iloc[0])

  for feature in features[2:]:
    modes = calc_mode(train[train[feature]!="?"],feature)
    for frame in (train,test):
      is_unknown = frame[feature]=="?"
      frame[feature+"_missing"] = is_unknown.astype(int)
      if len(modes) > 0:
        frame[feature] = frame[feature].where(~is_unknown,modes.iloc[0])

  return train,test

# Impute the categorical columns listed in categorical_var in both frames
training_data,testing_data = categorical_null(training_data,testing_data,categorical_var)

In [None]:
# Per column: does the training data still contain any null value?
training_data.isnull().any()

In [None]:
# Per column: does the testing data still contain any null value?
testing_data.isnull().any()

# Performing Feature Engineering

In [None]:
# Feature engineering on numerical variables
# (age, days_since_last_login, avg_time_spent, avg_transaction_value, points_in_wallet).
# numerical_features is recomputed from the cleaned frame, this time without the target.
numerical_features = [feature for feature in training_data.columns if training_data[feature].dtypes!='O' and feature not in ['churn_risk_score']]
# feature_eng_numerical() reads this list by position (index 1 and index 2).
features = ["age","days_since_last_login","avg_time_spent"]
def encode_neg_val(data,feature):
  '''Handle negative values.

  Every value of ``data[feature]`` that is below zero is replaced by 0; other
  values (including nulls) are kept. The column is updated in place on
  ``data`` and the updated column is returned.
  '''
  column = data[feature]
  data[feature] = column.mask(column<0,0)
  return data[feature]

def dsl_eng(train,test,feature):
    ''' Feature Engineering Days Since Last Login

    Replaces the negative values of ``feature`` with 0 in both frames (via
    encode_neg_val) and returns ``(train, test)``.
    '''
    for frame in (train,test):
        frame[feature] = encode_neg_val(frame,feature)
    return train,test

def ats_eng(train,test,feature):
    ''' Feature Engineering Avg. time spent

    Replaces the negative values of ``feature`` with 0 in both frames (via
    encode_neg_val) and returns ``(train, test)``.
    '''
    cleaned_train = encode_neg_val(train,feature)
    cleaned_test = encode_neg_val(test,feature)
    train[feature] = cleaned_train
    test[feature] = cleaned_test
    return train,test

def feature_eng_numerical(train,test,features):
  ''' Feature Engineering Numerical Columns

  ``features`` is read by position: index 1 is handled by dsl_eng (days since
  last login) and index 2 by ats_eng (avg. time spent); index 0 is not used.
  Returns ``(train, test)``.
  '''
  days_since_login_col = features[1]
  avg_time_spent_col = features[2]
  train,test = dsl_eng(train,test,days_since_login_col)
  train,test = ats_eng(train,test,avg_time_spent_col)
  return train,test

# Apply the numerical feature engineering to both frames
training_data,testing_data = feature_eng_numerical(training_data,testing_data,features)
  


In [None]:
# Preview the training data after the numerical feature engineering
training_data.head()

In [None]:
# Preview the testing data after the numerical feature engineering
testing_data.head()

In [None]:
# Feature engineering on categorical variables.
# categorical_eng() reads this list by position (index 0, 1 and 2).
categorical_feat = ['joining_date','avg_frequency_login_days',"referral_id"]


def jd_eng(train,test,feature,present_yr=2021):
  ''' Feature Eng. Joining date

  Replace the date string in ``feature`` with the number of years between the
  joining year and ``present_yr``. The joining year is the text before the
  first "-" of each value, converted to int (so a non-numeric prefix raises).

  ``present_yr`` defaults to 2021, the value that used to be hard-coded; pass a
  different year to change the reference point.

  Both frames are modified in place and are also returned as ``(train, test)``.
  '''
  for frame in (train,test):
    joining_year = frame[feature].str.split("-",n=1).str[0].astype(int)
    frame[feature] = present_yr-joining_year
  return train,test

def calc_login_act(data,feature):
  '''Turn the text column ``feature`` into a float column.

  Values containing the text 'Error' are mapped to 0.0 and every other value
  is parsed as a number, so the column no longer mixes strings with floats.
  Missing values are also mapped to 0.0, matching the previous behaviour where
  a null never survived this step. A value that is neither numeric nor an
  'Error' marker raises ``ValueError`` instead of being carried along silently.

  The column is updated in place on ``data`` and the updated column is returned.
  '''
  values = data[feature]
  # na=True: treat nulls like 'Error' entries
  is_error = values.str.contains('Error',na=True)
  data[feature] = pd.to_numeric(values.where(~is_error)).fillna(0.0)
  return data[feature]

def alg_eng(train,test,feature):
  ''' Feature Eng. Avg Login Days

  Runs calc_login_act on ``feature`` for both frames and returns ``(train, test)``.
  '''
  for frame in (train,test):
    frame[feature] = calc_login_act(frame,feature)
  return train,test

def rid_eng(train,test,feature):
  ''' Feature Eng. Referral Id

  Count-encodes ``feature`` with an encoder fitted on ``train`` and applied to
  ``test``, then maps every count to a label: a count of 1 becomes
  "Not_Referred", a count below 20 becomes "Referred" and anything else
  becomes "Unknown". Returns ``(train, test)``.
  '''
  def count_to_label(count):
    if count==1:
      return "Not_Referred"
    if count<20:
      return "Referred"
    return "Unknown"

  encoder = ce.CountEncoder()
  train[feature] = encoder.fit_transform(train[feature])
  test[feature] = encoder.transform(test[feature])
  train[feature] = train[feature].apply(count_to_label)
  test[feature] = test[feature].apply(count_to_label)
  return train,test

def categorical_eng(train,test,features):
  ''' Feature Engineering Categorical Variables

  ``features`` is read by position: index 0 goes to jd_eng (joining date),
  index 1 to alg_eng (avg. login days) and index 2 to rid_eng (referral id).
  Returns ``(train, test)``.
  '''
  joining_date_col, login_days_col, referral_id_col = features[0], features[1], features[2]
  train,test = jd_eng(train,test,joining_date_col)
  train,test = alg_eng(train,test,login_days_col)
  train,test = rid_eng(train,test,referral_id_col)
  return train,test

# Apply the categorical feature engineering to both frames
training_data,testing_data = categorical_eng(training_data,testing_data,categorical_feat)

In [None]:
# Preview the training data after the categorical feature engineering
training_data.head()

In [None]:
# Preview the testing data after the categorical feature engineering
testing_data.head()

In [None]:
# Removing columns with high cardinality and little contribution from both frames
cols_to_remove = ['Name','security_no','last_visit_time']
training_data, testing_data = (
    frame.drop(columns=cols_to_remove) for frame in (training_data, testing_data)
)

In [None]:
# Separating independent and dependent features
X_train = training_data.copy()
y = X_train.pop('churn_risk_score')
X_test = testing_data.copy()

In [None]:
# Encoding categorical variables: the columns listed here are passed to label_encode()

cols_to_encode = ['membership_category',
              'gender','region_category','joined_through_referral',
               'preferred_offer_types','medium_of_operation','internet_option',
               'used_special_discount','offer_application_preference','past_complaint',
               'complaint_status','feedback',"referral_id"]
               

# Work on copies so X_train / X_test stay untouched by the encoding
X_train_le, X_test_le = X_train.copy(), X_test.copy()

def label_encode(data,features):
  '''One-hot encode ``features`` and return a new frame.

  Despite its name, this produces dummy (indicator) columns with
  ``pd.get_dummies``. The result holds the remaining columns of ``data``
  first, followed by the dummy columns; ``data`` itself is not modified.
  '''
  encoded = pd.get_dummies(data[features])
  untouched = data.drop(columns=features)
  return pd.concat([untouched,encoded],axis=1)

# One-hot encode both frames. They are encoded independently, so the test frame is
# then aligned to the training columns: a category that never occurs in the test
# set becomes an all-zero column and a category unseen in training is dropped.
# Encoding starts from X_train / X_test so that re-running this cell is safe.
X_train_le = label_encode(X_train,cols_to_encode)
X_test_le  = label_encode(X_test,cols_to_encode).reindex(columns=X_train_le.columns,fill_value=0)


In [None]:
# Preview the encoded training features
X_train_le.head()

In [None]:
# Preview the encoded testing features
X_test_le.head()

# Feature Scaling

In [None]:
# Make the numerical variables more Gaussian-like with a Yeo-Johnson power transform.
# The transformer is fitted on the training data only and reused for the test data.
pt = PowerTransformer(method='yeo-johnson',standardize=False)
cols_to_pt = ['age','points_in_wallet','avg_time_spent','avg_transaction_value']

# Keep the original row index on the transformed columns so that the concat below
# aligns rows by label rather than relying on both frames having a default index.
X_train_pt = pd.DataFrame(pt.fit_transform(X_train_le[cols_to_pt]),columns=cols_to_pt,index=X_train_le.index)
X_train_transformed = X_train_le.drop(columns=cols_to_pt)
X_train_ptransformed = pd.concat([X_train_pt,X_train_transformed],axis=1)

X_test_pt  = pd.DataFrame(pt.transform(X_test_le[cols_to_pt]),columns=cols_to_pt,index=X_test_le.index)
X_test_transformed = X_test_le.drop(columns=cols_to_pt)
# Same layout as the training frame (transformed columns first). The test frame used
# to be concatenated in the opposite order, so its columns did not line up with the
# columns the scaler and the models are fitted on.
X_test_ptransformed = pd.concat([X_test_pt,X_test_transformed],axis=1)

In [None]:
# Standardizing using StandardScaler.
# The scaler is fitted on the training data only; the test data is transformed with
# the training mean/scale (calling fit_transform on it would re-fit the scaler on
# the test set and put the two frames on different scales).
sc = StandardScaler()
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train_ptransformed),columns= X_train_ptransformed.columns)
# Select the test columns in training order so every column meets its own statistics;
# this raises a KeyError if the test frame lacks a training column.
X_test_scaled = pd.DataFrame(sc.transform(X_test_ptransformed[X_train_ptransformed.columns]),columns = X_train_ptransformed.columns)

In [None]:
# Preview the scaled training features
X_train_scaled.head()

In [None]:
# Preview the scaled testing features
X_test_scaled.head()

# Feature Selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble on all training rows to obtain feature importances.
# The seed is fixed (same value as the data splits) so the ranking is reproducible.
model=ExtraTreesClassifier(random_state=1)
model.fit(X_train_scaled,y)

In [None]:
# Horizontal bar chart of the (at most) 66 largest feature importances
feat_importances = pd.Series(data=model.feature_importances_, index=X_train_scaled.columns)
feat_importances.nlargest(n=66).plot.barh()

**We will keep all the features in our training and testing data, since all of them contribute to the prediction.**

In [None]:
# Data splitting: hold out 10% of the rows as a test split, then 10% of the
# remainder as a validation split. Both splits are stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    X_train_scaled, y, test_size=0.1, stratify=y, random_state=1
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=1
)

# Model Building

**1. Logistic Regression.**

In [None]:
# Logistic regression, scored on the validation split with the macro-averaged F1
model_1 = LogisticRegression(max_iter=400)
model_1.fit(x_train, y_train.to_numpy().ravel())
predictions_1 = model_1.predict(x_valid)
score_1 = f1_score(y_true=y_valid, y_pred=predictions_1, average='macro')
score_1

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_1 = model_1.predict(x_test)
score_1 = f1_score(y_true=y_test, y_pred=predictions_1, average='macro')
score_1

**2. KNN**

In [None]:
# k-nearest neighbours with default settings, scored on the validation split
model_2 = KNeighborsClassifier()
model_2.fit(x_train, y_train)
predictions_2 = model_2.predict(x_valid)
score_2 = f1_score(y_true=y_valid, y_pred=predictions_2, average='macro')
score_2

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_2 = model_2.predict(x_test)
score_2 = f1_score(y_true=y_test, y_pred=predictions_2, average='macro')
score_2

**3. SVM**

In [None]:
# Support vector classifier with default settings, scored on the validation split
model_3 = SVC()
model_3.fit(x_train, y_train.to_numpy().ravel())
predictions_3 = model_3.predict(x_valid)
score_3 = f1_score(y_true=y_valid, y_pred=predictions_3, average='macro')
score_3

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_3 = model_3.predict(x_test)
score_3 = f1_score(y_true=y_test, y_pred=predictions_3, average='macro')
score_3

**4. Decision Tree**

In [None]:
# Decision tree, scored on the validation split with the macro-averaged F1.
# The seed is fixed (same value as the data splits) so the score is reproducible.
model_4 = DecisionTreeClassifier(random_state=1)
model_4.fit(x_train,y_train.values.ravel())
predictions_4 = model_4.predict(x_valid)
score_4 = f1_score(y_valid,predictions_4,average='macro')
score_4

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_4 = model_4.predict(x_test)
score_4 = f1_score(y_true=y_test, y_pred=predictions_4, average='macro')
score_4

**5. Random Forest**

In [None]:
# Random forest, scored on the validation split with the macro-averaged F1.
# The seed is fixed (same value as the data splits) so the score is reproducible.
model_5 = RandomForestClassifier(random_state=1)
model_5.fit(x_train,y_train.values.ravel())
predictions_5 = model_5.predict(x_valid)
score_5 = f1_score(y_valid,predictions_5,average='macro')
score_5

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_5 = model_5.predict(x_test)
score_5 = f1_score(y_true=y_test, y_pred=predictions_5, average='macro')
score_5

**6. XGBOOST**

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with default settings, scored on the validation split
model_6 = XGBClassifier()
model_6.fit(x_train, y_train.to_numpy().ravel())
predictions_6 = model_6.predict(x_valid)
score_6 = f1_score(y_true=y_valid, y_pred=predictions_6, average='macro')
score_6

In [None]:
# Same model on the held-out test split (overwrites the validation predictions and score)
predictions_6 = model_6.predict(x_test)
score_6 = f1_score(y_true=y_test, y_pred=predictions_6, average='macro')
score_6

**7. CatBoost Classifier**

In [None]:
import catboost as cb
# CatBoost classifier with hand-picked hyper-parameters; the validation split is
# passed as the evaluation set and the best evaluation scores are printed.
cat_model = cb.CatBoostClassifier(
    verbose=2,
    iterations=90,
    depth=3,
    learning_rate=0.2,
    bagging_temperature=0.8,
    border_count=236,
    l2_leaf_reg=2,
)
cat_model.fit(x_train, y_train, eval_set=(x_valid, y_valid))
print(cat_model.best_score_)

In [None]:
# CatBoost on the held-out test split, scored with the macro-averaged F1
predictions_8 = cat_model.predict(x_test)
score_8 = f1_score(y_true=y_test, y_pred=predictions_8, average='macro')
score_8

In [None]:
# Per-class precision, recall and F1 of the CatBoost predictions on the test split
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=predictions_8)
print(cr)

**Since the CatBoost algorithm gives the highest F1-score, we choose it as the final model for prediction.**

# Prediction on Test Data

In [None]:
# Predict the scaled test features and put each prediction next to its customer id
final_predictions = pd.DataFrame(cat_model.predict(X_test_scaled), columns=['churn_risk_score'])
final_predictions = pd.concat([test_data["customer_id"], final_predictions], axis=1)
final_predictions.head()


In [None]:
# Frequency of each predicted class
final_predictions['churn_risk_score'].value_counts()

In [None]:
# Map class 0 back to 5, undoing the 5 -> 0 recoding applied to the target before training
predicted_score = final_predictions['churn_risk_score']
final_predictions['churn_risk_score'] = predicted_score.mask(predicted_score == 0, 5)