In [39]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler

from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [40]:
import warnings
warnings.filterwarnings(action='ignore')

In [41]:
# Import the data
data = pd.read_csv('train_F3fUq2S.csv')

In [42]:
X = data.drop(['click_rate','campaign_id','is_timer'], axis =1)
y = data['click_rate']

In [43]:
cat_vars = X.select_dtypes(include='O').columns.to_list()
num_vars = X.select_dtypes(exclude='O').columns.to_list()

In [44]:
oh_encoder_var = [var for var in num_vars if len(X[var].unique()) < 4]
cont_vars = [var for var in num_vars if len(X[var].unique()) > 20]
freq_enc_var = [var for var in num_vars if (len(X[var].unique())>4 and len(X[var].unique()) < 20)]

In [45]:
freq_enc_var.append('times_of_day')

In [46]:
data['times_of_day'].value_counts()

Evening    1317
Noon        447
Morning     124
Name: times_of_day, dtype: int64

### Create a Dataprocessing Pipeline

In [47]:
from sklearn.pipeline import Pipeline

In [48]:
encoder_pipeline = Pipeline([
    ('Frequency Encoder', CountFrequencyEncoder(variables=cat_vars,encoding_method='frequency')),
    ('MinMax Scaler', MinMaxScaler())
])

In [49]:
encoder_pipeline.fit(X,y)

Pipeline(steps=[('Frequency Encoder',
                 CountFrequencyEncoder(encoding_method='frequency',
                                       variables=['times_of_day'])),
                ('MinMax Scaler', MinMaxScaler())])

In [50]:
X_enc = pd.DataFrame(encoder_pipeline.transform(X),columns=X.columns)

In [51]:
X_enc.head()

Unnamed: 0,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,no_of_CTA,mean_CTA_len,is_image,is_personalised,is_quote,is_emoticons,is_discount,is_price,is_urgency,target_audience
0,0.2,0.261719,0.234236,0.124113,0.833333,1.0,0.270746,0.4,0.604651,0.061224,0.179012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875
1,0.2,0.175781,0.057277,0.893617,0.833333,1.0,0.0,0.133333,0.255814,0.0,0.135802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625
2,0.2,0.195312,0.287353,0.042553,0.833333,1.0,0.270746,0.133333,0.255814,0.061224,0.141975,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,1.0
3,0.2,0.253906,0.247684,0.092199,0.666667,0.0,1.0,1.0,0.209302,0.081633,0.148148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625
4,0.2,0.277344,0.224611,0.08156,0.833333,1.0,0.270746,0.4,0.604651,0.061224,0.191358,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.875


### Create a RandomForest Model

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [53]:
param_grid = {
    'criterion': ['squared_error','absolute_error', 'poisson']
}

In [54]:
rf_model = RandomForestRegressor(oob_score=True)

In [55]:
cv_model = RandomizedSearchCV(estimator=rf_model,param_distributions=param_grid, verbose=2,cv=5)

In [56]:
cv_model.fit(X_enc,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ............................criterion=squared_error; total time=   0.6s
[CV] END ............................criterion=squared_error; total time=   0.6s
[CV] END ............................criterion=squared_error; total time=   0.6s
[CV] END ............................criterion=squared_error; total time=   0.6s
[CV] END ............................criterion=squared_error; total time=   0.6s
[CV] END ...........................criterion=absolute_error; total time=  13.6s
[CV] END ...........................criterion=absolute_error; total time=  14.0s
[CV] END ...........................criterion=absolute_error; total time=  13.2s
[CV] END ...........................criterion=absolute_error; total time=  14.1s
[CV] END ...........................criterion=absolute_error; total time=  16.6s
[CV] END ..................................criterion=poisson; total time=   1.6s
[CV] END ..................................criter

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(oob_score=True),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'poisson']},
                   verbose=2)

In [57]:
pd.DataFrame(cv_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.588406,0.01125,0.008522,0.000109,squared_error,{'criterion': 'squared_error'},0.357792,0.241943,0.062097,0.260125,-0.552531,0.073885,0.327435,3
1,14.279446,1.192453,0.008423,0.000115,absolute_error,{'criterion': 'absolute_error'},0.318152,0.294886,0.058493,0.305593,-0.397572,0.11591,0.27418,1
2,1.657125,0.085916,0.010401,0.000155,poisson,{'criterion': 'poisson'},0.257305,0.30866,0.046554,0.032827,-0.257662,0.077537,0.200525,2


### Create Evalution Metrics

In [58]:
def evalution_metrics(data,y_actual, y_pred, model):
    
    y_predict = model.predict(data)
    
    rmse = np.sqrt(mean_squared_error(y_actual,y_predict))
    r_score = r2_score(y_true=y_actual, y_pred= y_predict)
    
    return {'rmse':rmse, 'r2_score':r_score}
    
    

In [59]:
y_predict = cv_model.predict(X_enc)

In [60]:
## Check the r2_score and RMSE:
evalution_metrics(data=X_enc,y_actual=y, y_pred=y_predict, model=cv_model)

{'rmse': 0.02076123838082154, 'r2_score': 0.9392036550337497}

### Create the Submission File

In [61]:
test_data = pd.read_csv('test_Bk2wfZ3.csv')

In [62]:
X_test = test_data.drop(['campaign_id','is_timer'], axis =1)

In [63]:
test_enc = encoder_pipeline.transform(X_test)

In [64]:
test_predict = cv_model.predict(test_enc)

In [65]:
result = pd.DataFrame()
result['campaign_id'] = test_data['campaign_id']
result['click_rate'] = np.round(test_predict,2)

In [66]:
result.head()

Unnamed: 0,campaign_id,click_rate
0,1889,0.06
1,1890,0.57
2,1891,0.2
3,1892,0.23
4,1893,0.14


In [67]:
result.to_csv('submission.csv',index=False)