In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [115]:
dataset = pd.read_json('dev_akademi.json')

In [116]:
dataset = dataset[['ad_bid_price_kurus','ad_call_to_action', 'ad_daily_budget_kurus', 'ad_title', 'ad_id', 'ad_description', 'event_type', 'event_category']]

In [117]:
# Data Preprocessing

In [118]:
df = dataset.dropna()
df = df[df.ad_call_to_action != ""]
df = df[df.event_type != ""]
df = df[df.event_category != ""]

In [119]:
# Feature Engineering

In [120]:
modify_df = pd.concat([df, pd.get_dummies(df.event_type, prefix='event')], axis=1)
modify_df = pd.concat([modify_df, pd.get_dummies(df.event_category, prefix='event_cat')], axis=1)

In [121]:
# word count
call_to_action_wc = modify_df.ad_call_to_action.apply(lambda x: len(x.split()))
description_wc = modify_df.ad_description.apply(lambda x: len(x.split()))
title_wc = modify_df.ad_title.apply(lambda x: len(x.split()))
modify_df = modify_df.assign(call_to_action_wc=call_to_action_wc)
modify_df = modify_df.assign(description_wc=description_wc)
modify_df = modify_df.assign(title_wc=title_wc)

In [122]:
cleaned_df = modify_df.drop(['ad_title', 'ad_description', 'event_type', 'event_category'], axis=1)

In [123]:
cleaned_df

Unnamed: 0,ad_bid_price_kurus,ad_call_to_action,ad_daily_budget_kurus,ad_id,event_CLICK,event_IMPRESSION,event_cat_Emlak,event_cat_Hayvanlar Alemi,event_cat_Vasıta,"event_cat_Yedek Parça, Aksesuar, Donanım & Tuning",event_cat_İkinci El ve Sıfır Alışveriş,event_cat_İş Makineleri & Sanayi,call_to_action_wc,description_wc,title_wc
0,1000.0,8b36e9 6f041af,200000.0,10,0,1,0,0,0,0,1,0,2,9,6
1,160.0,362131e 862,2000.0,13,0,1,1,0,0,0,0,0,2,9,4
2,160.0,362131e 862,2000.0,13,0,1,1,0,0,0,0,0,2,9,4
3,1000.0,8b36e9 6f041af,200000.0,10,0,1,0,0,0,0,1,0,2,9,6
4,1000.0,8b36e9 6f041af,200000.0,10,0,1,0,0,0,0,1,0,2,9,6
5,1000.0,8b36e9 6f041af,200000.0,10,0,1,0,0,0,0,1,0,2,9,6
6,120.0,1d4a27,120000.0,16,0,1,1,0,0,0,0,0,1,0,5
7,120.0,1d4a27,120000.0,16,0,1,1,0,0,0,0,0,1,6,5
8,260.0,bfff8af3f7609621ceb6,4000.0,18,0,1,1,0,0,0,0,0,1,12,11
9,400.0,9585a c28a 03715bdcf,40000.0,14,0,1,0,0,1,0,0,0,3,11,4


In [124]:
grouped_df = cleaned_df.groupby('ad_id')['event_CLICK', 'event_IMPRESSION'].sum()

In [125]:
conversion_rates = ((grouped_df.event_CLICK/grouped_df.event_IMPRESSION)*100).reset_index()
conversion_rates.columns = ['ad_id', 'rate']
conversion_rates = conversion_rates.set_index('ad_id')

In [126]:
indexed_df = cleaned_df.set_index('ad_id')

In [127]:
cleaned_df.ad_id.value_counts()

30     5978
153    4791
105    3792
71     3162
94     2874
11     2693
46     2366
49     2362
26     2346
61     1986
67     1985
85     1945
15     1936
3      1850
152    1699
24     1633
95     1508
32     1431
140    1290
162    1204
90     1190
111    1183
6      1107
107    1098
54     1079
74     1078
16     1064
177    1029
139     984
164     976
       ... 
170      88
193      86
186      86
51       85
163      84
147      84
149      83
12       82
133      82
190      81
197      78
173      77
97       77
72       76
145      75
154      72
200      68
179      68
109      67
37       66
150      65
58       63
151      61
146      53
199      47
194      46
55       45
178      43
185      41
76       41
Name: ad_id, Length: 188, dtype: int64

In [128]:
joined_df = indexed_df.join(conversion_rates)
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101189 entries, 1 to 200
Data columns (total 15 columns):
ad_bid_price_kurus                                   101189 non-null float64
ad_call_to_action                                    101189 non-null object
ad_daily_budget_kurus                                101189 non-null float64
event_CLICK                                          101189 non-null uint8
event_IMPRESSION                                     101189 non-null uint8
event_cat_Emlak                                      101189 non-null uint8
event_cat_Hayvanlar Alemi                            101189 non-null uint8
event_cat_Vasıta                                     101189 non-null uint8
event_cat_Yedek Parça, Aksesuar, Donanım & Tuning    101189 non-null uint8
event_cat_İkinci El ve Sıfır Alışveriş               101189 non-null uint8
event_cat_İş Makineleri & Sanayi                     101189 non-null uint8
call_to_action_wc                                    101189 no

In [112]:
# joined_df.drop_duplicates().sort_values(by=['rate'], ascending=False).rate.hist(bins=6) # over 2% is

In [130]:
to_scale_df = joined_df.drop(['ad_call_to_action', 'event_CLICK', 'event_IMPRESSION'], axis=1)

In [184]:
to_scale_df = to_scale_df.drop_duplicates()

In [185]:
to_scale_df

Unnamed: 0_level_0,ad_bid_price_kurus,ad_daily_budget_kurus,event_cat_Emlak,event_cat_Hayvanlar Alemi,event_cat_Vasıta,"event_cat_Yedek Parça, Aksesuar, Donanım & Tuning",event_cat_İkinci El ve Sıfır Alışveriş,event_cat_İş Makineleri & Sanayi,call_to_action_wc,description_wc,title_wc,rate
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,140.0,2000.0,1,0,0,0,0,0,2,8,6,1.754386
1,140.0,2000.0,1,0,0,0,0,0,2,0,6,1.754386
1,120.0,1200.0,1,0,0,0,0,0,2,8,6,1.754386
1,140.0,0.0,1,0,0,0,0,0,2,8,6,1.754386
1,120.0,2000.0,1,0,0,0,0,0,2,8,6,1.754386
2,200.0,2000.0,0,0,0,0,0,1,2,6,7,1.898734
3,200.0,40000.0,1,0,0,0,0,0,2,9,9,0.707676
3,200.0,40000.0,1,0,0,0,0,0,2,0,9,0.707676
3,160.0,40000.0,1,0,0,0,0,0,2,9,9,0.707676
4,360.0,8000.0,1,0,0,0,0,0,4,8,5,3.960396


In [216]:
from sklearn.preprocessing import MinMaxScaler

In [217]:
scaler = MinMaxScaler()

In [218]:
from sklearn.model_selection import train_test_split

In [219]:
X = to_scale_df.drop(['rate'], axis=1)
y = to_scale_df.rate

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [221]:
X_train.shape

(549, 11)

In [222]:
X_train = scaler.fit_transform(X_train)

  return self.partial_fit(X, y)


In [223]:
from time import gmtime, strftime
import pickle

In [224]:
print("Scaled attributes.")

time = strftime("%m%d%H%M%S", gmtime())
scaler_filename = ('min_max_scaler_{}.sav'.format(time))
pickle.dump(scaler, open(scaler_filename, 'wb'))

print("Scaler saved.")

Scaled attributes.
Scaler saved.


In [225]:
from sklearn.ensemble import RandomForestRegressor

In [226]:
from sklearn.model_selection import GridSearchCV

In [227]:
param_grid = [
    {'n_estimators': [3, 10], 'max_features': [2, 4, 'auto']}
]

random_regressor = RandomForestRegressor()
grid_search = GridSearchCV(random_regressor, param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=2, n_estimators=3 ..................................
[CV] ................... max_features=2, n_estimators=3, total=   0.0s
[CV] max_features=2, n_estimators=3 ..................................
[CV] ................... max_features=2, n_estimators=3, total=   0.0s
[CV] max_features=2, n_estimators=3 ..................................
[CV] ................... max_features=2, n_estimators=3, total=   0.0s
[CV] max_features=2, n_estimators=3 ..................................
[CV] ................... max_features=2, n_estimators=3, total=   0.0s
[CV] max_features=2, n_estimators=3 ..................................
[CV] ................... max_features=2, n_estimators=3, total=   0.0s
[CV] max_features=2, n_estimators=10 .................................
[CV] .................. max_features=2, n_estimators=10, total=   0.0s
[CV] max_features=2, n_estimators=10 .................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] .................. max_features=4, n_estimators=10, total=   0.0s
[CV] max_features=4, n_estimators=10 .................................
[CV] .................. max_features=4, n_estimators=10, total=   0.0s
[CV] max_features=auto, n_estimators=3 ...............................
[CV] ................ max_features=auto, n_estimators=3, total=   0.0s
[CV] max_features=auto, n_estimators=3 ...............................
[CV] ................ max_features=auto, n_estimators=3, total=   0.0s
[CV] max_features=auto, n_estimators=3 ...............................
[CV] ................ max_features=auto, n_estimators=3, total=   0.0s
[CV] max_features=auto, n_estimators=3 ...............................
[CV] ................ max_features=auto, n_estimators=3, total=   0.0s
[CV] max_features=auto, n_estimators=3 ...............................
[CV] ................ max_features=auto, n_estimators=3, total=   0.0s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10], 'max_features': [2, 4, 'auto']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=2)

In [228]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1.4093825342766997 {'max_features': 2, 'n_estimators': 3}
1.1836414385183094 {'max_features': 2, 'n_estimators': 10}
1.356701225225918 {'max_features': 4, 'n_estimators': 3}
1.1938187905735855 {'max_features': 4, 'n_estimators': 10}
1.3931727191798515 {'max_features': 'auto', 'n_estimators': 3}
1.218919493050365 {'max_features': 'auto', 'n_estimators': 10}


In [230]:
# Manual test for two values
best_regressor = grid_search.best_estimator_
best_regressor.n_jobs = 1
best_regressor.fit(X_train, y_train)
print("Random Forest Model (Fine Tuned) is Trained")

Random Forest Model (Fine Tuned) is Trained


In [231]:
scaled_test_set = scaler.transform(X_test)

In [232]:
y_pred = best_regressor.predict(scaled_test_set)

In [233]:
from sklearn.metrics import r2_score, mean_absolute_error

In [234]:
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("R2: ", r2_score(y_test, y_pred))

MAE:  0.5266692980765836
R2:  0.6491855676432693


In [235]:
from sklearn.model_selection import cross_val_score

In [236]:
cross_val_score(best_regressor, scaled_test_set, y_test, cv=10)

array([ 0.26433762,  0.31460432,  0.16267298,  0.09747885, -4.14080199,
        0.10348082,  0.31942215, -0.76805716,  0.44969106, -4.43274286])

In [237]:
import pickle

#serializing our model to a file called model.pkl
pickle.dump(best_regressor, open("regression_model.pkl","wb"))

#loading a model from a file called model.pkl
model = pickle.load(open("regression_model.pkl","rb"))

In [238]:
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [239]:
cross_val_score(model, scaled_test_set, y_test, cv=10)

array([ 0.27368468,  0.12517645,  0.67910858,  0.6136884 , -7.49992748,
        0.62854114,  0.44510045, -0.85526759,  0.72480682, -0.64754958])