# Modeling

### Engagement Rate Score Prediction

In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os,sys
sys.path.insert(0,'../scripts/')
from Randomforest_regressor import randomforestregressor

#### Fetch data

In [2]:
# Load the merged dataset used for the engagement-rate (ER) model.
# The path is relative to the notebook's working directory.
merged_csv_path = '../data/merged_data.csv'
data = pd.read_csv(merged_csv_path)

#### Our data's information

In [3]:
# (rows, columns) of the freshly loaded frame.
data.shape

(556, 21)

In [4]:
# Per-column dtype and non-null count, before any cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         556 non-null    int64  
 1   game_id            556 non-null    object 
 2   emotion_x          556 non-null    object 
 3   gender_x           556 non-null    object 
 4   race_x             556 non-null    object 
 5   emotion_y          556 non-null    object 
 6   gender_y           556 non-null    object 
 7   race_y             556 non-null    object 
 8   edges_pixel_count  556 non-null    float64
 9   object_count       556 non-null    int64  
 10  dom_color_1        556 non-null    object 
 11  dom_color_2        556 non-null    object 
 12  dom_prct_1         556 non-null    float64
 13  dom_prct_2         556 non-null    float64
 14  startX             539 non-null    float64
 15  startY             539 non-null    float64
 16  height             539 non

In [5]:
# Full list of column names in the loaded frame.
data.columns

Index(['Unnamed: 0', 'game_id', 'emotion_x', 'gender_x', 'race_x', 'emotion_y',
       'gender_y', 'race_y', 'edges_pixel_count', 'object_count',
       'dom_color_1', 'dom_color_2', 'dom_prct_1', 'dom_prct_2', 'startX',
       'startY', 'height', 'width', 'preview_link', 'ER', 'CTR'],
      dtype='object')

#### Remove unnecessary columns

In [6]:
# Remove columns that are not used as model inputs: the 'Unnamed: 0' column
# (presumably the index written out with the CSV -- confirm), the game
# identifier, the preview link and the *_y face attributes.
# errors='ignore' keeps this cell re-runnable: because `data` is reassigned
# in place, a second run would otherwise raise KeyError for columns that
# were already dropped.
data = data.drop(
    columns=['Unnamed: 0', 'game_id', 'emotion_y', 'gender_y', 'race_y', 'preview_link'],
    errors='ignore',
)

In [7]:
# Re-inspect the frame after dropping the unused columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   emotion_x          556 non-null    object 
 1   gender_x           556 non-null    object 
 2   race_x             556 non-null    object 
 3   edges_pixel_count  556 non-null    float64
 4   object_count       556 non-null    int64  
 5   dom_color_1        556 non-null    object 
 6   dom_color_2        556 non-null    object 
 7   dom_prct_1         556 non-null    float64
 8   dom_prct_2         556 non-null    float64
 9   startX             539 non-null    float64
 10  startY             539 non-null    float64
 11  height             539 non-null    float64
 12  width              539 non-null    float64
 13  ER                 556 non-null    float64
 14  CTR                556 non-null    float64
dtypes: float64(9), int64(1), object(5)
memory usage: 65.3+ KB


In [8]:
# Preview the first five rows of the trimmed frame.
data.head()

Unnamed: 0,emotion_x,gender_x,race_x,edges_pixel_count,object_count,dom_color_1,dom_color_2,dom_prct_1,dom_prct_2,startX,startY,height,width,ER,CTR
0,neutral,Woman,white,21191.0,5,#F5CEAC,#FF0015,0.508871,0.065859,149.0,806.0,244.0,244.0,0.136963,0.005393
1,happy,Woman,white,41925.0,14,#496577,#004369,0.120734,0.111461,5.0,580.0,323.0,323.0,0.114208,0.016443
2,neutral,Man,white,27830.0,1,#D4D2C3,#7D8282,0.118827,0.10288,106.0,16.0,422.0,422.0,0.037748,0.004508
3,,,,27219.0,1,#13130D,#081F2C,0.491751,0.090518,630.0,411.0,303.0,303.0,0.031971,1.5e-05
4,,,,9009.0,2,#000000,#D7D6CD,0.242684,0.209823,11.0,11.0,600.0,600.0,0.067083,0.012776


#### Convert categorical data into numeric data using label encoder

In [9]:
# Names of every column whose dtype is not numeric; these are the columns
# that get label-encoded in the next step.
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns.tolist()
non_numeric_columns

['emotion_x', 'gender_x', 'race_x', 'dom_color_1', 'dom_color_2']

In [10]:
# Replace each categorical column with integer codes.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so the code -> original label mapping stays available afterwards through
# label_encoders[col].classes_ / .inverse_transform(...). Refitting one shared
# encoder for every column would keep only the last column's mapping.
# The encoded values written into `data` are the same either way.
label_encoders = {}
for col in non_numeric_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [11]:
# Check dtypes after encoding: no object columns should remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   emotion_x          556 non-null    int32  
 1   gender_x           556 non-null    int32  
 2   race_x             556 non-null    int32  
 3   edges_pixel_count  556 non-null    float64
 4   object_count       556 non-null    int64  
 5   dom_color_1        556 non-null    int32  
 6   dom_color_2        556 non-null    int32  
 7   dom_prct_1         556 non-null    float64
 8   dom_prct_2         556 non-null    float64
 9   startX             539 non-null    float64
 10  startY             539 non-null    float64
 11  height             539 non-null    float64
 12  width              539 non-null    float64
 13  ER                 556 non-null    float64
 14  CTR                556 non-null    float64
dtypes: float64(9), int32(5), int64(1)
memory usage: 54.4 KB


#### Handle missing values

In [12]:
# Number of missing values in each column.
data.isna().sum()

emotion_x             0
gender_x              0
race_x                0
edges_pixel_count     0
object_count          0
dom_color_1           0
dom_color_2           0
dom_prct_1            0
dom_prct_2            0
startX               17
startY               17
height               17
width                17
ER                    0
CTR                   0
dtype: int64

In [13]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [14]:
# Verify that no missing values are left after dropping rows.
data.isna().sum()

emotion_x            0
gender_x             0
race_x               0
edges_pixel_count    0
object_count         0
dom_color_1          0
dom_color_2          0
dom_prct_1           0
dom_prct_2           0
startX               0
startY               0
height               0
width                0
ER                   0
CTR                  0
dtype: int64

#### Split feature and target columns

In [15]:
# Features: every column except the two rate columns.
# Target for this section: the engagement rate (ER).
x = data.drop(['ER', 'CTR'], axis=1)
y = data['ER']

In [16]:
# Preview the feature matrix.
x.head()

Unnamed: 0,emotion_x,gender_x,race_x,edges_pixel_count,object_count,dom_color_1,dom_color_2,dom_prct_1,dom_prct_2,startX,startY,height,width
0,4,2,5,21191.0,5,154,201,0.508871,0.065859,149.0,806.0,244.0,244.0
1,3,2,5,41925.0,14,82,4,0.120734,0.111461,5.0,580.0,323.0,323.0
2,4,0,5,27830.0,1,125,128,0.118827,0.10288,106.0,16.0,422.0,422.0
3,0,1,0,27219.0,1,42,38,0.491751,0.090518,630.0,411.0,303.0,303.0
4,0,1,0,9009.0,2,0,175,0.242684,0.209823,11.0,11.0,600.0,600.0


In [17]:
# Preview the ER target values.
y.head()

0    0.136963
1    0.114208
2    0.037748
3    0.031971
4    0.067083
Name: ER, dtype: float64

#### Fit the model

In [18]:
# Train the ER regressor through the project helper imported from
# ../scripts. `model` holds whatever the helper returns; the cells below
# index it positionally ([0]..[3]), so it is a sequence of results rather
# than a bare estimator.
# NOTE(review): the train/test split and scaling happen inside the helper
# (it prints "StandardScaler()") -- confirm the details there.
model = randomforestregressor(
    x,
    y,
    n_estimators=500,
    max_depth=120,
    random_state=8,
)

StandardScaler()


#### Model predictions

In [19]:
# First element of the helper's result: an array of predicted values
# (presumably predictions on a held-out split -- confirm in the helper).
model[0]

array([0.0891104 , 0.15047236, 0.11642611, 0.11555006, 0.10220153,
       0.13463345, 0.12082843, 0.08588283, 0.1426029 , 0.14485303,
       0.06397299, 0.08069537, 0.09343044, 0.14409   , 0.11260262,
       0.08486598, 0.19767645, 0.10079938, 0.08475763, 0.16840381,
       0.24868936, 0.09624131, 0.15303692, 0.08694406, 0.15985518,
       0.10022664, 0.08467822, 0.08609769, 0.16767887, 0.0890647 ,
       0.04334945, 0.33394337, 0.1267619 , 0.07035087, 0.20881288,
       0.17844853, 0.21552943, 0.12072724, 0.10350295, 0.04038511,
       0.16783114, 0.11598476, 0.21745327, 0.2758834 , 0.03349684,
       0.2634598 , 0.10865626, 0.15083221, 0.15866863, 0.2268697 ,
       0.12762902, 0.20479607, 0.06843038, 0.08366625, 0.15659028,
       0.07440146, 0.05261522, 0.18588022, 0.17109578, 0.15462985,
       0.06320078, 0.26563504, 0.2268697 , 0.11642611, 0.18128566,
       0.1291781 , 0.12172348, 0.0414547 , 0.05715286, 0.10182427,
       0.10333627, 0.09473445, 0.09456164, 0.09764748, 0.13712

#### Feature importances

In [20]:
# Second element of the helper's result: a table of feature names and
# their importances.
model[1]

Unnamed: 0,feature,feature_importances
0,emotion_x,0.01124
1,gender_x,0.006979
2,race_x,0.016112
3,edges_pixel_count,0.146403
4,object_count,0.044811
5,dom_color_1,0.091061
6,dom_color_2,0.074575
7,dom_prct_1,0.128057
8,dom_prct_2,0.105474
9,startX,0.089729


#### Model score

In [21]:
# Third element of the helper's result: the model score
# (presumably R^2 from the estimator's .score() -- confirm in the helper).
model[2]

0.3580314426159956

#### Mean squared error

In [22]:
# Fourth element of the helper's result: reported under the heading above
# as the mean squared error.
model[3]

0.004675060314229366

### Click-through Rate Score Prediction

#### Fetch endframe data

In [3]:
# Load the merged end-frame dataset used for the click-through-rate (CTR)
# model. The path is relative to the notebook's working directory.
endframe_csv_path = '../data/endframe_merged_data.csv'
endframe_data = pd.read_csv(endframe_csv_path)

#### Our endframe data's information

In [4]:
# (rows, columns) of the freshly loaded end-frame frame.
endframe_data.shape


(385, 20)

In [5]:
# Per-column dtype and non-null count, before any cleaning.
endframe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    385 non-null    int64  
 1   game_id       385 non-null    object 
 2   emotion_x     385 non-null    object 
 3   gender_x      385 non-null    object 
 4   race_x        385 non-null    object 
 5   emotion_y     385 non-null    object 
 6   gender_y      385 non-null    object 
 7   race_y        385 non-null    object 
 8   object_count  385 non-null    int64  
 9   dom_color_1   385 non-null    object 
 10  dom_color_2   385 non-null    object 
 11  dom_prct_1    385 non-null    float64
 12  dom_prct_2    385 non-null    float64
 13  startX        384 non-null    float64
 14  startY        384 non-null    float64
 15  height        384 non-null    float64
 16  width         384 non-null    float64
 17  preview_link  385 non-null    object 
 18  ER            385 non-null    

In [6]:
# Full list of column names in the end-frame frame.
endframe_data.columns

Index(['Unnamed: 0', 'game_id', 'emotion_x', 'gender_x', 'race_x', 'emotion_y',
       'gender_y', 'race_y', 'object_count', 'dom_color_1', 'dom_color_2',
       'dom_prct_1', 'dom_prct_2', 'startX', 'startY', 'height', 'width',
       'preview_link', 'ER', 'CTR'],
      dtype='object')

#### Remove unnecessary columns

In [7]:
# Remove columns that are not used as model inputs: the 'Unnamed: 0' column
# (presumably the index written out with the CSV -- confirm), the game
# identifier, the preview link and the *_y face attributes.
# errors='ignore' keeps this cell re-runnable: because `endframe_data` is
# reassigned in place, a second run would otherwise raise KeyError for
# columns that were already dropped.
endframe_data = endframe_data.drop(
    columns=['Unnamed: 0', 'game_id', 'emotion_y', 'gender_y', 'race_y', 'preview_link'],
    errors='ignore',
)

In [8]:
# Re-inspect the frame after dropping the unused columns.
endframe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   emotion_x     385 non-null    object 
 1   gender_x      385 non-null    object 
 2   race_x        385 non-null    object 
 3   object_count  385 non-null    int64  
 4   dom_color_1   385 non-null    object 
 5   dom_color_2   385 non-null    object 
 6   dom_prct_1    385 non-null    float64
 7   dom_prct_2    385 non-null    float64
 8   startX        384 non-null    float64
 9   startY        384 non-null    float64
 10  height        384 non-null    float64
 11  width         384 non-null    float64
 12  ER            385 non-null    float64
 13  CTR           385 non-null    float64
dtypes: float64(8), int64(1), object(5)
memory usage: 42.2+ KB


#### Convert categorical data into numeric data using label encoder

In [9]:
# Names of every end-frame column whose dtype is not numeric; these are the
# columns that get label-encoded in the next step.
endframe_non_numeric_columns = (
    endframe_data.select_dtypes(exclude=[np.number]).columns.tolist()
)
endframe_non_numeric_columns

['emotion_x', 'gender_x', 'race_x', 'dom_color_1', 'dom_color_2']

In [10]:
# Replace each categorical end-frame column with integer codes.
# A separate LabelEncoder is fitted per column and stored in
# `endframe_label_encoders`, so the code -> original label mapping stays
# available afterwards through endframe_label_encoders[col].classes_ /
# .inverse_transform(...). Refitting one shared encoder for every column
# would keep only the last column's mapping.
# The encoded values written into `endframe_data` are the same either way.
endframe_label_encoders = {}
for col in endframe_non_numeric_columns:
    le2 = LabelEncoder()
    endframe_data[col] = le2.fit_transform(endframe_data[col])
    endframe_label_encoders[col] = le2

In [11]:
# Check dtypes after encoding: no object columns should remain.
endframe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   emotion_x     385 non-null    int32  
 1   gender_x      385 non-null    int32  
 2   race_x        385 non-null    int32  
 3   object_count  385 non-null    int64  
 4   dom_color_1   385 non-null    int32  
 5   dom_color_2   385 non-null    int32  
 6   dom_prct_1    385 non-null    float64
 7   dom_prct_2    385 non-null    float64
 8   startX        384 non-null    float64
 9   startY        384 non-null    float64
 10  height        384 non-null    float64
 11  width         384 non-null    float64
 12  ER            385 non-null    float64
 13  CTR           385 non-null    float64
dtypes: float64(8), int32(5), int64(1)
memory usage: 34.7 KB


#### Handle missing values

In [12]:
# Number of missing values in each end-frame column.
endframe_data.isna().sum()

emotion_x       0
gender_x        0
race_x          0
object_count    0
dom_color_1     0
dom_color_2     0
dom_prct_1      0
dom_prct_2      0
startX          1
startY          1
height          1
width           1
ER              0
CTR             0
dtype: int64

In [15]:
# Discard every end-frame row that has a missing value in any column.
endframe_data = endframe_data.dropna(axis=0, how='any')

In [16]:
# Verify that no missing values are left after dropping rows.
endframe_data.isna().sum()

emotion_x       0
gender_x        0
race_x          0
object_count    0
dom_color_1     0
dom_color_2     0
dom_prct_1      0
dom_prct_2      0
startX          0
startY          0
height          0
width           0
ER              0
CTR             0
dtype: int64

#### Split feature and target columns

In [17]:
# Features: every column except the two rate columns.
# Target for this section: the click-through rate (CTR).
x2 = endframe_data.drop(['ER', 'CTR'], axis=1)
y2 = endframe_data['CTR']

In [18]:
# Preview the end-frame feature matrix.
x2.head()

Unnamed: 0,emotion_x,gender_x,race_x,object_count,dom_color_1,dom_color_2,dom_prct_1,dom_prct_2,startX,startY,height,width
0,0,1,0,78,99,0,0.875966,0.035697,173.0,785.0,300.0,300.0
1,0,1,0,158,99,0,0.866252,0.037257,173.0,785.0,300.0,300.0
2,0,1,0,451,91,0,0.889795,0.03348,365.0,777.0,234.0,234.0
3,0,1,0,15,89,110,0.222838,0.207252,214.0,810.0,227.0,227.0
4,4,0,5,1,18,2,0.141867,0.104907,903.0,741.0,206.0,206.0


In [19]:
# Preview the CTR target values.
y2.head()

0    0.058438
1    0.074731
2    0.042228
3    0.005393
4    0.016443
Name: CTR, dtype: float64

#### Fit the model

In [20]:
# Train the CTR regressor through the project helper imported from
# ../scripts. `endframe_model` holds whatever the helper returns; the cells
# below index it positionally ([0]..[3]), so it is a sequence of results
# rather than a bare estimator.
# NOTE(review): the train/test split and scaling happen inside the helper
# (it prints "StandardScaler()") -- confirm the details there.
endframe_model = randomforestregressor(
    x2,
    y2,
    n_estimators=500,
    max_depth=120,
    random_state=8,
)

StandardScaler()


#### Model predictions

In [21]:
# First element of the helper's result: an array of predicted values
# (presumably predictions on a held-out split -- confirm in the helper).
endframe_model[0]

array([0.00236649, 0.0238064 , 0.00848724, 0.01588163, 0.00265142,
       0.02804627, 0.00236649, 0.03315634, 0.07296094, 0.03332906,
       0.04099702, 0.02533255, 0.06714124, 0.06607014, 0.06419445,
       0.01234454, 0.00404745, 0.00410506, 0.02353322, 0.00236649,
       0.04388383, 0.01498645, 0.10086255, 0.0459306 , 0.04727907,
       0.09981917, 0.01310021, 0.01289064, 0.06118649, 0.04388759,
       0.02309138, 0.04500664, 0.03445127, 0.03713485, 0.04368876,
       0.00239841, 0.03865572, 0.04796845, 0.01537145, 0.06937032,
       0.01057436, 0.02333282, 0.00265142, 0.06612431, 0.03526535,
       0.01438797, 0.00265142, 0.00236649, 0.00571703, 0.0796086 ,
       0.02980413, 0.00830014, 0.06687643, 0.02317592, 0.04185273,
       0.04978102, 0.02665006, 0.01461278, 0.05903136, 0.00265142,
       0.0275821 , 0.05111829, 0.02105236, 0.04216839, 0.0023674 ,
       0.00341859, 0.00265142, 0.02311809, 0.01846176, 0.0047299 ,
       0.00236649, 0.07958936, 0.04433417, 0.02098089, 0.00236

#### Feature importances

In [22]:
# Second element of the helper's result: a table of feature names and
# their importances.
endframe_model[1]

Unnamed: 0,feature,feature_importances
0,emotion_x,0.040464
1,gender_x,0.003525
2,race_x,0.021416
3,object_count,0.060888
4,dom_color_1,0.081975
5,dom_color_2,0.11284
6,dom_prct_1,0.123815
7,dom_prct_2,0.173151
8,startX,0.108326
9,startY,0.193666


#### Model score

In [23]:
# Third element of the helper's result: the model score
# (presumably R^2 from the estimator's .score() -- confirm in the helper).
endframe_model[2]

0.40983247509131115

#### Mean squared error

In [24]:
# Fourth element of the helper's result: reported under the heading above
# as the mean squared error.
endframe_model[3]

0.0017068414372723125