# Modeling

In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os,sys
sys.path.insert(0,'../scripts/')
from Randomforest_regressor import randomforestregressor

#### Fetch data

In [2]:
# Read the merged dataset from the project's data directory (relative path).
merged_data_path = '../data/merged_data.csv'
data = pd.read_csv(merged_data_path)

#### Our data's information

In [3]:
# (rows, columns) of the freshly loaded frame.
data.shape

(556, 21)

In [4]:
# Per-column dtype and non-null count: shows which columns are non-numeric
# and which contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         556 non-null    int64  
 1   game_id            556 non-null    object 
 2   emotion_x          556 non-null    object 
 3   gender_x           556 non-null    object 
 4   race_x             556 non-null    object 
 5   emotion_y          556 non-null    object 
 6   gender_y           556 non-null    object 
 7   race_y             556 non-null    object 
 8   edges_pixel_count  556 non-null    float64
 9   object_count       556 non-null    int64  
 10  dom_color_1        556 non-null    object 
 11  dom_color_2        556 non-null    object 
 12  dom_prct_1         556 non-null    float64
 13  dom_prct_2         556 non-null    float64
 14  startX             539 non-null    float64
 15  startY             539 non-null    float64
 16  height             539 non

In [5]:
# List every column name so the ones not needed for modelling can be identified.
data.columns

Index(['Unnamed: 0', 'game_id', 'emotion_x', 'gender_x', 'race_x', 'emotion_y',
       'gender_y', 'race_y', 'edges_pixel_count', 'object_count',
       'dom_color_1', 'dom_color_2', 'dom_prct_1', 'dom_prct_2', 'startX',
       'startY', 'height', 'width', 'preview_link', 'ER', 'CTR'],
      dtype='object')

#### Remove unnecessary columns

In [6]:
# Columns that are not used as model inputs: the unnamed index-like column,
# the game identifier, the *_y variants of emotion/gender/race and the preview link.
unused_columns = [
    'Unnamed: 0',
    'game_id',
    'emotion_y',
    'gender_y',
    'race_y',
    'preview_link',
]
data = data.drop(columns=unused_columns)

In [7]:
# Re-inspect the frame to confirm which columns remain after the drop.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   emotion_x          556 non-null    object 
 1   gender_x           556 non-null    object 
 2   race_x             556 non-null    object 
 3   edges_pixel_count  556 non-null    float64
 4   object_count       556 non-null    int64  
 5   dom_color_1        556 non-null    object 
 6   dom_color_2        556 non-null    object 
 7   dom_prct_1         556 non-null    float64
 8   dom_prct_2         556 non-null    float64
 9   startX             539 non-null    float64
 10  startY             539 non-null    float64
 11  height             539 non-null    float64
 12  width              539 non-null    float64
 13  ER                 556 non-null    float64
 14  CTR                556 non-null    float64
dtypes: float64(9), int64(1), object(5)
memory usage: 65.3+ KB


In [8]:
# Preview the first five rows of the remaining columns.
data.head()

Unnamed: 0,emotion_x,gender_x,race_x,edges_pixel_count,object_count,dom_color_1,dom_color_2,dom_prct_1,dom_prct_2,startX,startY,height,width,ER,CTR
0,neutral,Woman,white,21191.0,5,#F5CEAC,#FF0015,0.508871,0.065859,149.0,806.0,244.0,244.0,0.136963,0.005393
1,happy,Woman,white,41925.0,14,#496577,#004369,0.120734,0.111461,5.0,580.0,323.0,323.0,0.114208,0.016443
2,neutral,Man,white,27830.0,1,#D4D2C3,#7D8282,0.118827,0.10288,106.0,16.0,422.0,422.0,0.037748,0.004508
3,,,,27219.0,1,#13130D,#081F2C,0.491751,0.090518,630.0,411.0,303.0,303.0,0.031971,1.5e-05
4,,,,9009.0,2,#000000,#D7D6CD,0.242684,0.209823,11.0,11.0,600.0,600.0,0.067083,0.012776


#### Convert categorical data into numeric data using label encoder

In [9]:
# Names of all columns whose dtype is not numeric; these are the ones that
# have to be encoded before modelling.
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns.tolist()
non_numeric_columns

['emotion_x', 'gender_x', 'race_x', 'dom_color_1', 'dom_color_2']

In [10]:
# Encode every non-numeric column as integer codes, using a separate
# LabelEncoder per column. Re-fitting a single shared encoder overwrites its
# classes_ on each iteration, so only the last column's mapping would survive
# and the other columns could not be decoded (inverse_transform) or encoded
# consistently for new data.
# NOTE(review): LabelEncoder assigns codes in sorted order of the values, which
# gives nominal columns such as the colour codes an arbitrary ordering - confirm
# this is acceptable for the downstream model.
label_encoders = {}
for col in non_numeric_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # keep the fitted mapping for this column

# `le` is left bound to the encoder of the last column, as before.

In [11]:
# Verify that the former object columns now hold integer codes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   emotion_x          556 non-null    int32  
 1   gender_x           556 non-null    int32  
 2   race_x             556 non-null    int32  
 3   edges_pixel_count  556 non-null    float64
 4   object_count       556 non-null    int64  
 5   dom_color_1        556 non-null    int32  
 6   dom_color_2        556 non-null    int32  
 7   dom_prct_1         556 non-null    float64
 8   dom_prct_2         556 non-null    float64
 9   startX             539 non-null    float64
 10  startY             539 non-null    float64
 11  height             539 non-null    float64
 12  width              539 non-null    float64
 13  ER                 556 non-null    float64
 14  CTR                556 non-null    float64
dtypes: float64(9), int32(5), int64(1)
memory usage: 54.4 KB


#### Handle missing values

In [12]:
# Number of missing values in each column.
data.isna().sum()

emotion_x             0
gender_x              0
race_x                0
edges_pixel_count     0
object_count          0
dom_color_1           0
dom_color_2           0
dom_prct_1            0
dom_prct_2            0
startX               17
startY               17
height               17
width                17
ER                    0
CTR                   0
dtype: int64

In [13]:
# Discard every row that has a missing value in at least one column.
data = data.dropna(axis=0, how='any')

In [14]:
# Confirm that no missing values remain after removing incomplete rows.
data.isna().sum()

emotion_x            0
gender_x             0
race_x               0
edges_pixel_count    0
object_count         0
dom_color_1          0
dom_color_2          0
dom_prct_1           0
dom_prct_2           0
startX               0
startY               0
height               0
width                0
ER                   0
CTR                  0
dtype: int64

#### Split feature and target columns

In [15]:
# The target is the ER column; ER and CTR are both excluded from the features.
non_feature_columns = ['ER', 'CTR']
y = data['ER']
x = data.drop(columns=non_feature_columns)

In [16]:
# Preview the feature matrix (ER and CTR removed).
x.head()

Unnamed: 0,emotion_x,gender_x,race_x,edges_pixel_count,object_count,dom_color_1,dom_color_2,dom_prct_1,dom_prct_2,startX,startY,height,width
0,4,2,5,21191.0,5,154,201,0.508871,0.065859,149.0,806.0,244.0,244.0
1,3,2,5,41925.0,14,82,4,0.120734,0.111461,5.0,580.0,323.0,323.0
2,4,0,5,27830.0,1,125,128,0.118827,0.10288,106.0,16.0,422.0,422.0
3,0,1,0,27219.0,1,42,38,0.491751,0.090518,630.0,411.0,303.0,303.0
4,0,1,0,9009.0,2,0,175,0.242684,0.209823,11.0,11.0,600.0,600.0


In [17]:
# Preview the target series (ER).
y.head()

0    0.136963
1    0.114208
2    0.037748
3    0.031971
4    0.067083
Name: ER, dtype: float64

#### Fit the model

In [18]:
# Fit the project's random-forest helper on the features/target.
# The result is indexed positionally in the following cells (elements 0-3).
# NOTE(review): the helper lives in ../scripts/Randomforest_regressor.py; its
# train/test handling and scaling are not visible here - confirm before
# interpreting the metrics.
model = randomforestregressor(
    x,
    y,
    n_estimators=500,
    max_depth=120,
    random_state=8,
)

StandardScaler()


#### Model predictions

In [19]:
# Element 0 of the helper's result: the model's predictions.
# NOTE(review): which rows these predictions correspond to (e.g. a held-out
# split) is decided inside randomforestregressor - confirm there.
model[0]

array([0.0891104 , 0.15047236, 0.11642611, 0.11555006, 0.10220153,
       0.13463345, 0.12082843, 0.08588283, 0.1426029 , 0.14485303,
       0.06397299, 0.08069537, 0.09343044, 0.14409   , 0.11260262,
       0.08486598, 0.19767645, 0.10079938, 0.08475763, 0.16840381,
       0.24868936, 0.09624131, 0.15303692, 0.08694406, 0.15985518,
       0.10022664, 0.08467822, 0.08609769, 0.16767887, 0.0890647 ,
       0.04334945, 0.33394337, 0.1267619 , 0.07035087, 0.20881288,
       0.17844853, 0.21552943, 0.12072724, 0.10350295, 0.04038511,
       0.16783114, 0.11598476, 0.21745327, 0.2758834 , 0.03349684,
       0.2634598 , 0.10865626, 0.15083221, 0.15866863, 0.2268697 ,
       0.12762902, 0.20479607, 0.06843038, 0.08366625, 0.15659028,
       0.07440146, 0.05261522, 0.18588022, 0.17109578, 0.15462985,
       0.06320078, 0.26563504, 0.2268697 , 0.11642611, 0.18128566,
       0.1291781 , 0.12172348, 0.0414547 , 0.05715286, 0.10182427,
       0.10333627, 0.09473445, 0.09456164, 0.09764748, 0.13712

#### Feature importances

In [20]:
# Element 1 of the helper's result: a table of features and their importances.
model[1]

Unnamed: 0,feature,feature_importances
0,emotion_x,0.01124
1,gender_x,0.006979
2,race_x,0.016112
3,edges_pixel_count,0.146403
4,object_count,0.044811
5,dom_color_1,0.091061
6,dom_color_2,0.074575
7,dom_prct_1,0.128057
8,dom_prct_2,0.105474
9,startX,0.089729


#### Model score

In [21]:
# Element 2 of the helper's result: the model score.
# NOTE(review): presumably the regressor's .score() (R^2) - verify in the
# helper, including which data it is computed on.
model[2]

0.3580314426159956

#### Mean squared error

In [22]:
# Element 3 of the helper's result: the mean squared error of the predictions.
model[3]

0.004675060314229366