In [39]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

# Load data

In [40]:
# Read the training set from its parquet file and show it as the cell result.
df_train = pd.read_parquet(path="data/df_train.parquet")
df_train

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages,points_9m
0,CVANON0002,4.0,3.0,5.0,3.0,4.0,9.833333,8.0,3617.798
1,CVANON0247,7.0,6.0,8.0,4.0,12.0,3.083333,10.0,1271.650
2,CVANON0009,7.0,6.0,6.0,5.0,15.0,4.166667,11.0,2569.874
3,CVANON0238,7.0,6.0,5.0,4.0,24.0,10.500000,0.0,685.200
4,CVANON0269,7.0,6.0,8.0,3.0,12.0,8.000000,13.0,652.800
...,...,...,...,...,...,...,...,...,...
284,CVANON0318,5.0,4.0,3.0,6.0,6.0,13.916667,8.0,164.000
285,CVANON0210,8.0,7.0,6.0,5.0,7.0,284.000000,18.0,2292.700
286,CVANON0206,7.0,6.0,8.0,3.0,13.0,25.583333,17.0,82.500
287,CVANON0185,7.0,6.0,6.0,5.0,10.0,11.666667,3.0,130.500


In [41]:
# Read the prediction set from its parquet file and show it as the cell result.
df_predict = pd.read_parquet(path="data/df_predict.parquet")
df_predict

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages
0,CVANON0002,3.0,4.0,5.0,1.0,5.0,0.000000,10.0
1,CVANON0009,3.0,4.0,6.0,2.0,10.0,0.000000,10.0
2,CVANON0033,6.0,5.0,8.0,3.0,8.0,5.000000,10.0
3,CVANON0010,6.0,5.0,6.0,5.0,10.0,0.333333,6.0
4,CVANON0015,3.0,4.0,3.0,6.0,3.0,0.000000,8.0
...,...,...,...,...,...,...,...,...
76,CVANON0018,8.0,6.0,7.0,4.0,25.0,31.416667,13.0
77,CVANON0019,6.0,5.0,4.0,3.0,13.0,22.083333,9.0
78,CVANON0070,6.0,5.0,5.0,4.0,5.0,5.333333,8.0
79,CVANON0017,7.0,5.0,4.0,3.0,11.0,23.583333,12.0


# Check nulls

In [42]:
# Count missing values per column (isna is the canonical name of isnull).
df_train.isna().sum()

id                  0
professionality     0
visual              0
grammar             0
ai_likelihood       0
education           0
experience          0
languages           0
points_9m          72
dtype: int64

In [43]:
# Rows without a target value (points_9m) cannot be used for training: drop them.
# .copy() makes the filtered result an independent frame, so the in-place
# scaling assignment further down the notebook writes to df_train itself and
# does not go through pandas' "setting on a copy of a slice" path.
df_train = df_train.dropna(subset=["points_9m"]).copy()

In [44]:
# Re-count missing values per column to confirm none are left after the drop.
df_train.isna().sum()

id                 0
professionality    0
visual             0
grammar            0
ai_likelihood      0
education          0
experience         0
languages          0
points_9m          0
dtype: int64

In [45]:
# Confirm the row count, dtypes and non-null counts of the training set
# after the rows with a missing target were dropped.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217 entries, 0 to 287
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               217 non-null    object 
 1   professionality  217 non-null    float64
 2   visual           217 non-null    float64
 3   grammar          217 non-null    float64
 4   ai_likelihood    217 non-null    float64
 5   education        217 non-null    float64
 6   experience       217 non-null    float64
 7   languages        217 non-null    float64
 8   points_9m        217 non-null    float64
dtypes: float64(8), object(1)
memory usage: 17.0+ KB


In [46]:
# Inspect the row count, dtypes and non-null counts of the prediction set.
df_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               81 non-null     object 
 1   professionality  81 non-null     float64
 2   visual           81 non-null     float64
 3   grammar          81 non-null     float64
 4   ai_likelihood    81 non-null     float64
 5   education        81 non-null     float64
 6   experience       81 non-null     float64
 7   languages        81 non-null     float64
dtypes: float64(7), object(1)
memory usage: 5.2+ KB


# Scaling

In [47]:
# Standardisation to zero mean / unit variance; the arguments below are the
# library defaults, spelled out so the intended behaviour is explicit.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [48]:
# Standardise the feature columns only. They are selected by name rather than
# by position so the identifier ("id") and the target ("points_9m") stay out
# of the scaler even if the column order of the input file changes.
# NOTE: this overwrites df_train, so re-running the cell refits the scaler on
# already-scaled values; reload the data before re-running.
feature_cols = [col for col in df_train.columns if col not in ("id", "points_9m")]
df_train[feature_cols] = scaler.fit_transform(df_train[feature_cols])
df_train

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages,points_9m
0,CVANON0002,-2.345824,-2.376936,-0.865650,-0.649081,-0.811672,-0.451910,-0.263813,3617.798
1,CVANON0247,0.371492,0.604539,1.070907,0.220367,0.313777,-0.658899,0.169880,1271.650
2,CVANON0009,0.371492,0.604539,-0.220131,1.089815,0.735821,-0.625678,0.386726,2569.874
3,CVANON0238,0.371492,0.604539,-0.865650,0.220367,2.001951,-0.431466,-1.998586,685.200
4,CVANON0269,0.371492,0.604539,1.070907,-0.649081,0.313777,-0.508129,0.820419,652.800
...,...,...,...,...,...,...,...,...,...
283,CVANON0311,0.371492,0.604539,1.070907,0.220367,1.720589,-0.416134,0.386726,41.600
284,CVANON0318,-1.440052,-1.383111,-2.156688,1.959263,-0.530310,-0.326694,-0.263813,164.000
285,CVANON0210,1.277264,1.598363,-0.220131,1.089815,-0.389628,7.955434,1.904652,2292.700
286,CVANON0206,0.371492,0.604539,1.070907,-0.649081,0.454458,0.031065,1.687805,82.500


In [49]:
# Apply the statistics learned on the training set to the prediction set.
# The columns are taken from the fitted scaler (feature_names_in_) so they are
# matched to the training features by name and order, not by position.
# NOTE: this overwrites df_predict, so re-running the cell would scale values
# that are already scaled; reload the data before re-running.
feature_cols = list(scaler.feature_names_in_)
df_predict[feature_cols] = scaler.transform(df_predict[feature_cols])
df_predict

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages
0,CVANON0002,-3.251596,-1.383111,-0.865650,-2.387977,-0.670991,-0.753450,0.169880
1,CVANON0009,-3.251596,-1.383111,-0.220131,-1.518529,0.032415,-0.753450,0.169880
2,CVANON0033,-0.534280,-0.389286,1.070907,-0.649081,-0.248947,-0.600124,0.169880
3,CVANON0010,-0.534280,-0.389286,-0.220131,1.089815,0.032415,-0.743228,-0.697506
4,CVANON0015,-3.251596,-1.383111,-2.156688,1.959263,-0.952353,-0.753450,-0.263813
...,...,...,...,...,...,...,...,...
76,CVANON0018,1.277264,0.604539,0.425388,0.220367,2.142632,0.209945,0.820419
77,CVANON0019,-0.534280,-0.389286,-1.511169,-0.649081,0.454458,-0.076262,-0.046967
78,CVANON0070,-0.534280,-0.389286,-0.865650,0.220367,-0.670991,-0.589902,-0.263813
79,CVANON0017,0.371492,-0.389286,-1.511169,-0.649081,0.173096,-0.030265,0.603573


In [50]:
# Sanity check of the scaled training set: feature means should be ~0.
# The reported std is slightly above 1 because describe() uses the sample
# standard deviation (ddof=1) while StandardScaler divides by the population
# standard deviation (ddof=0). points_9m is the target and was not scaled.
df_train.describe()

Unnamed: 0,professionality,visual,grammar,ai_likelihood,education,experience,languages,points_9m
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,-1.064177e-16,-4.133918e-16,-3.6836890000000003e-17,-1.146037e-16,1.227896e-16,-5.730183e-17,1.268826e-16,1068.521634
std,1.002312,1.002312,1.002312,1.002312,1.002312,1.002312,1.002312,1099.994636
min,-5.063141,-4.364585,-3.447726,-2.387977,-1.374396,-0.7534496,-1.998586,1.0
25%,-0.5342803,-0.3892862,-0.8656498,-0.649081,-0.6709907,-0.6103459,-0.4806598,161.7
50%,0.3714918,-0.3892862,-0.2201309,0.220367,-0.2489473,-0.3496927,0.1698798,670.265
75%,0.3714918,0.6045386,1.070907,1.089815,0.4544584,0.3070512,0.6035728,1706.2738
max,2.183036,2.592188,1.716426,2.828711,3.690125,7.955434,3.205731,5511.0


In [51]:
# Summary of the scaled prediction set. It was transformed with statistics
# fitted on df_train, so its means and stds need not be exactly 0 and 1.
df_predict.describe()

Unnamed: 0,professionality,visual,grammar,ai_likelihood,education,experience,languages
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,-0.265903,-0.119358,-0.514998,0.005688,0.150518,-0.316693,0.185942
std,1.202022,1.006322,0.79748,1.015161,1.333034,0.541653,0.59955
min,-3.251596,-3.370761,-2.156688,-2.387977,-1.374396,-0.75345,-1.564892
25%,-0.53428,-0.389286,-0.86565,-0.649081,-0.670991,-0.702341,-0.263813
50%,0.371492,-0.389286,-0.86565,0.220367,-0.248947,-0.45191,0.16988
75%,0.371492,0.604539,-0.220131,1.089815,0.454458,-0.140148,0.603573
max,2.183036,1.598363,1.070907,2.828711,5.65966,2.68104,1.687805


# Export results

In [52]:
# Persist both processed frames next to the raw inputs.
# NOTE(review): df_train still carries its non-contiguous row labels from the
# earlier dropna, and the default index handling writes them with the data;
# confirm downstream readers expect that (otherwise pass index=False).
df_train.to_parquet(path="data/df_train_cleaned.parquet")
df_predict.to_parquet(path="data/df_predict_cleaned.parquet")