# Feature Engineering & Transformation

This section covers:
1. Normalization / scaling.
2. Feature selection & dimensionality reduction (e.g., variance thresholding, PCA).
3. Derived feature creation (e.g., per-trait means and standard deviations).
4. Optional numerosity reduction (e.g., row sampling).

In [None]:
import pandas as pd

# Visualization
import seaborn as sns

# Preprocessing & stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression

# Notebook-wide display configuration
pd.options.display.max_columns = None  # never truncate columns when rendering wide frames
sns.set_theme(style="whitegrid")       # `sns.set` is an alias of `set_theme`

In [None]:
import pandas as pd
import numpy as np

# Read the feature-engineered CSV into the working DataFrame `df`
FEATURES_CSV = "../data/processed/feature_engineered.csv"
df = pd.read_csv(FEATURES_CSV)

# Reached only if read_csv did not raise
print("✅ Dataset loaded successfully")

✅ Dataset loaded successfully


In [None]:
# Report (rows, columns) of the loaded frame
print(f"Shape: {df.shape}")

# List every column name on its own line, under a "Columns:" header
print("\nColumns:", *df.columns, sep="\n")

Shape: (1011478, 130)

Columns:
EXT1
EXT2
EXT3
EXT4
EXT5
EXT6
EXT7
EXT8
EXT9
EXT10
EST1
EST2
EST3
EST4
EST5
EST6
EST7
EST8
EST9
EST10
AGR1
AGR2
AGR3
AGR4
AGR5
AGR6
AGR7
AGR8
AGR9
AGR10
CSN1
CSN2
CSN3
CSN4
CSN5
CSN6
CSN7
CSN8
CSN9
CSN10
OPN1
OPN2
OPN3
OPN4
OPN5
OPN6
OPN7
OPN8
OPN9
OPN10
EXT1_E
EXT2_E
EXT3_E
EXT4_E
EXT5_E
EXT6_E
EXT7_E
EXT8_E
EXT9_E
EXT10_E
EST1_E
EST2_E
EST3_E
EST4_E
EST5_E
EST6_E
EST7_E
EST8_E
EST9_E
EST10_E
AGR1_E
AGR2_E
AGR3_E
AGR4_E
AGR5_E
AGR6_E
AGR7_E
AGR8_E
AGR9_E
AGR10_E
CSN1_E
CSN2_E
CSN3_E
CSN4_E
CSN5_E
CSN6_E
CSN7_E
CSN8_E
CSN9_E
CSN10_E
OPN1_E
OPN2_E
OPN3_E
OPN4_E
OPN5_E
OPN6_E
OPN7_E
OPN8_E
OPN9_E
OPN10_E
dateload
screenw
screenh
introelapse
testelapse
endelapse
IPC
country
lat_appx_lots_of_err
long_appx_lots_of_err
EXT
NEU
AGR
CON
OPN
EXT_bin
NEU_bin
AGR_bin
CON_bin
OPN_bin
EXT_mean
EXT_std
NEU_mean
NEU_std
AGR_mean
AGR_std
CON_mean
CON_std
OPN_mean
OPN_std


In [None]:
# Preview first 5 rows.
# Only the final expression of a cell is rendered automatically, so the head
# preview has to be passed to display() explicitly; as a bare expression in the
# middle of the cell its result was silently discarded.
display(df.head())

# Optional: random sample of 10 rows to see scaling and engineered features.
# random_state is fixed so the same rows are shown on every run.
df.sample(10, random_state=42)

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,EST1,EST2,EST3,EST4,EST5,EST6,EST7,EST8,EST9,EST10,AGR1,AGR2,AGR3,AGR4,AGR5,AGR6,AGR7,AGR8,AGR9,AGR10,CSN1,CSN2,CSN3,CSN4,CSN5,CSN6,CSN7,CSN8,CSN9,CSN10,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10,EXT1_E,EXT2_E,EXT3_E,EXT4_E,EXT5_E,EXT6_E,EXT7_E,EXT8_E,EXT9_E,EXT10_E,EST1_E,EST2_E,EST3_E,EST4_E,EST5_E,EST6_E,EST7_E,EST8_E,EST9_E,EST10_E,AGR1_E,AGR2_E,AGR3_E,AGR4_E,AGR5_E,AGR6_E,AGR7_E,AGR8_E,AGR9_E,AGR10_E,CSN1_E,CSN2_E,CSN3_E,CSN4_E,CSN5_E,CSN6_E,CSN7_E,CSN8_E,CSN9_E,CSN10_E,OPN1_E,OPN2_E,OPN3_E,OPN4_E,OPN5_E,OPN6_E,OPN7_E,OPN8_E,OPN9_E,OPN10_E,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err,EXT,NEU,AGR,CON,OPN,EXT_bin,NEU_bin,AGR_bin,CON_bin,OPN_bin,EXT_mean,EXT_std,NEU_mean,NEU_std,AGR_mean,AGR_std,CON_mean,CON_std,OPN_mean,OPN_std
829475,1.862315,-1.348293,1.412432,-1.744888,1.351179,-1.150482,0.875684,-1.916391,1.514271,-0.43483,-0.218665,0.678415,-1.61057,1.866938,-0.669161,-0.640861,-0.821089,-1.261494,-0.84815,-1.349401,-0.194368,1.028874,1.361614,0.954918,-0.235922,0.205662,-1.074563,0.279594,0.175806,0.374237,-1.116678,0.767934,0.986827,-0.499617,-1.284107,0.82331,0.263468,1.34575,-0.163306,1.346037,0.294703,-0.072271,-0.007657,-0.003682,0.203701,0.999629,-0.99122,-0.952469,-2.078787,-0.947774,2604.0,1086.0,1392.0,757.0,2362.0,1219.0,6692.0,1784.0,1428.0,4672.0,2061.0,4126.0,1128.0,2907.0,2278.0,2197.0,821.0,2122.0,2675.0,1190.0,4786.0,1322.0,930.0,1436.0,7679.0,2373.0,2417.0,1138.0,1163.0,1909.0,1488.0,1565.0,2886.0,3826.0,2041.0,1809.0,1654.0,4584.0,9151.0,1950.0,1709.0,5104.0,3729.0,1956.0,1360.0,2050.0,4123.0,953.0,3604.0,1596.0,2018-07-21 02:32:52,1440.0,900.0,26.0,129.0,5,1,US,37.4135,-122.1312,-0.497428,1076.45,-0.663579,1549.4,-0.571043,Low,Low,Low,Low,Low,0.0421,1.504761,-0.487404,1.047542,0.287585,0.710004,0.246962,0.971594,-0.355583,0.880521
331567,0.274994,-1.348293,1.412432,-0.119509,-0.223345,-0.332238,-1.273134,1.248791,-1.46927,-0.43483,0.52833,1.497342,0.995003,1.065941,0.119077,0.115767,-1.598144,-1.261494,-0.84815,-1.349401,-0.942068,1.028874,-0.992226,0.954918,-1.092182,1.030858,-1.074563,1.202275,1.041452,1.309218,1.45083,0.045974,0.986827,-1.301251,0.290295,0.82331,0.263468,-1.293624,1.412424,0.385968,-2.32218,0.828834,-0.007657,-0.923157,-0.818406,-0.815405,0.015552,-1.753537,0.84706,-0.947774,9429.0,5864.0,2755.0,5417.0,2916.0,5416.0,5950.0,3249.0,7316.0,8150.0,8523.0,3081.0,3917.0,7254.0,5018.0,7084.0,5549.0,3915.0,2913.0,1884.0,3017.0,5586.0,1781.0,2115.0,4130.0,3702.0,3767.0,2817.0,3351.0,2565.0,7430.0,10552.0,6614.0,5147.0,3269.0,5615.0,3816.0,5718.0,11468.0,9954.0,3731.0,6259.0,3886.0,6331.0,4250.0,6866.0,6217.0,2466.0,3267.0,2769.0,2017-02-03 12:24:55,1280.0,1024.0,44.0,255.0,16,1,US,44.4984,-84.592,-0.116314,2458.35,-0.474776,3480.9,-0.04198,High,High,Low,High,High,-0.22644,1.004349,-0.073573,1.120841,0.246656,1.09992,0.306422,0.970914,-0.589667,1.026336
236917,0.274994,1.683593,0.583972,0.693181,-0.223345,0.486006,0.159411,1.248791,-0.723385,0.336442,0.52833,0.678415,0.126478,1.866938,-0.669161,-1.397489,-1.598144,-1.261494,-0.073486,-1.349401,-0.942068,0.142692,-0.992226,0.057798,1.476597,1.030858,0.715895,0.279594,1.041452,0.374237,0.594994,-1.397947,0.986827,-1.301251,1.077495,-1.301625,0.263468,-1.293624,-0.163306,1.346037,0.294703,1.729938,-0.007657,1.835269,-1.840513,-0.815405,0.015552,-0.952469,-1.103505,0.032768,4596.0,5386.0,1827.0,1658.0,3530.0,2301.0,3830.0,3186.0,5812.0,2386.0,4420.0,2666.0,2960.0,3796.0,3316.0,2213.0,2387.0,1730.0,3533.0,2219.0,5059.0,1846.0,2383.0,1743.0,5797.0,2453.0,6816.0,4376.0,4220.0,2431.0,4066.0,3044.0,1887.0,3207.0,3676.0,4019.0,2580.0,1739.0,1743.0,1728.0,4994.0,4230.0,2572.0,4304.0,3080.0,2044.0,4526.0,5467.0,6440.0,3630.0,2016-10-12 16:10:50,1920.0,1080.0,7.0,171.0,21,1,AU,-37.9367,145.0342,-0.373908,1463.3,-0.369172,1385.9,-0.168523,Low,Low,Medium,Low,Medium,0.451966,0.682833,-0.314901,1.136352,0.318483,0.812387,-0.118893,1.119056,-0.081132,1.181364
427063,-1.312327,0.925621,-1.901406,1.505871,-1.79787,2.122495,-1.273134,-1.916391,-1.46927,1.107713,1.275325,1.497342,0.995003,-1.337052,1.695552,1.629023,1.510078,1.726053,1.475842,1.684312,2.048733,-2.515854,0.577001,-2.633561,2.332857,-2.269927,2.506353,-2.488449,-2.421133,-2.430705,0.594994,-1.397947,-2.896236,1.905282,-1.284107,0.114998,-2.423076,0.465959,1.412424,-0.574102,-2.32218,2.631043,0.915549,2.754744,-2.86262,-0.815405,-3.004765,1.450737,0.84706,-2.908857,6121.0,3205.0,1948.0,2805.0,3500.0,3092.0,5844.0,2513.0,2622.0,1544.0,1467.0,1552.0,2365.0,2821.0,2380.0,2249.0,2787.0,5165.0,2003.0,3204.0,3950.0,1015.0,3268.0,8521.0,6100.0,4215.0,3796.0,7588.0,3007.0,3295.0,3724.0,3760.0,1439.0,3739.0,4236.0,7433.0,3896.0,1519.0,3612.0,2701.0,1908.0,3953.0,5879.0,71435.0,2420.0,4424.0,4480.0,4447.0,5489.0,2669.0,2017-06-04 18:21:32,1280.0,800.0,5.0,249.0,15,1,US,31.7613,-89.6551,-0.389511,1301.95,-0.181771,1804.3,1.585206,Low,Low,Medium,Medium,High,-0.40087,1.608019,1.215148,0.923977,-0.729468,2.292823,-0.408181,1.589393,-0.331469,2.328918
134363,1.068654,0.16765,0.583972,0.693181,-1.010608,0.486006,0.159411,-0.3338,0.768385,-1.206101,-1.712655,0.678415,-0.742046,0.264943,-1.457399,-0.640861,-0.044033,-0.514608,-0.84815,0.167455,-0.194368,0.142692,2.146227,0.057798,0.620338,-0.619535,-0.179334,0.279594,0.175806,-0.560744,-1.972514,0.767934,0.016061,1.103649,-1.284107,0.82331,-0.632047,-0.413833,-0.951172,-0.574102,0.294703,-0.072271,-0.007657,-0.003682,1.225809,0.092112,-0.99122,0.649668,-2.078787,-0.947774,7266.0,1475.0,8875.0,1736.0,20930.0,5241.0,7876.0,1444.0,864.0,4032.0,3513.0,11249.0,3522.0,2066.0,2833.0,9286.0,2877.0,3051.0,1693.0,1996.0,3270.0,1793.0,4597.0,2555.0,1840.0,1531.0,8794.0,1542.0,2525.0,52067.0,971.0,2206.0,4747.0,1878.0,1988.0,850.0,2751.0,1832.0,3588.0,1109.0,2297.0,2830.0,1778.0,3020.0,1555.0,1638.0,2376.0,1864.0,1070.0,8098.0,2016-06-18 11:02:34,412.0,732.0,4.0,245.0,9,1,AU,-27.0,133.0,-0.077783,2105.5,0.697907,1097.4,-0.561876,High,Medium,High,Low,Low,0.137675,0.763149,-0.484894,0.760987,0.186848,0.784784,-0.311682,0.992147,-0.18391,0.936259
645439,-1.312327,-1.348293,-1.901406,-1.744888,-1.79787,-1.150482,-1.273134,-1.916391,-2.215156,-1.206101,-1.712655,-1.778366,-2.479094,-1.337052,-1.457399,-1.397489,-1.598144,-1.261494,-0.84815,-0.590973,-0.942068,-2.515854,-0.992226,-2.633561,-1.092182,-2.269927,-1.074563,-2.488449,-1.555487,-3.365685,-1.972514,-1.397947,-2.896236,-1.301251,-1.284107,-1.301625,-2.423076,-0.413833,-0.163306,-0.574102,-2.32218,-0.973375,-2.777275,-0.923157,-2.86262,-0.815405,-3.004765,-1.753537,-3.054069,-1.928316,1103.0,792.0,544.0,99320.0,471.0,432.0,792.0,392.0,0.0,720.0,504.0,448.0,512.0,608.0,2351.0,4336.0,824.0,568.0,992.0,423.0,416.0,1008.0,936.0,584.0,912.0,449.0,928.0,609.0,480.0,0.0,1480.0,888.0,760.0,455.0,745.0,543.0,2161.0,655.0,671.0,1161.0,416.0,504.0,856.0,441.0,1000.0,2879.0,1095.0,937.0,2312.0,14606.0,2018-03-11 17:32:51,1920.0,1200.0,14.0,160.0,4,4,MY,1.5002,103.8273,0.448241,613.1,-0.958642,653.327,-0.601846,High,Low,Low,Low,Low,-1.586605,0.370869,-1.446082,0.517354,-1.893,0.865547,-1.3728,0.870676,-2.04147,0.896796
930399,-0.518667,0.16765,-0.244487,0.693181,0.563917,-0.332238,0.875684,1.248791,-0.723385,-1.206101,-0.218665,-0.959439,0.126478,0.264943,0.907314,-0.640861,0.733022,0.232279,-0.073486,0.167455,-0.942068,1.028874,-0.992226,0.057798,-0.235922,0.205662,-1.074563,-0.643087,0.175806,0.374237,1.45083,-1.397947,0.016061,-1.301251,1.864696,-1.301625,-0.632047,-1.293624,1.412424,0.385968,0.294703,1.729938,-0.930863,1.835269,-0.818406,0.092112,0.015552,-0.1514,-0.128222,-0.947774,9024.0,4415.0,2940.0,2841.0,4219.0,6699.0,6598.0,4034.0,3037.0,4498.0,3189.0,3872.0,1313.0,12242.0,3024.0,2325.0,2872.0,10294.0,2938.0,49214.0,6083.0,4185.0,25527.0,4342.0,3743.0,1836.0,3567.0,7917.0,6088.0,3430.0,2458.0,3825.0,21439.0,6370.0,5438.0,2769.0,5353.0,36822.0,1700.0,2874.0,3549.0,7776.0,3399.0,4145.0,30985.0,3506.0,3681.0,3227.0,4184.0,3003.0,2018-09-24 01:20:06,1366.0,768.0,41.0,377.0,30,1,SE,55.7028,13.1927,-0.212033,4565.7,0.358495,4453.9,0.528812,Medium,High,High,High,High,0.052434,0.784754,0.053904,0.566089,-0.204549,0.695436,-0.079651,1.294321,0.099091,0.990635
702625,-0.518667,-1.348293,0.583972,-1.744888,1.351179,-1.150482,0.159411,-1.125096,1.514271,0.336442,-0.96566,-0.140512,-0.742046,0.264943,-1.457399,-0.640861,-0.044033,-0.514608,-0.84815,-0.590973,-0.194368,0.142692,-0.207612,-0.839321,0.620338,-0.619535,0.715895,-0.643087,-0.68984,0.374237,0.594994,0.767934,0.016061,1.103649,0.290295,0.82331,0.263468,-1.293624,0.624559,0.385968,0.294703,-0.072271,-0.007657,-0.923157,0.203701,0.092112,0.015552,0.649668,0.84706,0.032768,6467.0,3419.0,2699.0,3285.0,1916.0,1117.0,7472.0,2856.0,3571.0,2768.0,3482.0,5882.0,2917.0,3357.0,3201.0,2582.0,3916.0,11356.0,2581.0,1886.0,1014.0,2932.0,2336.0,3567.0,4516.0,6853.0,3938.0,4167.0,2919.0,3670.0,10060.0,3339.0,2499.0,2951.0,3619.0,5055.0,5084.0,23927.0,3935.0,4584.0,2187.0,2955.0,1967.0,4019.0,3269.0,2834.0,2602.0,2683.0,2887.0,3231.0,2018-04-17 06:13:52,375.0,667.0,17.0,208.0,1504,5,US,32.602,-85.487,-0.361581,2059.15,-0.399102,3254.45,-0.505626,Low,Medium,Medium,High,Low,-0.194215,1.151913,-0.56793,0.496399,-0.13406,0.571219,0.357661,0.660543,0.113248,0.472226
465550,-1.312327,1.683593,1.412432,1.505871,1.351179,-1.150482,-1.273134,1.248791,0.0225,1.107713,-1.712655,1.497342,-0.742046,1.866938,-1.457399,-1.397489,-1.598144,-1.261494,-0.073486,-1.349401,-0.942068,-0.74349,-0.992226,-0.839321,2.332857,1.030858,0.715895,1.202275,-0.68984,1.309218,1.45083,-1.397947,0.986827,-1.301251,1.864696,-1.301625,1.158982,-1.293624,1.412424,1.346037,1.166997,-0.973375,0.915549,-0.923157,1.225809,-0.815405,1.022325,1.450737,0.84706,1.01331,3668.0,1960.0,2808.0,2554.0,2101.0,5081.0,5202.0,2134.0,2476.0,2218.0,3398.0,6549.0,2718.0,1993.0,3123.0,1983.0,4702.0,1852.0,8672.0,2486.0,1989.0,4938.0,2943.0,4390.0,2234.0,2809.0,3796.0,3288.0,4766.0,2080.0,1970.0,4218.0,4478.0,2602.0,2032.0,3347.0,3134.0,16672.0,2648.0,3592.0,3180.0,1778.0,2368.0,4938.0,2407.0,2000.0,3586.0,2602.0,2164.0,2022.0,2017-08-15 17:53:04,1920.0,1080.0,1061.0,173.0,5,3,US,38.0,-97.0,-0.424509,1874.9,-0.464889,2236.35,-0.54786,Low,Medium,Low,Medium,Low,0.459614,1.258925,-0.622783,1.307305,0.238416,1.211685,0.292535,1.40891,0.492985,0.979287
733035,1.068654,-1.348293,0.583972,-1.744888,1.351179,-1.150482,1.591957,-1.916391,1.514271,-1.977372,-0.218665,-0.959439,0.995003,0.264943,0.907314,0.872395,0.733022,0.979166,0.701178,-0.590973,1.301032,0.142692,1.361614,-0.839321,0.620338,-0.619535,0.715895,0.279594,0.175806,-0.560744,-0.260842,0.045974,0.986827,1.103649,-1.284107,-0.593313,-2.423076,1.34575,-0.951172,0.385968,1.166997,-0.973375,0.915549,-0.923157,0.203701,-0.815405,1.022325,1.450737,-0.128222,0.032768,3234.0,1201.0,1897.0,2790.0,2139.0,2804.0,3383.0,2475.0,2219.0,2078.0,1095.0,3758.0,773.0,1778.0,3379.0,3016.0,2385.0,5499.0,2264.0,1950.0,3225.0,1986.0,1456.0,1471.0,2625.0,1973.0,2865.0,3495.0,2595.0,2520.0,2242.0,2634.0,2032.0,1784.0,5363.0,3915.0,1507.0,2167.0,1615.0,3283.0,2145.0,3945.0,1950.0,2376.0,1448.0,2467.0,1958.0,1860.0,2460.0,1912.0,2018-05-08 01:45:11,1368.0,912.0,3.0,124.0,5,1,HU,47.5,19.0833,-0.494833,1296.6,-0.686746,1328.55,-0.668488,Low,Low,Low,Low,Low,-0.202739,1.545378,0.368394,0.714353,0.257737,0.76825,-0.164434,1.18924,0.195192,0.912413


In [None]:
# Composite Big Five score columns (edit this list if the dataset names them differently)
trait_cols = ["EXT", "NEU", "AGR", "CON", "OPN"]

# NOTE(review): in the recorded output EXT/AGR/OPN have mean ≈ 0 and std 1, while
# NEU and CON have means in the thousands — the five traits do not appear to share
# a common scale; confirm against the upstream feature-engineering step.
print("Descriptive statistics for Big Five traits:")

# One row per trait, one column per summary statistic
df[trait_cols].describe().transpose()

Descriptive statistics for Big Five traits:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EXT,1011478.0,7.373227e-17,1.0,-0.583334,-0.370653,-0.24109,-0.024411,7.497549
NEU,1011478.0,2397.707,1874.574232,613.1,1414.0,1875.0,2635.05,13555.2105
AGR,1011478.0,7.912731000000001e-17,1.0,-0.958642,-0.510313,-0.269028,0.118021,6.173437
CON,1011478.0,2775.64,2176.262912,653.327,1610.3,2158.1,3080.65,15568.2565
OPN,1011478.0,-1.384728e-16,1.0,-0.941267,-0.511595,-0.274149,0.110331,6.069753


In [16]:
# Derive the "<trait>_bin" column name for each trait, keeping only those
# that actually exist in the frame
bin_cols = [
    name
    for name in (f"{trait}_bin" for trait in trait_cols)
    if name in df.columns
]

# Show the category frequencies of each binned trait
for bin_col in bin_cols:
    print(f"\nValue counts for {bin_col}:", df[bin_col].value_counts(), sep="\n")



Value counts for EXT_bin:
EXT_bin
Low       337176
High      337157
Medium    337145
Name: count, dtype: int64

Value counts for NEU_bin:
NEU_bin
Low       337163
High      337160
Medium    337155
Name: count, dtype: int64

Value counts for AGR_bin:
AGR_bin
Low       337168
High      337155
Medium    337155
Name: count, dtype: int64

Value counts for CON_bin:
CON_bin
Low       337176
High      337153
Medium    337149
Name: count, dtype: int64

Value counts for OPN_bin:
OPN_bin
Low       337183
Medium    337150
High      337145
Name: count, dtype: int64


In [20]:
from sklearn.decomposition import PCA

# NOTE(review): `item_cols` is not defined in any cell visible above this one, and
# the execution counts jump from 16 to 20 — this cell presumably relies on a cell
# that was removed or lives elsewhere. Confirm it survives Restart & Run All.

# Restrict the item features (+ engineered stats) to their numeric columns
numeric_item_cols = df[item_cols].select_dtypes(include=np.number).columns

# A float n_components in (0, 1) makes PCA keep the smallest number of components
# whose cumulative explained variance exceeds that fraction (here 95%)
pca = PCA(n_components=0.95, random_state=42)
pca_components = pca.fit_transform(df[numeric_item_cols])

# Summarize the reduction: input width, retained components, variance kept
_, n_retained = pca_components.shape
retained_variance = pca.explained_variance_ratio_.sum()

print("✅ PCA applied on numeric item features")
print(f"Original numeric item features: {len(numeric_item_cols)}")
print(f"PCA components to retain 95% variance: {n_retained}")
print("Cumulative variance explained (95% target):", retained_variance)


✅ PCA applied on numeric item features
Original numeric item features: 59
PCA components to retain 95% variance: 46
Cumulative variance explained (95% target): 0.95033122297687
