In [1]:
import pandas as pd
import mca

In [2]:
def preprocess_movies(df, explained_variance=1):
    """Turn the raw movies table into a numeric feature table.

    Steps, in order:
      1. Strip a trailing ``" (YYYY)"`` from ``title``; titles that do not
         match the pattern are kept as they are.
      2. One-hot encode the ``|``-separated ``genres`` column into boolean
         ``genres_<name>`` columns and drop the original ``genres`` column.
      3. Run MCA over every boolean column and keep the row factor scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``movieId``, ``title`` and ``genres``.
        The frame passed in is not modified.
    explained_variance : number, default 1
        Forwarded as the first positional argument to ``fs_r`` and echoed in
        the printed message. Presumably the share of inertia to retain
        (1 = keep everything) -- confirm against the ``mca`` documentation.

    Returns
    -------
    pandas.DataFrame
        ``movieId`` and ``title`` followed by one integer-named column per
        retained MCA factor, on the same index as ``df``.
    """
    # Use assign() instead of item assignment so the caller's frame is left
    # untouched (the year-stripped titles live only in the local result).
    title_without_year = df['title'].str.extract(r'(.*)\s\((\d{4})\)', expand=True)[0]
    df = df.assign(title=title_without_year.fillna(df['title']))

    # extract dummy variables for the genres
    one_hot_encoded = df['genres'].str.get_dummies('|').astype(bool)
    one_hot_encoded.columns = 'genres' + '_' + one_hot_encoded.columns
    df = pd.concat([df, one_hot_encoded], axis=1)
    df = df.drop(columns='genres')

    # apply mca on the boolean columns
    data = df.select_dtypes(include=bool)
    mca_obj = mca.MCA(data)
    # Reuse df's index: the column-wise concat below aligns on index labels,
    # so a fresh RangeIndex would misalign rows for any non-default index.
    mca_df = pd.DataFrame(mca_obj.fs_r(explained_variance), index=df.index)
    print(f'explained variance ratio of categorical columns: {explained_variance}')
    df = pd.concat([df[['movieId', 'title']], mca_df], axis=1)

    return df

# Load the raw movie catalogue, derive the feature table, and persist it
# (without the index column) next to the input file.
movies = pd.read_csv(filepath_or_buffer='data/movies.csv')
movie_features = movies.pipe(preprocess_movies)
movie_features.to_csv(path_or_buf='data/movie_features.csv', index=False)

# Bare last expression so the notebook renders the resulting frame.
movie_features

explained variance ratio of categorical columns: 1


Unnamed: 0,movieId,title,0,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,18
0,1,Toy Story,0.039268,0.125361,-0.749132,-0.961044,0.096545,0.193138,-0.111254,0.272728,...,0.282569,-0.187634,-0.022400,-0.087072,-0.007855,-0.071696,-0.058506,0.065733,-0.003531,0.054916
1,2,Jumanji,0.039268,0.138841,-0.648570,-1.153977,0.318046,0.215654,-0.024189,0.352185,...,0.401672,-0.154782,-0.153387,-0.388624,-0.360566,-0.258208,-0.032363,-0.000749,-0.385688,-0.080618
2,3,Grumpier Old Men,0.039268,0.111973,-0.775127,0.528985,-0.572805,-0.107470,0.016346,-0.316577,...,0.161133,0.184314,-0.323776,-0.120591,0.280047,0.028525,0.010895,-0.003964,-0.038433,-0.002496
3,4,Waiting to Exhale,0.039268,0.119104,-0.512009,0.627566,-0.250823,-0.018064,0.113547,-0.088029,...,0.192651,0.070346,-0.067178,-0.040143,0.119258,0.015443,0.027146,-0.009561,0.002256,-0.010464
4,5,Father of the Bride Part II,0.039268,0.086949,-0.813359,0.198419,-0.749539,-0.134186,-0.242839,-0.599110,...,-0.155836,-0.057572,0.093625,0.197073,-0.236524,0.021729,0.069160,0.081727,0.050081,0.039669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,0.039268,0.129810,-0.463801,-0.717611,0.065694,0.143417,-0.088780,-0.022175,...,0.096700,-0.118705,-0.036235,-0.134324,0.032801,0.006917,-0.247704,0.215154,0.465347,-0.044365
9738,193583,No Game No Life: Zero,0.039268,0.118566,-0.758126,-0.780131,-0.107518,0.166745,-0.126953,0.227710,...,0.266754,-0.179004,-0.047302,-0.077422,0.000020,0.063636,-0.400865,0.358892,0.435827,0.124076
9739,193585,Flint,0.039268,0.133365,0.014227,0.824729,0.393142,0.160747,0.307949,0.369068,...,0.255686,-0.157590,0.446019,0.120754,-0.202322,-0.010722,0.059648,-0.020755,0.083632,-0.026398
9740,193587,Bungo Stray Dogs: Dead Apple,0.039268,0.143438,-0.283708,-1.035880,0.431726,0.263173,-0.107562,0.067183,...,-0.024899,-0.177030,0.125750,0.114205,0.705045,0.115588,-0.026410,0.016563,0.821619,-0.036462
