# Movies - PCA

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [2]:
df = pd.read_parquet("../data/movies-clean.parquet.gzip")
df.head()

Unnamed: 0,RATING,VOTES,RunTime,TYPE,Year_From,Year_To,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,6.1,21062,121.0,Movie,2021,2021,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17870,25.0,Series,2021,2021,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.2,885805,44.0,Series,2010,2022,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.2,414849,23.0,Series,2013,2013,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,7.6,25858,50.0,Series,2020,2020,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
selected_col = [col for col in df if col.startswith('Genre_')]
selected_col

['Genre_Action',
 'Genre_Adventure',
 'Genre_Animation',
 'Genre_Biography',
 'Genre_Comedy',
 'Genre_Crime',
 'Genre_Documentary',
 'Genre_Drama',
 'Genre_Family',
 'Genre_Fantasy',
 'Genre_Film-Noir',
 'Genre_Game-Show',
 'Genre_History',
 'Genre_Horror',
 'Genre_Music',
 'Genre_Musical',
 'Genre_Mystery',
 'Genre_News',
 'Genre_Reality-TV',
 'Genre_Romance',
 'Genre_Sci-Fi',
 'Genre_Short',
 'Genre_Sport',
 'Genre_Talk-Show',
 'Genre_Thriller',
 'Genre_War',
 'Genre_Western']

In [4]:
df_filtered = df[selected_col]
print(df_filtered.shape)
df_filtered.head()

(8168, 27)


Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
pca = PCA(n_components=26)
pca_data = pca.fit_transform(df_filtered)

df_pca_data = pd.DataFrame(data=pca_data,
             columns=list(range(pca.n_components_)))

df_pca_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.364103,-0.367398,0.297763,0.178659,0.948204,0.910227,-0.170047,-0.105102,0.088398,-0.373342,...,-0.077327,-0.009228,-0.058356,0.065379,0.004435,0.003012,-0.00481,0.000894,0.000114,0.001113
1,1.38254,-0.603162,-0.14544,-0.098225,-0.016969,-0.132875,0.054349,-0.026299,-0.100659,-0.037907,...,-0.008682,0.032151,0.02884,-0.025978,-0.000422,0.002671,-0.002195,-0.000287,-0.000866,-0.001114
2,-0.649869,-0.385,-0.019572,-0.263753,0.86004,0.28355,-0.381234,-0.352113,-0.129414,-0.16157,...,-0.132205,0.02475,-0.061633,0.068512,0.010575,0.013126,-0.0022,0.006008,0.003441,0.001932
3,0.977654,0.456952,-0.534468,0.068806,0.04979,-0.666287,-0.13207,-0.136038,-0.394284,0.075062,...,-0.110125,0.060034,0.000837,0.037234,0.007718,-0.001694,0.010094,0.002539,0.006258,0.000205
4,-0.265231,-0.926324,-0.108533,0.62675,-0.474409,0.344162,0.079063,-0.038783,0.297018,0.061389,...,0.021674,-0.015462,-0.009016,0.005494,-0.001199,0.004986,-0.003317,0.002697,-0.004171,0.00234


In [6]:
pca.explained_variance_ratio_

array([0.20110827, 0.15798846, 0.12125164, 0.07614156, 0.0567102 ,
       0.05092727, 0.04344286, 0.03846535, 0.03663341, 0.03192755,
       0.02834083, 0.02348521, 0.02202164, 0.01835747, 0.01712177,
       0.0150422 , 0.01278931, 0.01163855, 0.01076579, 0.00955862,
       0.00497599, 0.00342871, 0.00297522, 0.00154872, 0.00134593,
       0.00119804])

In [7]:
np.cumsum(pca.explained_variance_ratio_)

array([0.20110827, 0.35909674, 0.48034837, 0.55648994, 0.61320014,
       0.66412741, 0.70757027, 0.74603562, 0.78266903, 0.81459658,
       0.84293741, 0.86642262, 0.88844426, 0.90680172, 0.9239235 ,
       0.93896569, 0.95175501, 0.96339356, 0.97415934, 0.98371796,
       0.98869395, 0.99212266, 0.99509788, 0.99664659, 0.99799253,
       0.99919056])