In [110]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [111]:
# Load the first 6000 records of the training file. Fields are separated by
# the literal ' ::: '; a separator longer than one character requires the
# Python parsing engine.
# NOTE(review): only three column names are given, yet the preview shows an
# index starting at 1 - presumably the file has a leading ID field that
# pandas uses as the index. Confirm against the raw file.
df = pd.read_csv(
    'train_data.txt',
    sep=' ::: ',
    engine='python',
    names=['Title', 'Genre', 'Description'],
    nrows=6000,
)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [112]:
# Count how many loaded rows fall into each genre (most frequent first) to
# see how imbalanced the classes are before choosing which ones to plot.
df['Genre'].value_counts()

drama          1481
documentary    1459
comedy          830
short           559
horror          235
thriller        211
action          146
western         116
reality-tv      100
family           82
adventure        82
sci-fi           79
music            75
adult            73
romance          58
animation        54
sport            51
crime            48
talk-show        41
mystery          37
biography        36
fantasy          35
musical          29
history          27
game-show        25
news             18
war              13
Name: Genre, dtype: int64

In [113]:
# Restrict the frame to the four genres used in the PCA plots that follow.
# Note that this rebinds `df`, so cells above now refer to the filtered frame
# if they are re-run.
df = df[df['Genre'].isin(['drama', 'music', 'documentary', 'western'])]

In [114]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
vec = TfidfVectorizer(stop_words='english')

In [115]:
# Learn the vocabulary from the descriptions and turn each description into
# a TF-IDF weighted row (one row per movie, one column per term).
matrix = vec.fit_transform(df['Description'])

In [116]:
# Convert the TF-IDF matrix to a dense NumPy array for the PCA cells below.
# NOTE(review): with tens of thousands of vocabulary columns this dense copy
# can be several hundred MB - watch memory if more rows or genres are added.
X = matrix.toarray()

In [117]:
# Vocabulary size, i.e. the number of TF-IDF feature columns. The fitted
# term -> column-index mapping has exactly one entry per feature.
len(vec.vocabulary_)

27288

In [118]:
# Show the learned vocabulary terms; each one corresponds to a column of the
# TF-IDF matrix.
vec.get_feature_names_out()

array(['00', '000', '000km', ..., 'żestán', 'żo', 'żte'], dtype=object)

In [119]:
# Project the dense TF-IDF matrix onto its first two principal components.
# random_state is pinned because, for a matrix this large, scikit-learn's
# default solver selection may pick the randomized SVD, which would
# otherwise make the projection (and the plot) change between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

In [120]:
# 2-D scatter of the first two principal components: one point per movie,
# coloured by genre, with the movie title shown on hover.
fig = px.scatter(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    color=df['Genre'],
    hover_name=df['Title'],
    template='plotly_dark',
    title="2 Component PCA visualization of Movie Genres",
)
fig.update_xaxes(title_text="1st Principal Component")
fig.update_yaxes(title_text="2nd Principal Component")
fig.show()

In [121]:
# Refit PCA with three components for the 3-D view; this rebinds `pca` and
# `X_pca` from the 2-component cell above.
# random_state is pinned because, for a matrix this large, scikit-learn's
# default solver selection may pick the randomized SVD, which would
# otherwise make the projection (and the plot) change between runs.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X)

In [122]:
# 3-D scatter of the first three principal components: one point per movie,
# coloured by genre, with the movie title shown on hover.
fig = px.scatter_3d(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    color=df['Genre'],
    hover_name=df['Title'],
    opacity=0.8,
    template='plotly_dark',
)
fig.update_layout(title="3 Component PCA visualization of Movie Genres")
fig.show()