In [1]:
import pandas as pd 
import lightgbm as lgb 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [2]:
movies = pd.read_csv("/home/rafael/Projetos/classifier_synopsis/data/wiki_movie_plots_deduped_with_summaries.csv")

In [3]:
movies.head(3)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Carrie Nation and her followers burst into a s...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The film, just over a minute long, is composed..."


In [14]:
movies = movies[movies["Genre"] != 'unknown']

In [20]:
pd.DataFrame(movies["Genre"].unique())

Unnamed: 0,0
0,western
1,comedy
2,short
3,short action/crime western
4,short film
...,...
2259,sport film
2260,"animation, produced by glukoza production"
2261,"adventure, romance, fantasy film"
2262,ero


In [21]:
movies['Genre'].value_counts()

Genre
drama                                                 5964
comedy                                                4379
horror                                                1167
action                                                1098
thriller                                               966
                                                      ... 
action / historical                                      1
fantasy, drama, children's, sci-fi, adventure, spy       1
drama, action, military, spy                             1
drama, kaiju, mecha, space opera, sci-fi, action         1
historical, drama, comedy-drama, romance, youth          1
Name: count, Length: 2264, dtype: int64

In [25]:
top_genres = movies['Genre'].value_counts().head(20).index
top_genres

Index(['drama', 'comedy', 'horror', 'action', 'thriller', 'romance', 'western',
       'crime', 'adventure', 'musical', 'crime drama', 'romantic comedy',
       'science fiction', 'film noir', 'mystery', 'war', 'animation',
       'comedy, drama', 'sci-fi', 'family'],
      dtype='object', name='Genre')

In [26]:
movies = movies[movies['Genre'].isin(top_genres)]

In [27]:
movies.isna().any()

Release Year        False
Title               False
Origin/Ethnicity    False
Director            False
Cast                 True
Genre               False
Wiki Page           False
Plot                False
PlotSummary         False
label               False
dtype: bool

In [28]:
movies["PlotSummary"] = movies["PlotSummary"].str.lower().str.replace(r'[^a-z\s]','',regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["PlotSummary"] = movies["PlotSummary"].str.lower().str.replace(r'[^a-z\s]','',regex=True)


In [29]:
movies["PlotSummary"]

6        the film opens with two bandits breaking into ...
7        the film is about a family who move to the sub...
14       mr brown drinks several highball cocktails bef...
15       the plot is that of a black woman going to the...
16       a father and mother take their daughter dollie...
                               ...                        
34875    a small group of men search for a buried body ...
34876    the film opens with a senegalese boy named kha...
34882    two musicians salih and grkan described the ad...
34883    zafer a sailor living with his mother dnd in a...
34884    the film centres around a young woman who book...
Name: PlotSummary, Length: 20132, dtype: object

In [30]:
le = LabelEncoder()
movies['label'] = le.fit_transform(movies['Genre'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['label'] = le.fit_transform(movies['Genre'])


In [31]:
movies['label'].unique()

array([19,  3,  7,  1, 10,  5, 12,  6, 13, 14, 18, 17, 11,  4,  0,  9,  2,
       16, 15,  8])

In [32]:
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X = tfidf.fit_transform(movies['PlotSummary'])
y = movies["label"]

In [33]:
X 

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1348959 stored elements and shape (20132, 20000)>

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [37]:
params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'metric': 'multi_logloss',
    'learning_rate': 0.1,
    'num_leaves': 63,
}

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=300)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.208742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 252425
[LightGBM] [Info] Number of data points in the train set: 16105, number of used features: 9139
[LightGBM] [Info] Start training from score -2.909238
[LightGBM] [Info] Start training from score -3.644252
[LightGBM] [Info] Start training from score -4.335027
[LightGBM] [Info] Start training from score -1.525510
[LightGBM] [Info] Start training from score -4.445138
[LightGBM] [Info] Start training from score -3.568788
[LightGBM] [Info] Start training from score -3.770683
[LightGBM] [Info] Start training from score -1.216574
[LightGBM] [Info] Start training from score -4.527830
[LightGBM] [Info] Start training from score -4.066484
[LightGBM] [Info] Start training from score -2.847409
[LightGBM] [Info] Start training from score -3.762629
[LightGBM] [Info] Start training from score -4.173456
[Ligh

In [38]:
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)
print(classification_report(y_test, y_pred_labels, target_names=le.classes_))

                 precision    recall  f1-score   support

         action       0.46      0.26      0.33       220
      adventure       0.27      0.07      0.11       105
      animation       0.91      0.19      0.31        53
         comedy       0.39      0.54      0.45       876
  comedy, drama       0.00      0.00      0.00        47
          crime       0.31      0.04      0.06       114
    crime drama       0.00      0.00      0.00        93
          drama       0.40      0.73      0.52      1193
         family       0.00      0.00      0.00        43
      film noir       0.00      0.00      0.00        69
         horror       0.55      0.37      0.44       233
        musical       0.38      0.06      0.11        93
        mystery       0.50      0.13      0.21        62
        romance       0.50      0.14      0.21       185
romantic comedy       0.00      0.00      0.00        92
         sci-fi       0.25      0.02      0.04        44
science fiction       0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
