In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.models import Model
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tqdm import tqdm

In [None]:
# Load the Netflix titles catalogue and preview its first five rows.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# 'listed_in' is a comma-separated string; keep only the text before the
# first comma as the title's primary genre.
df['Genre'] = df['listed_in'].map(lambda categories: categories.partition(',')[0])
df['Genre']

In [None]:
# Bar chart of how many titles fall under each primary genre.
plt.figure(figsize=(10,6))
# Pass the column as the keyword `x`: in current seaborn releases the first
# positional parameter of countplot is `data`, so a positional Series is not
# interpreted as the categorical variable to count.
sns.countplot(x=df['Genre'])
# Genre names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# The five primary genres kept for the classification task.
genre = [
    'Dramas',
    'Comedies',
    'Documentaries',
    'Action & Adventure',
    'International TV Shows',
]
# Fraction of all titles whose primary genre is one of the five above.
len(df.loc[df['Genre'].isin(genre)]) / len(df)

In [None]:
# Number of space-separated tokens in each description, then its distribution.
df['len_desc'] = [len(text.split(' ')) for text in df['description']]
sns.displot(df['len_desc'])

In [None]:
# Keep only rows in the selected genres, and only the label and text columns.
data = df.loc[df['Genre'].isin(genre), ['Genre', 'description']]
data

In [None]:
# Features are the free-text descriptions; targets are the primary genres.
X = data['description']
y = data['Genre']
# Hold out 30% for testing, stratified so both splits keep the same genre
# proportions. A fixed random_state makes the split identical on every
# Restart & Run All, so downstream metrics are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Build the word index from the training descriptions only, so that no
# vocabulary information from the held-out test set leaks into preprocessing.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Convert each training description to a list of word ids, then pad (at the
# end) or truncate every sequence to exactly 25 ids.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded_seq = pad_sequences(train_sequences,maxlen=25,padding='post')
# Number of distinct words seen during fitting; the embedding layer adds 1 to
# this because id 0 is used for padding.
vocab_size = len(tokenizer.word_index)

In [None]:
# Encode the held-out descriptions with the already-fitted tokenizer and
# bring them to the same fixed length (25) as the training sequences.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded_seq = pad_sequences(
    sequences=test_sequences,
    padding='post',
    maxlen=25,
)

In [None]:
# Learn the genre -> integer mapping from the training labels, then one-hot
# encode both splits using that same mapping.
le = LabelEncoder()
le.fit(y_train)
train_labels = to_categorical(le.transform(y_train))
test_labels = to_categorical(le.transform(y_test))
# Show which genre each integer class id stands for.
le.classes_

In [None]:
# Input: one padded sequence of 25 word ids per description.
inputs = Input(shape=(25,))
# Map each word id to a 50-dimensional vector; +1 leaves room for padding id 0.
embedding = Embedding(input_dim=vocab_size + 1, output_dim=50)(inputs)

# Three stacked LSTM layers, each followed by dropout. The first two return
# the full sequence so the next LSTM can consume it; the last returns only
# its final state.
lstm1 = LSTM(units=50, return_sequences=True)(embedding)
dropout1 = Dropout(rate=0.2)(lstm1)
lstm2 = LSTM(units=50, return_sequences=True)(dropout1)
dropout2 = Dropout(rate=0.2)(lstm2)
lstm3 = LSTM(units=50)(dropout2)
dropout3 = Dropout(rate=0.2)(lstm3)

# Softmax over one output unit per genre class.
outputs = Dense(units=len(le.classes_), activation='softmax')(dropout3)
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# Multi-class setup: categorical cross-entropy on one-hot labels, Adam
# optimizer, accuracy reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 100 epochs in batches of 32, holding out 10% of the training
# data for validation.
model.fit(
    x=train_padded_seq,
    y=train_labels,
    batch_size=32,
    epochs=100,
    validation_split=0.1,
)

In [None]:
# Run inference once over the whole test set. The previous version called
# model.predict on the full test array again for every single sample, which
# made this step quadratic in the number of test rows.
test_probabilities = model.predict(test_padded_seq)
# The predicted class id is the position of the largest softmax probability
# in each row; kept as a list with one entry per test sample.
y_pred = list(np.argmax(test_probabilities, axis=-1))

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix

# Compare true and predicted class ids. The true ids come from the one-hot
# `test_labels` matrix; `y_test` holds the raw genre strings, so taking an
# argmax of it does not yield per-sample class ids.
confusion_matrix(np.argmax(test_labels, axis=-1), y_pred)