In [None]:
import pandas  as pd
import os

# Directory that receives every figure saved by this notebook.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it cannot fail
# if the directory appears between the check and the creation, and it also
# creates missing parent directories.
os.makedirs('./eda', exist_ok=True)

#  Path to the test data
TEST_PATH = '../data/Genre Classification Dataset/test_data_solution.txt'
# ID ::: TITLE ::: GENRE ::: DESCRIPTION
TRAIN_PATH = '../data/Genre Classification Dataset/train_data.txt'

# Column names are supplied explicitly, so the first line of each file is
# read as data rather than as a header.
COLUMNS = ['id', 'title', 'genre', 'description']


def load_split(path):
    """Read one ``:::``-delimited data file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns listed in ``COLUMNS``.

    Notes
    -----
    A multi-character separator is only supported by the pure-Python parser,
    hence ``engine='python'``.
    NOTE(review): the layout comment above shows spaces around ``:::``, so the
    string fields probably keep leading/trailing whitespace; downstream cells
    call ``.strip()`` on the genre where that matters.
    """
    return pd.read_csv(path, sep=':::', names=COLUMNS, engine='python')


# Read the data
train = load_split(TRAIN_PATH)
test = load_split(TEST_PATH)

In [None]:
# Check for missing values - visualize them appropriately - and handle them appropriately.
import matplotlib.pyplot as plt
import seaborn as sns

#  Visualize the missing values
# The boolean null mask is drawn as a two-colour image: one cell per
# (row, column) entry of the training frame.
null_mask = train.isnull()
fig, ax = plt.subplots()
sns.heatmap(null_mask, ax=ax, cmap='viridis', cbar=False, yticklabels=False)
ax.set_title('Missing values in train data')
plt.show()

In [None]:
# Look at the distribution of genres in the dataset. Are there any imbalanced classes?

#  Visualize the distribution of genres
# value_counts() returns genres sorted by descending frequency; using that as
# the bar order makes the most and least common genres readable at a glance
# instead of showing them in first-seen order.
genre_order = train['genre'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x='genre', data=train, order=genre_order, ax=ax)

#  Rotate the xticks
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Distribution of genres')
ax.set_xlabel('genre')
ax.set_ylabel('count')
# Keep the rotated tick labels inside the figure area.
fig.tight_layout()
plt.show()

Clearly, some genres are far more common than others. The most common genre is Drama, followed by Documentary, Comedy, Thriller, and Action. The least common genre is War, followed by History and Western. The dataset is therefore imbalanced, and we will need to take this into account when training our model.

In [None]:
# Create word clouds for the descriptions of movies for each genre to get a sense of the most common words used to describe movies in that genre.

#  Create a word cloud for each genre
from wordcloud import WordCloud

# Distinct genre labels; rows without a genre are ignored so that the string
# operations below never see NaN.
genres = train['genre'].dropna().unique()
for genre in genres:
    # Build the corpus by joining the descriptions. Calling str() on the
    # NumPy array would produce the array's repr instead, which NumPy elides
    # with '...' once an array exceeds its print threshold (1000 elements by
    # default), so large genres would be represented by only a few rows.
    descriptions = train.loc[train['genre'] == genre, 'description']
    text = ' '.join(descriptions.dropna().astype(str))
    if not text.strip():
        # Nothing to draw for this genre.
        continue
    wordcloud = WordCloud(width=800, height=400).generate(text)
    # Plot the word cloud with good visualization
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f'Word cloud for {genre}')
    ax.axis('off')
    fig.savefig(f'./eda/wordcloud_{genre.strip()}.png')
    # Only one example is shown inline; every cloud is saved to ./eda.
    # The label is normalised before comparing because the raw values
    # presumably carry padding and/or different casing (the file name above
    # already needs .strip()) - TODO confirm against the data.
    if genre.strip().lower() == 'drama':
        plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Create a word frequency distribution plot for the descriptions of movies for each genre.

#  Create a word frequency distribution plot for each genre - drop the stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# word_tokenize also needs tokenizer data; fetch it the same way the
# stop-word list is fetched. Newer NLTK releases look for 'punkt_tab',
# older ones for 'punkt', so both are requested.
try:
    word_tokenize('probe')
except LookupError:
    import nltk
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)

# Number of most frequent words shown per genre.
TOP_N_WORDS = 30

for genre in genres:
    # Tokenise each description individually. str() on the whole NumPy array
    # would give the array repr, which is elided with '...' for long arrays
    # and therefore drops most of the text.
    descriptions = train.loc[train['genre'] == genre, 'description']
    fdist = FreqDist()
    for description in descriptions.dropna().astype(str):
        # The stop-word list is lowercase, so tokens are lowercased before
        # the lookup; isalpha() discards punctuation and numeric tokens.
        fdist.update(
            token
            for token in map(str.lower, word_tokenize(description))
            if token.isalpha() and token not in stop_words
        )

    top_words = fdist.most_common(TOP_N_WORDS)
    if not top_words:
        # No countable words for this genre; nothing to plot.
        continue
    words, counts = zip(*top_words)

    # Draw with matplotlib directly. Depending on the NLTK version,
    # FreqDist.plot() may call plt.show() itself before returning, after
    # which a title/savefig would act on a fresh, empty figure.
    fig, ax = plt.subplots(figsize=(10, 5))
    positions = range(len(words))
    ax.plot(positions, counts, linewidth=2)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=90)
    ax.set_xlabel('Samples')
    ax.set_ylabel('Counts')
    ax.grid(True, color='silver')
    ax.set_title(f'Word frequency distribution plot for {genre}')
    fig.tight_layout()
    fig.savefig(f'./eda/word_frequency_distribution_{genre.strip()}.png')
    plt.close(fig)

In [None]:
# Use a technique like TF-IDF to identify the most discriminative words for each genre.

#  Use TF-IDF to identify the most discriminative words for each genre
from sklearn.feature_extraction.text import TfidfVectorizer

print('Most discriminative words for each genre\n')

# Build one pseudo-document per genre and fit a single TF-IDF model over all
# of them. Fitting a separate model per genre (as before) computes document
# frequencies inside that genre only, so it cannot tell which words set one
# genre apart from the others.
genre_corpus = [
    ' '.join(
        train.loc[train['genre'] == genre, 'description'].dropna().astype(str)
    )
    for genre in genres
]

# stop_words removes English function words; sublinear_tf dampens raw counts
# so that very frequent generic words do not outweigh genre-specific ones.
vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
X = vectorizer.fit_transform(genre_corpus)
features = (vectorizer.get_feature_names_out())

for row, genre in enumerate(genres):
    print(f'Genre: {genre}')
    # Row `row` of X holds the TF-IDF weights of this genre's pseudo-document.
    scores = X[row].toarray().ravel()
    # Three highest-weighted terms, best first.
    for i in scores.argsort()[-3:][::-1]:
        # Skip zero-weight terms (possible if a genre has almost no text).
        if scores[i] > 0:
            print(features[i])
    print('')

In [None]:
# Use a visualization tool like a scatter plot matrix to identify correlations between different genres.

#  Use a scatter plot matrix to identify correlations between different genres
from sklearn.feature_extraction.text import CountVectorizer

# A pair plot draws one panel per pair of columns, and the original code
# densified the whole document-term matrix first. With a full vocabulary that
# is not feasible in memory or time, so the plot is restricted to the most
# frequent non-stop-words and to a reproducible sample of rows.
N_TOP_WORDS = 5     # number of word-count columns in the matrix
MAX_POINTS = 2000   # upper bound on the number of plotted descriptions

labelled = train[['description', 'genre']].dropna()
plot_rows = labelled.sample(n=min(len(labelled), MAX_POINTS), random_state=0)

# The vocabulary is chosen on all descriptions; only the sample is transformed.
vectorizer = CountVectorizer(max_features=N_TOP_WORDS, stop_words='english')
vectorizer.fit(labelled['description'].astype(str))
X = vectorizer.transform(plot_rows['description'].astype(str))
features = (vectorizer.get_feature_names_out())

# Word columns are wrapped as 'count(word)' so that a vocabulary term called
# 'genre' can never be overwritten by the label column added below.
df = pd.DataFrame(X.toarray(), columns=[f'count({word})' for word in features])
# to_numpy() assigns by position; plot_rows keeps its original index, which
# would otherwise misalign with df's fresh 0..n-1 index.
df['genre'] = plot_rows['genre'].to_numpy()

grid = sns.pairplot(df, hue='genre', diag_kind='hist')
# plt.title would only label the last subplot; suptitle labels the whole grid.
grid.figure.suptitle('Scatter plot matrix', y=1.02)
grid.savefig('./eda/scatter_plot_matrix.png')
plt.show()
plt.close(grid.figure)


In [None]:
# Create a few sample movie reviews in each genre to get a sense of the language patterns.

#  Create a few sample movie reviews in each genre randomly
import numpy as np
# How many example descriptions to print per genre, and the seed that makes
# the selection identical on every run.
N_SAMPLES = 3
SAMPLE_SEED = 42

for genre in genres:
    descriptions = train.loc[train['genre'] == genre, 'description']
    print(f'Genre: {genre}')
    # Draw without replacement; capping at the group size avoids the
    # IndexError that fixed indices [0], [1], [2] raise for genres with
    # fewer than three descriptions.
    picked = descriptions.sample(
        n=min(N_SAMPLES, len(descriptions)), random_state=SAMPLE_SEED
    )
    for description in picked:
        print(description)