In [11]:
import pandas as pd
import re
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import os
import tkinter as tk
from tkinter import messagebox

# Download the NLTK resources this notebook needs (no-op when already present).
# 'punkt' serves older NLTK releases; newer releases (3.9+) make word_tokenize
# look up 'punkt_tab' instead, so both are fetched to keep preprocessing()
# working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\FATTANI
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\FATTANI
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
# NOTE(review): the notebook imports `tkinter`, which normally ships with the
# CPython installer; the PyPI distribution named `tk` (0.1.0 per the log below)
# appears to be an unrelated package, so this install is probably unnecessary —
# TODO confirm and remove if so.
pip install tk


Defaulting to user installation because normal site-packages is not writeable
Collecting tk
  Downloading tk-0.1.0-py3-none-any.whl.metadata (693 bytes)
Downloading tk-0.1.0-py3-none-any.whl (3.9 kB)
Installing collected packages: tk
Successfully installed tk-0.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:

# Module-level NLTK helpers shared by preprocessing():
# an English stop-word lookup set and a Porter stemmer instance.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()

In [5]:
def preprocessing(text):
    """Turn raw text into a space-joined string of stemmed, non-stop-word tokens.

    Non-string input is treated as an empty string. Digit runs and every
    character that is neither a word character nor whitespace are removed, the
    text is lower-cased and tokenised with NLTK, tokens found in ``stop_words``
    are dropped, and the rest are stemmed with ``stemmer``.

    Returns:
        The cleaned text, or a single space ``' '`` when no token survives.
    """
    raw = text if isinstance(text, str) else ''
    without_digits = re.sub(r'\d+', '', raw)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    tokens = nltk.word_tokenize(without_punct.lower())
    # Stop words are filtered on the un-stemmed token, then survivors are stemmed.
    stems = (stemmer.stem(token) for token in tokens if token not in stop_words)
    joined = ' '.join(stems)
    return joined or ' '

In [6]:

def load_data(file_path, is_train=True):
    """Read a ``' ::: '``-separated, header-less text file into a DataFrame.

    Args:
        file_path: Path of the file to read.
        is_train: When truthy the file is expected to carry a GENRE field
            between TITLE and DESCRIPTION; otherwise GENRE is absent.

    Returns:
        A DataFrame with columns ID, TITLE, [GENRE,] DESCRIPTION.
    """
    columns = ['ID', 'TITLE', 'DESCRIPTION']
    if is_train:
        columns.insert(2, 'GENRE')
    # The multi-character separator requires pandas' python parsing engine.
    return pd.read_csv(file_path, delimiter=' ::: ', engine='python', names=columns)

In [7]:
# Relative file names under which the fitted TF-IDF vectorizer and the
# trained classifier are cached with joblib.
tfidf_path = 'tfidf_vectorizer.pkl'
model_path = 'logistic_regression_genre.pkl'

In [None]:

# Reuse the cached classifier and vectorizer when both artifacts are on disk;
# otherwise train from train_data.txt, cache the artifacts and report
# validation metrics.
if os.path.exists(model_path) and os.path.exists(tfidf_path):
    # SECURITY: joblib.load unpickles arbitrary objects and can execute code.
    # Only load artifact files that this notebook itself wrote.
    model = joblib.load(model_path)
    tfidf = joblib.load(tfidf_path)
    print("Loaded pre-trained model and vectorizer.")
else:
    # Load training data
    train_data = load_data('train_data.txt', is_train=True)

    # Clean and stem the plot descriptions
    train_data['clean_description'] = train_data['DESCRIPTION'].apply(preprocessing)
    y = train_data['GENRE']

    # Split the cleaned text BEFORE vectorizing so that validation documents
    # do not contribute to the vocabulary or IDF weights (fitting TF-IDF on
    # the full corpus leaks validation information into the features).
    desc_train, desc_val, y_train, y_val = train_test_split(
        train_data['clean_description'], y, test_size=0.2, random_state=42
    )

    # TF-IDF vectorization: fit on the training split only, reuse for validation
    tfidf = TfidfVectorizer(stop_words='english')
    X_train = tfidf.fit_transform(desc_train)
    X_val = tfidf.transform(desc_val)

    # Initialize and train the Logistic Regression classifier
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Save the model and TF-IDF vectorizer
    joblib.dump(model, model_path)
    joblib.dump(tfidf, tfidf_path)
    print("Trained and saved the model and vectorizer.")

    # Predict on validation set to compute accuracy measures.
    # zero_division=0 keeps the scores defined (and warning-free) for genres
    # the model never predicts on the validation split.
    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
    report = classification_report(y_val, y_val_pred, zero_division=0)

    print(f'Model Accuracy on Validation Set: {accuracy:.4f}')
    print(f'Precision on Validation Set: {precision:.4f}')
    print(f'Recall on Validation Set: {recall:.4f}')
    print(f'F1 Score on Validation Set: {f1:.4f}\n')
    # The per-genre report was previously computed but never shown.
    print(report)

# Read the unlabeled test set (ID, TITLE, DESCRIPTION).
test_data = load_data('test_data.txt', is_train=False)

# Apply the same cleaning/stemming as for training and keep it as a column.
cleaned_test_descriptions = test_data['DESCRIPTION'].apply(preprocessing)
test_data['clean_description'] = cleaned_test_descriptions

# Vectorize with the already-fitted TF-IDF (transform only, no refit),
# then predict one genre per test row.
X_test = tfidf.transform(cleaned_test_descriptions)
y_test_pred = model.predict(X_test)

# Function to predict genre
def predict_genre(user_query):
    """Predict the genre for a free-text plot description.

    Args:
        user_query: Text entered by the user.

    Returns:
        The predicted genre label, or ``None`` when no usable token remains
        after preprocessing (e.g. the query held only digits, punctuation or
        stop words).
    """
    clean_query = preprocessing(user_query)
    # preprocessing() signals "nothing left" with a lone space rather than
    # None, so a plain `is None` check can never fire; test for blank text.
    if clean_query is None or not clean_query.strip():
        return None
    query_tfidf = tfidf.transform([clean_query])
    predicted_genre = model.predict(query_tfidf)
    return predicted_genre[0]


# Function to handle GUI interaction
def on_search():
    """Read the query box, run the prediction and show the result or a warning."""
    user_query = query_entry.get("1.0", tk.END).strip()
    if not user_query:
        messagebox.showwarning("Input Error", "Please enter a valid query.")
        return

    predicted_genre = predict_genre(user_query)
    if predicted_genre is None:
        # Nothing usable in the query: clear any stale result instead of
        # displaying "Predicted Genre: None".
        result_label.config(text="Predicted Genre: ")
        messagebox.showwarning("Input Error", "Please enter a valid query.")
        return

    result_label.config(text=f'Predicted Genre: {predicted_genre}')
# Build the main window: titled, 500x300, black background.
root = tk.Tk()
root.title("Genre Prediction")
root.geometry("500x300")
root.configure(bg='black')

# Colour scheme shared by every widget (white on black).
dark_theme = {'bg': 'black', 'fg': 'white'}

# Multi-line query box, the "Search" button wired to on_search, and the result label.
query_entry = tk.Text(root, height=10, width=50, insertbackground='white', **dark_theme)
search_button = tk.Button(root, text="Search", command=on_search, **dark_theme)
result_label = tk.Label(root, text="Predicted Genre: ", font=("Helvetica", 14), **dark_theme)

# Stack the widgets top-to-bottom in this order, each with the same vertical padding.
for widget in (query_entry, search_button, result_label):
    widget.pack(pady=10)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()


Loaded pre-trained model and vectorizer.
