# In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Fetch the NLTK stop-word corpus; preprocess_text() reads it through
# stopwords.words('english').
nltk.download('stopwords')

# Function to read training data
def read_train_data(file_path):
    """Parse a ``:::``-delimited training file into a list of records.

    Every usable line carries four fields separated by ``:::``. The first is
    the record id, the third is the genre label, and the second and fourth are
    free text (presumably title and description -- confirm against the data
    set). Surrounding whitespace is stripped from each field.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of dicts with the keys ``'id'``, ``'text'`` (second and fourth
        fields joined by a single space) and ``'genre'``, in file order.
        Lines with fewer than four fields, including blank lines, are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=3 keeps any further ':::' inside the last field, so a
            # line whose free text contains the delimiter is still parsed
            # instead of being silently dropped.
            parts = line.strip().split(':::', 3)
            if len(parts) != 4:
                continue
            record_id, lead_text, genre, tail_text = (p.strip() for p in parts)
            data.append({
                'id': record_id,
                'text': lead_text + ' ' + tail_text,
                'genre': genre,
            })
    return data

# Function to read test data
def read_test_data(file_path):
    """Parse a ``:::``-delimited, unlabeled test file into a list of records.

    Every usable line carries three fields separated by ``:::``. The first is
    the record id; the second and third are free text (presumably title and
    description -- confirm against the data set). Surrounding whitespace is
    stripped from each field.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of dicts with the keys ``'id'`` and ``'text'`` (second and
        third fields joined by a single space), in file order. Lines with
        fewer than three fields, including blank lines, are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=2 keeps any further ':::' inside the last field, so a
            # line whose free text contains the delimiter still yields a
            # record (and therefore a prediction) instead of being dropped.
            parts = line.strip().split(':::', 2)
            if len(parts) != 3:
                continue
            record_id, lead_text, tail_text = (p.strip() for p in parts)
            data.append({
                'id': record_id,
                'text': lead_text + ' ' + tail_text,
            })
    return data

# Matches every character that is neither an ASCII letter nor whitespace.
# Compiled once because preprocess_text() runs for every document.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use by _get_text_resources(); kept lazy so nothing touches
# the NLTK corpus at import time.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        # Build both objects before storing either, so a failed corpus lookup
        # leaves the cache empty and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES.update(stemmer=PorterStemmer(), stop_words=stop_words)
    return _TEXT_RESOURCES['stemmer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, split on whitespace, drop English stop
    words, and Porter-stem the remaining words.

    Note that punctuation is deleted rather than replaced by a space, so
    ``"well-known"`` becomes the single token ``"wellknown"`` before stemming.

    The stemmer and the stop-word set are created once and reused across
    calls instead of being rebuilt for every document.

    Args:
        text: The raw string to clean.

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives.
    """
    stemmer, stop_words = _get_text_resources()
    letters_only = _NON_LETTER_RE.sub('', text.lower())
    return ' '.join(
        stemmer.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# End-to-end run: load -> clean -> TF-IDF -> logistic regression -> write file.
# All intermediate results stay in module-level names so they remain available
# after this section has executed.
print("Loading and preprocessing data...")
train_data = read_train_data('/train_data.txt')
test_data = read_test_data('/test_data.txt')

# Both readers skip lines they cannot parse, so a wrongly formatted file yields
# an empty list rather than an error. Stop here with a message that names the
# file; scikit-learn would otherwise reject the empty input further down with
# a less specific ValueError.
if not train_data:
    raise ValueError("No training records could be parsed from '/train_data.txt'")
if not test_data:
    raise ValueError("No test records could be parsed from '/test_data.txt'")

# Cleaned documents and their labels / ids, kept in matching order.
X_train = [preprocess_text(item['text']) for item in train_data]
y_train = [item['genre'] for item in train_data]
X_test = [preprocess_text(item['text']) for item in test_data]
test_ids = [item['id'] for item in test_data]

print("Vectorizing text data...")
# Unigram + bigram TF-IDF features with the vocabulary capped at 10,000 terms.
# The vectorizer is fitted on the training texts only and then reused, as is,
# to transform the test texts.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Training model...")
# One-vs-rest logistic regression over the genre labels.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument and multiclass use of the 'liblinear' solver -- confirm against the
# installed version before upgrading.
model = LogisticRegression(max_iter=1000, multi_class='ovr', solver='liblinear')
model.fit(X_train_tfidf, y_train)

print("Making predictions...")
predictions = model.predict(X_test_tfidf)

print("Saving results...")
# One "<id> ::: <genre>" line per test record, in the order the records were read.
with open('/predictions.txt', 'w', encoding='utf-8') as f:
    for movie_id, genre in zip(test_ids, predictions):
        f.write(f"{movie_id} ::: {genre}\n")

print("Predictions saved to predictions.txt!")