In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import joblib
import copy
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()

# بارگذاری داده‌ها
data = pd.read_csv('train.txt', delimiter=' ::: ', engine='python')
test_data = pd.read_csv('test.txt', delimiter=' ::: ', engine='python')


In [6]:

d = test_data
# Preprocessing function
def preprocess_text(text):
  
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  tokens = word_tokenize(text)

  # Optionally experiment with stemming instead (e.g., PorterStemmer)
  filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return ' '.join(filtered_tokens)


data['description'] = data['description'].apply(preprocess_text)


In [16]:
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler
# X_train = data['description']
# y_train = data['genre']

# X_train = X_train.values.reshape(-1, 1)

# ros = RandomOverSampler(random_state=42)
# X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# resampled_train_data = pd.DataFrame({
#     'description': X_resampled.flatten(),
#     'genre': y_resampled
# })

# des = []
# genre = []
# for i in range(len(X_resampled)):
#     des.append(X_resampled[i][0])
#     genre.append(y_resampled[i][0])

# n = pd.DataFrame({'description':des, 'genre':y_resampled})
# data = n

In [7]:

data = pd.concat([d,data], ignore_index = True)
# تبدیل متون به بردارهای ویژگی با استفاده از TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X = vectorizer.fit_transform(data['description'])
y = data['genre']

# تقسیم داده‌ها به مجموعه‌های آموزشی و ارزیابی
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# آموزش مدل Naive Bayes
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)

# ارزیابی مدل
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# ذخیره مدل و بردار ساز برای استفاده در آینده
joblib.dump(model, 'knn_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


              precision    recall  f1-score   support

      action       0.30      0.48      0.37       489
   adventure       0.27      0.29      0.28       312
   animation       0.19      0.24      0.21       186
   biography       0.03      0.08      0.05        98
      comedy       0.56      0.51      0.53      3011
       crime       0.16      0.21      0.18       217
 documentary       0.75      0.64      0.69      5276
       drama       0.65      0.49      0.56      5415
      family       0.18      0.21      0.19       307
     fantasy       0.20      0.20      0.20       135
     history       0.09      0.15      0.12       109
      horror       0.55      0.67      0.60       905
       music       0.34      0.73      0.46       296
     musical       0.13      0.19      0.16        94
     mystery       0.10      0.10      0.10       135
  reality-tv       0.29      0.47      0.36       342
     romance       0.11      0.31      0.17       268
      sci-fi       0.32    

['tfidf_vectorizer.pkl']

In [19]:


# بارگذاری مدل و بردار ساز
model = joblib.load('knn_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [20]:
def predict_genre(description):
    # تبدیل توضیح جدید به بردار ویژگی
    X_new = vectorizer.transform([description])
    
    # پیش‌بینی ژانر
    predicted_genre = model.predict(X_new)
    
    return predicted_genre[0]

n = 0
for i in range(2000):
    new_description = test_data['description'][i]
    predicted_genre = predict_genre(new_description)
    print(f'The predicted genre is: {predicted_genre}',test_data['genre'][i])
    if predicted_genre == test_data['genre'][i]:
        n+=1
    i+=1
print(n)





The predicted genre is: drama comedy
The predicted genre is: adventure short
The predicted genre is: drama drama
The predicted genre is: crime crime
The predicted genre is: short short
The predicted genre is: western western
The predicted genre is: drama drama
The predicted genre is: drama drama
The predicted genre is: drama drama
The predicted genre is: short short
The predicted genre is: thriller drama
The predicted genre is: thriller thriller
The predicted genre is: drama drama
The predicted genre is: sci-fi adventure
The predicted genre is: drama comedy
The predicted genre is: thriller thriller
The predicted genre is: drama comedy
The predicted genre is: documentary drama
The predicted genre is: documentary drama
The predicted genre is: animation animation
The predicted genre is: documentary adventure
The predicted genre is: sci-fi comedy
The predicted genre is: drama drama
The predicted genre is: documentary documentary
The predicted genre is: documentary documentary
The predicted

In [10]:
def predict_and_report(csv_file):
   
    model = joblib.load('knn_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')

    new_data = pd.read_csv(csv_file, delimiter= ' ::: ',engine='python')
    new_data['description'] = new_data['description'].apply(preprocess_text)


    X_new = vectorizer.transform(new_data['description'])
    y_true = new_data['genre']

    
    y_pred = model.predict(X_new)

    
    print(classification_report(y_true, y_pred))

predict_and_report('test.txt')

              precision    recall  f1-score   support

      action       0.46      0.63      0.53       197
   adventure       0.47      0.52      0.49       116
   animation       0.42      0.64      0.51        75
   biography       0.16      0.40      0.23        40
      comedy       0.65      0.59      0.62      1117
       crime       0.33      0.53      0.41        76
 documentary       0.79      0.65      0.71      1964
       drama       0.73      0.52      0.61      2042
      family       0.36      0.51      0.42       117
     fantasy       0.44      0.54      0.49        48
     history       0.29      0.61      0.39        36
      horror       0.61      0.71      0.66       331
       music       0.37      0.75      0.49       110
     musical       0.36      0.59      0.44        41
     mystery       0.26      0.38      0.31        48
  reality-tv       0.40      0.71      0.51       133
     romance       0.24      0.69      0.36       101
      sci-fi       0.44    