In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from nltk.stem import WordNetLemmatizer
import copy

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')  # For POS tagging

# Load data
train_data = pd.read_csv('train.txt', delimiter=' ::: ', engine='python')
test_data = pd.read_csv('test.txt', delimiter=' ::: ', engine='python')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Qasem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qasem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Qasem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Qasem\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Importing train data and
Preprocessing

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Preprocess data

preprocess_data = copy.deepcopy(train_data.head(1000))
preprocess_data['description'] = preprocess_data['description'].apply(preprocess_text)
data = preprocess_data


In [10]:
# from imblearn.under_sampling import RandomUnderSampler

# X_train = copy.deepcopy(data['description'])
# y_train = copy.deepcopy(data['genre'])

# # تبدیل داده‌ها به فرمت مناسب
# X_train = X_train.values.reshape(-1, 1)

# # ایجاد نمونه‌های کاهشی
# rus = RandomUnderSampler(random_state=42)
# X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# # تبدیل داده‌های کاهشی به DataFrame
# resampled_train_data = pd.DataFrame({
#     'description': X_resampled.flatten(),
#     'genre': y_resampled
# })

# des = []
# genre = []
# for i in range(len(X_resampled)):
#     des.append(X_resampled[i][0])
#     genre.append(y_resampled[i][0])

# n = pd.DataFrame({'description':des, 'genre':y_resampled})
# data = n

In [4]:

# Precompute IDF values
def compute_idf(corpus):
    idf_dict = defaultdict(int)
    for document in corpus:
        unique_words = set(document.split())
        for word in unique_words:
            idf_dict[word] += 1
    total_documents = len(corpus)
    idf_dict = {word: np.log10(total_documents / count) for word, count in idf_dict.items()}
    return idf_dict

# Compute TF-IDF
corpus = data['description'].tolist()

idf_dict = compute_idf(corpus)

def compute_tf(text):
    tokens = text.split()
    tf_text = Counter(tokens)
    tf_text = {word: count / len(tokens) for word, count in tf_text.items()}
    return tf_text

# tfidf_matrix = []
# for text in corpus:
#     tfidf_text = {}
#     computed_tf = compute_tf(text)
#     for word in computed_tf:
#         tfidf_text[word] = computed_tf[word] * idf_dict.get(word, 0)
#     tfidf_matrix.append(tfidf_text)

# # Convert the tfidf_matrix to a consistent format
# unique_words = set(word for doc in tfidf_matrix for word in doc.keys())
# word_index = {word: i for i, word in enumerate(unique_words)}
# tfidf_vectors = []
# for doc in tfidf_matrix:
#     vector = np.zeros(len(unique_words))
#     for word, tfidf in doc.items():
#         vector[word_index[word]] = tfidf
#     tfidf_vectors.append(vector)

# tfidf_vectors = np.array(tfidf_vectors)


['tf.pkl']

In [20]:
class knn_model:

    def euclidean_distance(self,a, b):
        return np.sqrt(np.sum((a - b) ** 2))

    def fit(self):
        tfidf_matrix = []
        for text in corpus:
            tfidf_text = {}
            computed_tf = compute_tf(text)
            for word in computed_tf:
                tfidf_text[word] = computed_tf[word] * idf_dict.get(word, 0)
            tfidf_matrix.append(tfidf_text)

        # Convert the tfidf_matrix to a consistent format
        unique_words = set(word for doc in tfidf_matrix for word in doc.keys())
        word_index = {word: i for i, word in enumerate(unique_words)}
        tfidf_vectors = []
        for doc in tfidf_matrix:
            vector = np.zeros(len(unique_words))
            for word, tfidf in doc.items():
                vector[word_index[word]] = tfidf
            tfidf_vectors.append(vector)

        self.tfidf_vectors = np.array(tfidf_vectors)

    # KNN predict function
    def knn_predict(self, x_test, k=5):
        distances = []
        for i in range(len(X_train)):
            dist = self.euclidean_distance(X_train[i], x_test)
            distances.append((dist, y_train.iloc[i]))
        
        distances.sort(key=lambda x: x[0])
        neighbors = distances[:k]
        output_values = [neighbor[1] for neighbor in neighbors]
        prediction = Counter(output_values).most_common(1)[0][0]
        return prediction


k = knn_model()
tfidf_vectors = k.fit()       
X_train, X_val, y_train, y_val = train_test_split(tfidf_vectors, data['genre'], test_size=0.3, random_state=42)


# Train-test split function
def train_test_split(X, y, test_size=0.2, random_state=42):
    np.random.seed(random_state)
    indices = np.random.permutation(len(X))
    test_size = int(len(X) * test_size)
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]
    X_train = X[train_indices]
    X_val = X[test_indices]
    y_train = y.iloc[train_indices]
    y_val = y.iloc[test_indices]
    return X_train, X_val, y_train, y_val




In [22]:


# y_pred = [k.knn_predict(X_train, y_train, x) for x in X_val]
# joblib.dump(k)
# # Classification report function
# def classification_report(y_true, y_pred):
#     labels = list(set(y_true))
#     report = {}
#     for label in labels:
#         true_positive = sum((y_true == label) & (y_pred == label))
#         false_positive = sum((y_true != label) & (y_pred == label))
#         false_negative = sum((y_true == label) & (y_pred != label))
#         precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0
#         recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0
#         f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
#         report[label] = {
#             'precision': precision,
#             'recall': recall,
#             'f1-score': f1_score
#         }
#     return report

# print(classification_report(np.array(y_val), np.array(y_pred)))

In [23]:
# Preprocess and vectorize function
def preprocess_and_vectorize(description, corpus):
    description = preprocess_text(description)
    computed_tf = compute_tf(description)
    tfidf_vector = np.zeros(len(unique_words))
    for word in description.split():
        if word in word_index:
            tfidf_vector[word_index[word]] = computed_tf[word] * idf_dict.get(word, 0)
    return tfidf_vector

# Predict genre function
def predict_genre(description, X_train, y_train, corpus):
    description_vector = preprocess_and_vectorize(description, corpus)
    k.fit(X_train, y_train)
    predicted_genre = k.knn_predict( description_vector)
    return predicted_genre

# Test prediction
index = 5774
new_description = test_data['description'][index]
predicted_genre = predict_genre(new_description, X_train, y_train, corpus)
print(f"Predicted Genre: {predicted_genre}")
print(test_data['genre'][index])

NameError: name 'x_test' is not defined