In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from transformers import BartTokenizer, TFBartForConditionalGeneration
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Read the raw table from disk
df = pd.read_csv('your_data.csv')

# Learn the set of categories first, then replace each category in place
# with its integer id. The fitted encoder is kept so predictions can be
# mapped back to category names later in the cell.
label_encoder = LabelEncoder().fit(df['Category'])
df['Category'] = label_encoder.transform(df['Category'])

# Text Preprocessing for 'Summary'
# English stopwords, loaded on the first call to preprocess_text and reused
# afterwards so the corpus is not re-read for every token.
_STOP_WORDS = None


def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of text.

    Tokens that are not purely alphanumeric (punctuation, symbols) and
    tokens found in NLTK's English stopword list are dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # A frozenset gives constant-time membership tests; the original
        # rebuilt and linearly scanned the stopword list for every token.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    return ' '.join(
        word for word in tokens
        if word.isalnum() and word not in _STOP_WORDS
    )

# Clean every summary before it is split and tokenized
df['Summary'] = df['Summary'].apply(preprocess_text)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['Summary'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# Tokenization using BART tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')


def _encode(texts):
    """Tokenize a Series of texts, truncating at 512 tokens and padding the batch."""
    return tokenizer(texts.tolist(), truncation=True, padding=True, max_length=512)


def _to_slices(encodings, labels):
    """Pair the tokenizer output (as a plain dict) with its labels, one element per row."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_encodings = _encode(X_train)
test_encodings = _encode(X_test)

# Only the training stream is shuffled; both are served in batches of 16
train_dataset = _to_slices(train_encodings, y_train).shuffle(1000).batch(16)
test_dataset = _to_slices(test_encodings, y_test).batch(16)

# Load BART with a sequence-classification head.
# The labels built above are one integer class id per example, so the model
# must emit one score per category. TFBartForConditionalGeneration emits
# per-token vocabulary logits, which cannot be scored against class ids.
# Imported here so this step does not depend on an edit to the import cell.
from transformers import TFBartForSequenceClassification

model = TFBartForSequenceClassification.from_pretrained(
    'facebook/bart-large',
    num_labels=len(label_encoder.classes_),  # one output per encoded category
)

# Compile the model: the head returns raw logits, hence from_logits=True
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model, validating on the held-out batches after each epoch
history = model.fit(train_dataset, validation_data=test_dataset, epochs=3)

# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f'Test Accuracy: {accuracy:.2f}')

# User input for prediction
user_input = "Your user input here"

# Apply the same cleaning that was applied to the training summaries
preprocessed_input = preprocess_text(user_input)
input_encodings = tokenizer(preprocessed_input, return_tensors='tf', truncation=True, padding=True, max_length=512)

# The tokenizer returns a BatchEncoding; Keras expects a plain dict of
# tensors, so convert it the same way the training datasets were built.
prediction = model.predict(dict(input_encodings))

# Pick the highest-scoring index along the last axis of the logits
predicted_label = np.argmax(prediction.logits, axis=-1)

# Decode the label back to the category
final_prediction = label_encoder.inverse_transform(predicted_label)
print(f'Predicted Category: {final_prediction[0]}')


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer, TFBartForConditionalGeneration
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Read the dataset from disk
df = pd.read_csv('your_file.csv')

# Learn the category vocabulary, then swap each category for its integer id.
# The fitted encoder is kept for mapping predictions back to names.
label_encoder = LabelEncoder().fit(df['Category'])
df['Category'] = label_encoder.transform(df['Category'])

# English stopwords held in a set; the Summary preprocessing below tests
# each token for membership in it.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Return the lower-cased text reduced to its alphabetic, non-stopword tokens.

    Args:
        text: The string to clean.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip anything that is not purely alphabetic (digits, punctuation)
        if not token.isalpha():
            continue
        # Skip tokens listed in the module-level stop_words set
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Clean every summary before tokenization
df['Summary'] = df['Summary'].apply(preprocess_text)

# Load the BART tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Tokenize the Summary column as NumPy arrays so rows can be selected by index
inputs = tokenizer(df['Summary'].tolist(), return_tensors='np', padding=True, truncation=True)
labels = df['Category'].values

# Split the data into training and testing sets.
# The tokenizer output is a mapping of arrays, not a row-indexable table, so
# passing it to train_test_split directly fails. Split row indices instead
# and slice every encoded field and the labels with the same indices.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.2, random_state=42
)
train_inputs = {name: values[train_idx] for name, values in inputs.items()}
test_inputs = {name: values[test_idx] for name, values in inputs.items()}
train_labels, test_labels = labels[train_idx], labels[test_idx]

# Load BART with a sequence-classification head.
# num_labels only has an effect on a classification head; the generation
# model used before produces vocabulary logits, not one score per category.
# Imported here so this step does not depend on an edit to the import cell.
from transformers import TFBartForSequenceClassification

model = TFBartForSequenceClassification.from_pretrained(
    'facebook/bart-large',
    num_labels=len(label_encoder.classes_),
)

# Compile the model with an explicit loss over integer class ids.
# The head returns raw logits, hence from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Train the model
model.fit(train_inputs, train_labels, epochs=3, batch_size=8, validation_data=(test_inputs, test_labels))

# Evaluate the model (no metrics were compiled, so this is the loss alone)
loss = model.evaluate(test_inputs, test_labels)
print(f'Test Loss: {loss}')

# Function to get user input and show model output
def get_model_output(user_input):
    """Predict the category of a single piece of user text.

    The text is cleaned with ``preprocess_text``, tokenized, and run through
    ``model`` in a forward pass. The index of the largest logit along the
    last axis is mapped back to a category with ``label_encoder``.

    Assumes ``model`` is a sequence-classification model whose logits have
    one entry per encoded category.

    Args:
        user_input: Raw text entered by the user.

    Returns:
        The array returned by ``label_encoder.inverse_transform`` (one entry
        per input sequence, i.e. a single entry here).
    """
    # Preprocess the user input the same way the training summaries were
    preprocessed_input = preprocess_text(user_input)

    # Tokenize the user input
    user_inputs = tokenizer(preprocessed_input, return_tensors='tf', padding=True, truncation=True)

    # Forward pass: generate() returns token ids and exposes no logits,
    # so it cannot be used to read class scores.
    outputs = model(**user_inputs)

    # argmax over the last axis keeps one prediction per input row instead
    # of an index into the flattened logits.
    predicted_ids = np.argmax(outputs.logits.numpy(), axis=-1)
    predicted_category = label_encoder.inverse_transform(predicted_ids)

    return predicted_category

# Example usage
user_input = input("Enter a summary: ")
predicted_category = get_model_output(user_input)

# get_model_output returns a one-element array; print the label itself
# rather than the array's repr.
print(f'Predicted Category: {predicted_category[0]}')


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Read the raw table from disk
data = pd.read_csv('your_data.csv')

# Fit the encoder on the category column, then overwrite that column with
# the corresponding integer ids. `le` is kept for decoding ids later.
le = LabelEncoder()
le.fit(data['Category'])
data['Category'] = le.transform(data['Category'])

# NLTK preprocessing for 'Summary'
# English stopwords, loaded on the first call to preprocess_text and reused
# afterwards so the corpus is not re-read for every token.
_STOP_WORDS = None


def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of text.

    Tokens that are not purely alphanumeric (punctuation, symbols) and
    tokens found in NLTK's English stopword list are dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # A frozenset gives constant-time membership tests; the original
        # rebuilt and linearly scanned the stopword list for every token.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    return ' '.join(
        word for word in tokens
        if word.isalnum() and word not in _STOP_WORDS
    )

# Replace every summary with its cleaned form (element-wise over the column)
data['Summary'] = data['Summary'].map(preprocess_text)


import tensorflow as tf
from transformers import BartTokenizer, TFBartForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset

# Tokenizer and model are loaded from the same pre-trained checkpoint
_BART_CHECKPOINT = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
model = TFBartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'Summary' field of a batch of examples.

    Args:
        examples: A mapping with a 'Summary' entry holding the text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts,
        called with max-length padding and truncation enabled.
    """
    summaries = examples['Summary']
    encoded = tokenizer(summaries, padding='max_length', truncation=True)
    return encoded

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Tokenize the data
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# NOTE(review): nothing in this cell creates a target-text column, so the
# seq2seq loss has nothing to compare against. The tokenized Summary is
# reused as the target (a reconstruction objective) purely so that training
# can run -- replace 'labels' with the real target column if one exists.
tokenized_dataset = tokenized_dataset.map(
    lambda batch: {'labels': batch['input_ids']}, batched=True
)

# Shuffle once, then carve out a held-out slice for evaluation and take at
# most 1000 of the remaining rows for training. Clamping to the dataset size
# avoids an out-of-range selection on tables with fewer rows.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
num_eval_rows = max(1, len(shuffled_dataset) // 5)
num_train_rows = min(1000, len(shuffled_dataset) - num_eval_rows)  # Modify based on your data
eval_rows = shuffled_dataset.select(range(num_eval_rows))
train_rows = shuffled_dataset.select(range(num_eval_rows, num_eval_rows + num_train_rows))

# Prepare for training: Keras cannot consume a Hugging Face Dataset directly,
# so build batched tf.data pipelines; the collator must emit NumPy arrays.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='np')
batch_size = 8
train_dataset = model.prepare_tf_dataset(
    train_rows, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)
test_dataset = model.prepare_tf_dataset(
    eval_rows, batch_size=batch_size, shuffle=False, collate_fn=data_collator
)

# Early-stopping callback (name kept from the original cell): stops training
# when the training loss has not improved for 3 epochs.
training_args = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Compile and train the model. No loss is passed, so the model's built-in
# loss is used. Batching is already done by the dataset, so batch_size is
# not passed to fit().
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
history = model.fit(
    train_dataset,
    epochs=3,
    callbacks=[training_args]
)

# Evaluate model on the held-out slice built above
eval_results = model.evaluate(test_dataset)
print(f"Evaluation Results: {eval_results}")


def predict_summary(input_text):
    """Generate text from ``input_text`` with the module-level BART model.

    Args:
        input_text: The text to feed to the model.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    encoded = tokenizer(input_text, return_tensors='tf', truncation=True, padding=True)
    generated = model.generate(encoded['input_ids'])
    # Only the first generated sequence is decoded
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)



In [None]:
# Ask the user for text, run it through the model, then show the result
user_input = input("Enter text for summarization: ")
_generated_text = predict_summary(user_input)
print("Model Output:", _generated_text)