Generate datasets using a local LLM served by Ollama

In [None]:
import os
import pandas as pd
from langchain.llms import Ollama

# Model selection: the Ollama model name is read from the MODEL environment
# variable and falls back to "mistral".
model = os.environ.get("MODEL", "mistral")
callbacks = []

llm = Ollama(model=model, callbacks=callbacks)

csv_file_path = 'total_2793_dataset_processed.csv.csv'
output_file_path = 'processed_Datasets.csv'
output_columns = ["user_content", "assistant_content", "is_liked"]

chunk_size = 100

# Count the data rows by streaming the file in chunks. Iterating directly over
# a DataFrame yields its column labels, so the earlier
# `sum(1 for row in pd.read_csv(...)) - 1` counted columns instead of rows and
# made the progress percentage meaningless.
total_rows = sum(len(chunk) for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size))

# Results are appended chunk by chunk. The header row is written only when the
# output file does not exist yet (or is empty), so the CSV keeps its column
# names without repeating them when appending to the output of an earlier run.
write_header = not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0

processed_rows = 0
for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size):
    results = []
    for index, row in chunk.iterrows():
        user_content = row['user_content']
        query = f"Generate only three similar questions based on the original context: {user_content}"
        answer = llm(query)
        assistant = row['assistant_content']
        is_liked = row['is_liked']
        # The generated questions replace the original user text; the
        # assistant answer and the like flag are carried over unchanged.
        results.append({
            "user_content": answer,
            "assistant_content": assistant,
            "is_liked": is_liked
        })
        print(answer, end=" ")

    # Naming the columns keeps the layout stable even for an empty chunk.
    results_df = pd.DataFrame(results, columns=output_columns)
    results_df.to_csv(output_file_path, mode='a', index=False, header=write_header)
    write_header = False

    processed_rows += len(chunk)
    # Guard against a file that has a header but no data rows.
    progress = (processed_rows / total_rows) * 100 if total_rows else 100.0
    print(f"Progress: [{'#' * int(progress / 2)}{' ' * (50 - int(progress / 2))}] {progress:.2f}% complete", end='\r')

print(f"\nQuestion generation completed and saved to '{output_file_path}'.")


In [None]:
# Alternative: use a hosted Ollama instance, but it may be unreliable under heavy traffic,
# or use Hugging Face.

The prompt can be written either to generate new text or to rewrite the existing text.

In [None]:

!pip install transformers pandas

import pandas as pd
from transformers import pipeline

# Load the dataset whose 'user' and 'assistant' columns are rewritten by the loop further down.
file_path = 'total_2793_dataset_processed.csv'
dataset = pd.read_csv(file_path)

# Text-generation pipeline passed to clean_text(); max_length=200 is forwarded to the pipeline.
# NOTE(review): presumably max_length caps prompt + generated tokens together — confirm, since
# long inputs may then leave little or no room for the rewritten text.
generator = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", max_length=200)

def clean_text(text, generator):
    """Ask a text-generation pipeline to rewrite ``text`` in a cleaner form.

    Args:
        text: The raw text to rewrite. Values that are not strings (for
            example the NaN pandas uses for empty CSV cells) and blank
            strings are returned unchanged without calling the model.
        generator: A callable behaving like a transformers text-generation
            pipeline: called with the prompt, it returns a list of dicts
            that carry a ``'generated_text'`` entry.

    Returns:
        The generated rewrite with surrounding whitespace stripped, or the
        original ``text`` when it is not a non-blank string.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    prompt = f"Rewrite the following text to remove unnecessary phrases and refine the content: {text}"
    # return_full_text=False requests the continuation only. By default the
    # pipeline echoes the prompt in front of the generated text, which would
    # store the instruction sentence itself in the dataset.
    response = generator(prompt, num_return_sequences=1, return_full_text=False)[0]['generated_text']

    # Defensive: drop the prompt if a backend echoes it anyway.
    if response.startswith(prompt):
        response = response[len(prompt):]

    return response.strip()

# Rewrite both conversation columns row by row. Within a row the two raw
# values are read first, then the user text is rewritten before the
# assistant text.
for row_label in dataset.index:
    raw_pair = {column: dataset.at[row_label, column] for column in ('user', 'assistant')}
    for column, raw_value in raw_pair.items():
        dataset.at[row_label, column] = clean_text(raw_value, generator)

# Persist the rewritten dataset under a new name, without the index column.
cleaned_file_path = 'total_2793_dataset_cleaned.csv'
dataset.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to {cleaned_file_path}.")

Filling null values using sentiment analysis

In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer

# Load the merged dataset; the same path is overwritten at the end of this cell.
file_path = 'merged_dataset.csv'
df = pd.read_csv(file_path)

# No model name is given, so the pipeline uses whatever default checkpoint the library selects
# for the 'sentiment-analysis' task.
sentiment_pipeline = pipeline('sentiment-analysis')
# Load a tokenizer by the name recorded in the pipeline model's config.
# NOTE(review): `_name_or_path` is a private attribute — confirm it is stable for the installed version.
tokenizer = AutoTokenizer.from_pretrained(sentiment_pipeline.model.config._name_or_path)

def analyze_sentiment(content):
    """Classify ``content`` with the sentiment pipeline and return a binary score.

    Args:
        content: The text to classify.

    Returns:
        1 when the pipeline labels the text 'POSITIVE', otherwise 0.
    """
    # Let the pipeline truncate over-long inputs itself. The earlier
    # encode()/decode() round trip kept the tokenizer's special tokens in the
    # decoded string, so they reached the model a second time as part of the
    # text once the pipeline tokenized it again.
    result = sentiment_pipeline(content, truncation=True)[0]
    return 1 if result['label'] == 'POSITIVE' else 0

# Only rows whose 'is_liked' value is missing need a prediction, so the (slow)
# sentiment pipeline is run on those rows alone.
missing_liked = df['is_liked'].isnull()

if missing_liked.any():
    # Empty text cells are read as NaN; replace them with '' so the
    # concatenation yields a string for every row instead of passing NaN on
    # to the tokenizer.
    combined_content = (
        df.loc[missing_liked, 'user_content'].fillna('').astype(str)
        + ' '
        + df.loc[missing_liked, 'assistant_content'].fillna('').astype(str)
    )

    # Fill the gaps with the predicted sentiment (1 = positive, 0 = otherwise).
    df.loc[missing_liked, 'is_liked'] = combined_content.apply(analyze_sentiment)

# Normalise every value to the strings 'true' / 'false'.
df['is_liked'] = df['is_liked'].map({1: 'true', 0: 'false', 'true': 'true', 'false': 'false'})

# Overwrite the input file with the completed column.
df.to_csv(file_path, index=False)

print("Sentiment analysis completed and 'is_liked' column updated for null values.")


Text preprocessing: tokenization, removing stop words and words ending in "ing", lemmatization

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this cell relies on (tokenizer data, the stop-word lists and WordNet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Raw dataset; its 'user' and 'assistant' columns are cleaned further down.
file_path = 'total_2793_dataset - total_dataset.csv.csv'
dataset = pd.read_csv(file_path)

# Quick look at the first rows and the available column names.
print(dataset.head())
print(dataset.columns)

# Shared lemmatizer instance used by preprocess_text().
lemmatizer = WordNetLemmatizer()

# Cache for the stop-word set so it is built once instead of on every call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, building it on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise one text cell: lowercase, keep letters only, drop "-ing" words
    and stop words, then lemmatize.

    Args:
        text: The raw cell value. Anything that is not a string (for example
            NaN from an empty CSV cell) is treated as missing text.

    Returns:
        The remaining lemmatized tokens joined by single spaces; an empty
        string for missing text.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Literal escape sequences (a backslash followed by n, r or t) are
    # separators, not letters; without this step "introduction\nthe" collapsed
    # into the single token "introductionnthe".
    text = re.sub(r'\\[nrt]', ' ', text)
    # Replace every non-letter with a space rather than deleting it, so words
    # separated only by punctuation or digits are not fused together. This also
    # covers digits, which made the former separate \d+ pass dead code.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Drop whole words ending in "ing".
    text = re.sub(r'\b\w+ing\b', '', text)

    stop_words = _english_stop_words()
    tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Clean both conversation columns; the processed text overwrites the originals.
dataset['user'] = dataset['user'].apply(preprocess_text)
dataset['assistant'] = dataset['assistant'].apply(preprocess_text)

# Preview the result. The processed values live in 'user' and 'assistant';
# no '*_processed' columns are ever created, so selecting them raised a KeyError.
print(dataset[['user', 'assistant']].head())

processed_file_path = 'total_2793_dataset_processed.csv'
dataset.to_csv(processed_file_path, index=False)

print(f"Processed data saved to {processed_file_path}")


Checking for rows with noise using a Hugging Face text classifier

In [None]:
import pandas as pd
from transformers import pipeline

# Load the preprocessed dataset whose 'user' and 'assistant' columns are scanned below.
file_path = 'total_2793_dataset_processed.csv'
dataset = pd.read_csv(file_path)

# Text-classification pipeline used as the noise detector; identify_noise() checks its
# 'NEGATIVE' label and score.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def identify_noise(text, classifier):
    """Flag ``text`` as noise when the classifier is confident it is negative.

    Args:
        text: The text to classify. Non-string or blank values (for example
            NaN read back from an empty CSV cell) are never flagged and are
            not sent to the classifier.
        classifier: A callable behaving like a transformers
            text-classification pipeline: it returns a list whose first item
            has a 'label' and a 'score'.

    Returns:
        True when the first prediction is 'NEGATIVE' with a score above 0.8,
        otherwise False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    # truncation=True keeps texts longer than the model's input limit from
    # raising inside the pipeline.
    result = classifier(text, truncation=True)[0]
    return result['label'] == 'NEGATIVE' and result['score'] > 0.8

# Index labels of every row in which the user text or the assistant text is flagged.
noisy_rows = []

for row_label, record in dataset.iterrows():
    # Both columns are always classified, even when the first one is already flagged.
    flags = [identify_noise(record[column], classifier) for column in ('user', 'assistant')]
    if any(flags):
        noisy_rows.append(row_label)

if not noisy_rows:
    print("No noise detected in the dataset.")
else:
    print(f"Noisy rows found at indices: {noisy_rows}")


Listing and removing words that hold no significant meaning, using a text classifier. Words in a row that are meaningless in context are removed as well.


In [None]:
import pandas as pd
from transformers import pipeline

# Load the preprocessed dataset produced by the text-preprocessing step.
dataset = pd.read_csv('total_2793_dataset_processed.csv')

# NOTE(review): "distilbert-base-uncased" is the name of a base checkpoint rather than a
# classification fine-tune — presumably its classification head is untrained; confirm.
classifier = pipeline("text-classification", model="distilbert-base-uncased")

def remove_noise(text):
    """Return the words of ``text`` whose paired prediction is not labelled 'LABEL_FOR_NOISE'.

    The classifier is called once on the whole text and its predictions are
    zipped with the whitespace-separated words; zip stops at the shorter of
    the two sequences.
    """
    # NOTE(review): if the pipeline returns a single prediction for a single string, only the
    # first word can survive the zip. The sample output later in the notebook shows one-word
    # '*_cleaned' values, which is consistent with that — confirm the intended behaviour.
    predictions = classifier(text)
    # NOTE(review): 'LABEL_FOR_NOISE' looks like a placeholder label — verify against the
    # labels the model actually emits.
    return ' '.join([word for word, pred in zip(text.split(), predictions) if pred['label'] != 'LABEL_FOR_NOISE'])

# Store the result in new columns; the next cell reads them from the saved CSV.
dataset['user_cleaned'] = dataset['user'].apply(remove_noise)
dataset['assistant_cleaned'] = dataset['assistant'].apply(remove_noise)

dataset.to_csv('cleaned_dataset.csv', index=False)


In [None]:
import pandas as pd

# Read back the file that carries the 'user_cleaned' / 'assistant_cleaned' columns.
df = pd.read_csv('cleaned_dataset.csv')

def remove_cleaned_words(text, cleaned_words):
    """Remove every whole word listed in ``cleaned_words`` from ``text``.

    Matching is done on whitespace-separated words, so removing "main" leaves
    "domain" and "maintain" untouched (plain ``str.replace`` used to cut the
    substring out of them).

    Args:
        text: Whitespace-separated text. Non-string values (e.g. NaN) are
            returned unchanged.
        cleaned_words: Whitespace-separated words to remove. ``None`` and NaN
            mean "nothing to remove"; previously NaN was stringified and the
            word "nan" was removed from the text.

    Returns:
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return text

    # NaN is the only float that differs from itself.
    is_missing = cleaned_words is None or (
        isinstance(cleaned_words, float) and cleaned_words != cleaned_words
    )
    unwanted = set() if is_missing else set(str(cleaned_words).split())

    return ' '.join(word for word in text.split() if word not in unwanted)

# For each text column, strip row by row the words recorded in the matching
# '<column>_cleaned' field ('user' first, then 'assistant').
for column in ('user', 'assistant'):
    df[column] = df.apply(
        lambda record, source=column: remove_cleaned_words(record[source], record[f'{source}_cleaned']),
        axis=1,
    )

# Save the stripped dataset without the index column.
df.to_csv('your_file_cleaned.csv', index=False)

print("Processing completed and file saved as 'your_file_cleaned.csv'.")


Cross-checking noise

In [None]:
import pandas as pd
from transformers import pipeline

# NOTE(review): this loads the same file as the earlier noise check, not the output of the
# word-removal cells — confirm which file the cross-check is meant to inspect.
file_path = 'total_2793_dataset_processed.csv'
dataset = pd.read_csv(file_path)

# Text-classification pipeline used as the noise detector; identify_noise() checks its
# 'NEGATIVE' label and score.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def identify_noise(text, classifier):
    """Report whether ``text`` is confidently classified as negative ("noise").

    Args:
        text: The text to classify. Values that are not strings, and strings
            containing only whitespace, are reported as not noisy without
            calling the classifier (empty CSV cells are read back as NaN).
        classifier: A callable behaving like a transformers
            text-classification pipeline; its first prediction must provide
            'label' and 'score'.

    Returns:
        True when the first prediction is 'NEGATIVE' with a score above 0.8,
        otherwise False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    # Ask the pipeline to truncate inputs that exceed the model's input limit
    # instead of raising on them.
    result = classifier(text, truncation=True)[0]
    return result['label'] == 'NEGATIVE' and result['score'] > 0.8

# Gather the index label of each row where at least one of the two texts is flagged.
noisy_rows = []

for row_label, record in dataset.iterrows():
    # Classify both sides of the exchange before deciding, so both are always evaluated.
    user_flag = identify_noise(record['user'], classifier)
    assistant_flag = identify_noise(record['assistant'], classifier)
    if not (user_flag or assistant_flag):
        continue
    noisy_rows.append(row_label)

if not noisy_rows:
    print("No noise detected in the dataset.")
else:
    print(f"Noisy rows found at indices: {noisy_rows}")


Removing only the noise words, without dropping rows

In [None]:
import pandas as pd
from transformers import pipeline

# Input: the file written by the cell that strips the '*_cleaned' words.
file_path = 'your_file_cleaned.csv'
dataset = pd.read_csv(file_path)

# Text-classification pipeline applied to individual words by identify_noise() below.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def identify_noise(word, classifier):
    """Tell whether the classifier's first prediction for ``word`` is 'NEGATIVE'
    with a score strictly above 0.8."""
    top_prediction = classifier(word)[0]
    if top_prediction['label'] != 'NEGATIVE':
        return False
    return top_prediction['score'] > 0.8

def clean_sentence(sentence, classifier):
    """Drop the words of ``sentence`` that identify_noise() flags.

    Non-string values are handed back untouched. For strings, the
    whitespace-separated words that are not flagged are re-joined with single
    spaces.
    """
    if isinstance(sentence, str):
        kept_words = []
        for token in sentence.split():
            if identify_noise(token, classifier):
                continue
            kept_words.append(token)
        return ' '.join(kept_words)

    return sentence

# Filter the flagged words out of both text columns ('user' first, then 'assistant').
for column in ('user', 'assistant'):
    dataset[column] = dataset[column].apply(clean_sentence, args=(classifier,))

# Write the filtered dataset to its own file, without the index column.
cleaned_file_path = 'your_file_cleaned_no_noise.csv'
dataset.to_csv(cleaned_file_path, index=False)

print(f"Dataset cleaned. Cleaned dataset saved to {cleaned_file_path}.")



In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('your_file_cleaned_no_noise.csv')

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

# Display dataset information (columns, data types, non-null counts).
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() added a stray "None" line to the output.
print("\nDataset Info:")
df.info()

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe(include='all'))

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


First 5 rows of the dataset:
                                                user  \
0       product inventory operation attribute reason   
1           attribute mef standard product inventory   
2  table product inventory attribute mef name des...   
3  id unique identifier organization customer tra...   
4           detail requirement product inventory api   

                                           assistant  is_liked is_duplicated  \
0  correct mention product inventory operation at...     False            no   
1  mef metro ethernet forum standard organization...     False            no   
2  table product inventory attribute mentioned me...     False            no   
3  previous complete table attribute table mef at...      True            no   
4  mef standard provides set requirement guidelin...     False            no   

  user_cleaned assistant_cleaned  
0      omitted         apologize  
1         main  introductionnthe  
2       create            sample  
3        buye

In [2]:
import pandas as pd

# Read the noise-filtered dataset back from disk.
df = pd.read_csv('your_file_cleaned_no_noise.csv')

# Keep only fully populated rows: a row is discarded as soon as any of its columns is missing.
df_cleaned = df.dropna(axis=0, how='any')

# Write the complete rows to a new CSV file, leaving out the index column.
df_cleaned.to_csv(path_or_buf='your_file_cleaned_no_missing1.csv', index=False)


In [1]:
!pip install transformers datasets torch




Generation

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

# Read the cleaned conversations and expose them under the column names that
# preprocess_function() expects, discarding rows that lack either text.
df = pd.read_csv('your_file_cleaned_no_missing1.csv')
df = df.rename(columns={'user': 'input_text', 'assistant': 'target_text'})
df = df.dropna(subset=['input_text', 'target_text'])

# Convert to a datasets.Dataset and hold out 20% of the rows for evaluation.
dataset = Dataset.from_pandas(df)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset, eval_dataset = (train_test_split[part] for part in ('train', 'test'))

# The tokenizer and the model are loaded from the same 't5-small' checkpoint.
checkpoint_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

def preprocess_function(examples):
    """Tokenize a batch of input/target texts for sequence-to-sequence training.

    Args:
        examples: A batch mapping with 'input_text' and 'target_text' lists.

    Returns:
        The tokenizer output for the inputs, extended with a 'labels' entry
        holding the target token ids. Inputs and targets are truncated and
        padded to 512 tokens; padding positions in the labels are set to -100.
    """
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')

    # Targets are padded to a fixed length, so most label positions are
    # padding. Replace the pad id with -100, the index the loss ignores;
    # otherwise the model is trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Tokenize both splits in batches.
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The Trainer below gets no compute_metrics function, so evaluation only
    # reports the loss. Selecting the best checkpoint by 'accuracy' made the
    # Trainer fail when it looked up a metric that is never produced; select
    # by evaluation loss instead (lower is better).
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()

metrics = trainer.evaluate()
print(metrics)

# Save the fine-tuned weights together with the tokenizer.
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

# Smoke test: generate an answer for one sample question.
question = "Can you explain the rationale behind leaving out the Product Inventory Operation Attributes?"
# Move the inputs to the device that holds the model; after training the model
# may sit on an accelerator while freshly encoded tensors are created on the CPU.
input_ids = tokenizer.encode(question, return_tensors='pt').to(model.device)
outputs = model.generate(input_ids)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Response:", response)

Map:   0%|          | 0/2214 [00:00<?, ? examples/s]



Map:   0%|          | 0/554 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss


If an immediate result is expected: shorter evaluation time


Classification

In [None]:
!pip install transformers pandas scikit-learn datasets

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset

file_path = 'your_file_cleaned_no_noise.csv'
data = pd.read_csv(file_path)

print(data.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() added a stray "None" line to the output.
data.info()

# Collapse rows that share the same assistant text and like flag, joining their
# user texts with spaces.
aggregated_data = data.groupby(['assistant', 'is_liked'])['user'].apply(lambda x: ' '.join(x.astype(str))).reset_index()

# Baseline: a text-classification pipeline built from the base checkpoint named below.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

def evaluate_model(data, classifier, label_column='is_liked', text_column='assistant'):
    """Measure how often the classifier's prediction matches the like flag.

    Args:
        data: DataFrame holding the texts and labels; rows with a missing
            label are ignored.
        classifier: A callable behaving like a transformers
            text-classification pipeline. A prediction labelled 'LABEL_1'
            counts as class 1, anything else as class 0.
        label_column: Name of the label column.
        text_column: Name of the text column.

    Returns:
        The accuracy computed by ``accuracy_score``.
    """
    data = data.dropna(subset=[label_column])
    texts = data[text_column].tolist()
    # The label column can hold real booleans (the notebook's sample output
    # shows True/False values), for which `x == 'True'` is never true, so every
    # row used to be labelled 0. Comparing the normalised string form accepts
    # booleans, 'True'/'true' strings and 1/0 alike.
    labels = [
        int(str(value).strip().lower() in ('true', '1', '1.0'))
        for value in data[label_column]
    ]

    predictions = []
    for text in texts:
        # truncation=True keeps texts longer than the model's input limit from raising.
        result = classifier(text, truncation=True)
        predictions.append(1 if result[0]['label'] == 'LABEL_1' else 0)

    accuracy = accuracy_score(labels, predictions)
    return accuracy

# Score the pipeline built from `model_name` above on the aggregated data.
pretrained_accuracy = evaluate_model(aggregated_data, classifier, label_column='is_liked', text_column='assistant')
print(f"Pretrained Model Accuracy: {pretrained_accuracy}")

# Hold out 20% of the rows that have a label; random_state fixes the split.
train_data, test_data = train_test_split(aggregated_data.dropna(subset=['is_liked']), test_size=0.2, random_state=42)

def train_custom_model(train_data, model_name="bert-base-uncased"):
    """Fine-tune a two-class sequence classifier on the assistant texts.

    Args:
        train_data: DataFrame with an 'assistant' text column and an
            'is_liked' label column.
        model_name: Checkpoint to start from.

    Returns:
        The fine-tuned model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_texts = train_data['assistant'].tolist()
    # The label column can hold real booleans, for which `x == 'True'` is never
    # true — every training label used to be 0. Comparing the normalised string
    # form accepts booleans, 'True'/'true' strings and 1/0 alike.
    train_labels = [
        int(str(value).strip().lower() in ('true', '1', '1.0'))
        for value in train_data['is_liked']
    ]
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': train_labels})

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    return model

# Fine-tune on the training split.
custom_model = train_custom_model(train_data)

# Wrap the fine-tuned model in a pipeline (reusing the module-level tokenizer) and score it
# on the held-out split.
custom_classifier = pipeline("text-classification", model=custom_model, tokenizer=tokenizer)
custom_model_accuracy = evaluate_model(test_data, custom_classifier, label_column='is_liked', text_column='assistant')
print(f"Custom Model Accuracy: {custom_model_accuracy}")

def minimum_data_for_accuracy(data, classifier, target_accuracy=0.8, label_column='is_liked', text_column='assistant'):
    """Find the smallest sample size whose evaluated accuracy reaches the target.

    Samples of size 1, 2, ... len(data) are drawn with a fixed random_state
    and scored with evaluate_model(). Every size triggers a full evaluation of
    that sample, so the total work grows quadratically with len(data).

    Args:
        data: DataFrame to sample from.
        classifier: Classifier handed on to evaluate_model().
        target_accuracy: Accuracy at which the search stops.
        label_column: Name of the label column.
        text_column: Name of the text column.

    Returns:
        ``(sample_size, accuracy)`` for the first sample that reaches the
        target; otherwise ``(len(data), accuracy_of_the_last_sample)``. For
        empty ``data`` the result is ``(0, 0.0)``.
    """
    # Defined up front so empty data no longer fails with an unbound local at
    # the final return.
    accuracy = 0.0
    for i in range(1, len(data) + 1):
        subset = data.sample(n=i, random_state=42)
        accuracy = evaluate_model(subset, classifier, label_column=label_column, text_column=text_column)
        if accuracy >= target_accuracy:
            return i, accuracy
    return len(data), accuracy

# Search the training split for the smallest sample that reaches the default target accuracy.
min_data_needed, min_data_accuracy = minimum_data_for_accuracy(train_data, custom_classifier)

# Collect the headline numbers in one mapping, in reporting order.
results = dict(zip(
    (
        'Pretrained Model Accuracy',
        'Custom Model Accuracy',
        'Minimum Data Needed for Target Accuracy',
        'Achieved Accuracy with Minimum Data',
    ),
    (
        pretrained_accuracy,
        custom_model_accuracy,
        min_data_needed,
        min_data_accuracy,
    ),
))

print(results)
