<a href="https://colab.research.google.com/github/ruwanwija/Research-Models/blob/main/Industry_research_sample_dataset_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the raw reviews with the Python parsing engine; rows that cannot be
# parsed are skipped (on_bad_lines='skip') instead of aborting the load.
df = pd.read_csv('Sample dataset II.csv', engine='python', on_bad_lines='skip')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,reviews.text
0,Pleasant 10 min walk along the sea front to th...
1,Really lovely hotel. Stayed on the very top fl...
2,Ett mycket bra hotell. Det som drog ner betyge...
3,We stayed here for four nights in October. The...
4,We stayed here for four nights in October. The...


# Preprocess


In [None]:
# Lazily built NLTK resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the English stop-word set and a lemmatizer, creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        # stopwords.words() produces a new list on every call, so look it up once
        # and keep it as a set for constant-time membership tests.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized, English stop words are
    dropped and the remaining tokens are lemmatized.

    Args:
        text: Raw review text. Non-string values are accepted.

    Returns:
        The cleaned tokens joined by single spaces, or '' when `text` is not a
        string.
    """
    # Non-string cells are mapped to an empty document instead of raising.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# Store the cleaned version of every review in a new column; the raw
# 'reviews.text' column is left untouched.
df['preprocessed_text'] = df['reviews.text'].apply(preprocess_text)

# Label


In [None]:
# Keywords that trigger each label, listed in the order labels are emitted.
# 'service' appears under both 'Room Services' and 'Staff Behavior'.
_LABEL_KEYWORDS = (
    ('Room Services', ('room', 'service', 'housekeeping')),
    ('Location', ('location', 'place', 'area', 'neighborhood')),
    ('Food Quality', ('food', 'meal', 'breakfast', 'dinner', 'lunch', 'taste', 'delicious')),
    ('Value for Money', ('value', 'price', 'worth', 'cost', 'affordable')),
    ('Comfort', ('comfort', 'comfortable', 'bed', 'sleep', 'relax')),
    ('Staff Behavior', ('staff', 'employee', 'service', 'friendly', 'helpful')),
)


def label_review(text):
    """Assign topic labels to a preprocessed review by keyword lookup.

    Matching is substring-based: a keyword counts as present when it occurs
    anywhere in `text`, including inside a longer word (e.g. 'bed' matches
    'bedroom').

    Args:
        text: Preprocessed (lower-cased) review text.

    Returns:
        List of matching label names in the fixed order of `_LABEL_KEYWORDS`;
        empty when nothing matches or when `text` is not a string.
    """
    # Mirror preprocess_text: non-string cells get no labels instead of raising
    # a TypeError from the `in` test.
    if not isinstance(text, str):
        return []
    return [
        label
        for label, keywords in _LABEL_KEYWORDS
        if any(keyword in text for keyword in keywords)
    ]

# Each row gets a list of label names (possibly empty) derived from its cleaned text.
df['labels'] = df['preprocessed_text'].apply(label_review)

In [None]:
# Save the labelled frame before any filtering, so rows with an empty label
# list are still included in this file.
df.to_csv('labeled_dataset.csv', index=False)

In [None]:
# 'labels' holds Python lists at this point, so .str.len() is the number of
# labels per row; keep only rows that matched at least one label.
# Note: this rebinds `df`, so the unfiltered frame is no longer available.
df = df[df['labels'].str.len() > 0]

In [None]:
# Save the rows that carry at least one label.
df.to_csv('labeled_dataset_no_unlabeled.csv', index=False)

In [None]:
 # Reload from disk. After the CSV round-trip the 'labels' column contains
 # strings (the text form of each list), not Python lists.
 df = pd.read_csv('labeled_dataset_no_unlabeled.csv')

In [None]:
# Keep only rows with exactly one label. Each 'labels' entry is the text form of
# a list and no label name contains a comma, so splitting on ',' yields one
# piece per label.
filtered_df = df[df['labels'].apply(lambda x: len(x.split(',')) == 1)]


In [None]:
# Save the single-label subset.
filtered_df.to_csv('single_label_dataset.csv', index=False)

# Download Labeled Dataset

In [None]:
import ast

df = pd.read_csv('labeled_dataset_no_unlabeled.csv')

# The CSV round-trip stored every label list as text (e.g. "['Location', 'Comfort']").
# Parse it back into a real list first: MultiLabelBinarizer iterates over each
# entry, so a raw string would be binarized character by character instead of
# label by label.
label_lists = df['labels'].apply(ast.literal_eval)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(label_lists)

# First hold out 30% as the test set, then carve 20% of the remainder off as the
# validation set. Both splits are stratified on the binarized label rows.
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], y, test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


def _join_labels(indicator_rows):
    """Turn binarized label rows back into 'label1,label2' strings for CSV storage."""
    return [','.join(names) for names in mlb.inverse_transform(indicator_rows)]


# Labels are written comma-separated so that a later `.str.split(',')` on the
# 'labels' column recovers the exact label names.
train_df = pd.DataFrame({'text': X_train, 'labels': _join_labels(y_train)})
val_df = pd.DataFrame({'text': X_val, 'labels': _join_labels(y_val)})
test_df = pd.DataFrame({'text': X_test, 'labels': _join_labels(y_test)})

train_df.to_csv('train_dataset.csv', index=False)
val_df.to_csv('val_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)

# Build the model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import ast


def _parse_label_cell(cell):
    """Convert one stored 'labels' CSV value into a list of label names.

    Accepts a Python tuple/list literal such as "('Comfort', 'Location')" as
    well as a plain comma-separated string such as "Comfort,Location".
    Non-string values (e.g. NaN from an empty cell) yield an empty list.
    """
    if not isinstance(cell, str):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        # Not a Python literal: treat the cell as comma-separated label names.
        return [part.strip() for part in cell.split(',') if part.strip()]
    if isinstance(parsed, (list, tuple, set)):
        return [str(item) for item in parsed]
    return [str(parsed)]


# Load the training and testing data
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training text only, then reuse it for the test text
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Use MultiLabelBinarizer to handle multi-label data
mlb = MultiLabelBinarizer()

# Parse the stored labels into lists before binarizing. A bare split on ','
# would leave quotes and parentheses inside the label names whenever the CSV
# holds tuple/list literals.
y_train = mlb.fit_transform(train_df['labels'].apply(_parse_label_cell))
y_test = mlb.transform(test_df['labels'].apply(_parse_label_cell))

# Train one linear SVM per label (one-vs-rest)
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test)

# Evaluate the model. For multi-label targets accuracy_score counts a row as
# correct only when every label matches; the F1 score is micro-averaged.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')



Accuracy: 0.5866613418530351
F1 Score: 0.9740778024283532


In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import files

In [None]:
# Load the pre-trained 'bert-base-uncased' tokenizer and encoder.
# Both are kept as notebook-level names and are read by get_sentence_embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model there. As the last expression of the cell, `model.to(device)` also
# echoes the model architecture in the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Function to get embeddings for a list of sentences in batches
def get_sentence_embeddings(sentences, batch_size=32):
    """Embed sentences with the notebook-level BERT `model`, one batch at a time.

    Args:
        sentences: Sequence of texts. Entries that are not strings (for example
            NaN from a missing review) are embedded as the empty string.
        batch_size: Number of sentences per forward pass.

    Returns:
        np.ndarray with one row per sentence, holding the final-layer hidden
        state of the first ([CLS]) token. This is the raw hidden state, not
        the model's separate pooled output.
    """
    # Materialise once as a plain list of str so every slice is a clean batch.
    sentences = [s if isinstance(s, str) else '' for s in sentences]
    if not sentences:
        # np.vstack rejects an empty list; return an empty matrix of matching width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Generating embeddings"):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to the same device as the model
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        # Hidden state of the first token of every sequence in the batch
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [None]:
# Pairwise cosine similarity between two embedding matrices
def calculate_similarity(embeddings1, embeddings2):
    """Return the cosine-similarity matrix of `embeddings1` rows against `embeddings2` rows."""
    return cosine_similarity(embeddings1, embeddings2)

In [None]:
# The six candidate labels. Their text is embedded with BERT and compared with
# each review; the same strings are reused as column names of the similarity table.
label_dataset = [
    "Room Services",
    "Location",
    "Food Quality",
    "Value for Money",
    "Comfort",
    "Staff Behavior"
]

In [None]:
# Load test dataset (replace 'Sample dataset II.csv' with your actual file path)
test_dataset = pd.read_csv('Sample dataset II.csv')

In [None]:
# Extract the review texts as a plain Python list.
# NOTE(review): entries may be non-strings if a review cell is empty — confirm.
test_sentences = test_dataset['reviews.text'].tolist()

In [None]:
# Embed the label names themselves; all six fit into a single batch.
print("Generating embeddings for labels...")
label_embeddings = get_sentence_embeddings(label_dataset, batch_size=6)  # Small batch for labels

Generating embeddings for labels...


Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 16.20it/s]


In [None]:
# Embed every review text, one row per review.
print("Generating embeddings for test sentences...")
test_embeddings = get_sentence_embeddings(test_sentences, batch_size=32)  # Larger batch for test data

Generating embeddings for test sentences...


Generating embeddings: 100%|██████████| 32/32 [00:56<00:00,  1.76s/it]


In [None]:
# Compute cosine similarities: rows are reviews, columns are labels
# (same order as label_dataset).
print("Calculating cosine similarities...")
similarity_matrix = calculate_similarity(test_embeddings, label_embeddings)

Calculating cosine similarities...


In [None]:
# Pick, per review, the column (label) with the largest similarity score
highest_indices = np.argmax(similarity_matrix, axis=1)
highest_factors = [label_dataset[label_pos] for label_pos in highest_indices]

In [None]:
# Convert the similarity matrix to a DataFrame with one column per label
similarity_df = pd.DataFrame(similarity_matrix, columns=label_dataset)

In [None]:
# Add the best-matching label name for each test sentence
similarity_df['Highest Factor'] = highest_factors

In [None]:
# Add the original test sentences so each row can be read on its own
similarity_df['Test Sentence'] = test_sentences

In [None]:
# Reorder columns: Test Sentence first, then the six label scores, then Highest Factor
cols = ['Test Sentence'] + label_dataset + ['Highest Factor']
similarity_df = similarity_df[cols]

In [None]:
# Display the first rows of the result table as plain text
print(similarity_df.head())

                                       Test Sentence  Room Services  Location  \
0  Pleasant 10 min walk along the sea front to th...       0.715450  0.712518   
1  Really lovely hotel. Stayed on the very top fl...       0.689907  0.669377   
2  Ett mycket bra hotell. Det som drog ner betyge...       0.634343  0.653558   
3  We stayed here for four nights in October. The...       0.731409  0.708754   
4  We stayed here for four nights in October. The...       0.731409  0.708754   

   Food Quality  Value for Money   Comfort  Staff Behavior   Highest Factor  
0      0.742478         0.726528  0.727675        0.718423     Food Quality  
1      0.707569         0.664785  0.662232        0.673135     Food Quality  
2      0.645573         0.653573  0.645351        0.648793  Value for Money  
3      0.741548         0.695588  0.718235        0.697430     Food Quality  
4      0.741548         0.695588  0.718235        0.697430     Food Quality  


In [None]:
# Save the DataFrame to CSV (without the index column)
output_csv = 'similarity_results_with_highest_factors.csv'
similarity_df.to_csv(output_csv, index=False)

# If using Google Colab, download the CSV file.
# `files` comes from google.colab, so this line only works inside Colab.
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [None]:
# Reload the raw reviews. This rebinds `df`, replacing the frame used earlier
# in the notebook.
df = pd.read_csv('Sample dataset II.csv')

In [None]:
# The six class names the placeholder targets are drawn from.
labels = [
    'Room Services', 'Location', 'Food Quality', 'Value for Money', 'Comfort', 'Staff Behavior'
]

In [None]:
# TF-IDF features over the raw review text, capped at 1000 terms.
# Note: the vectorizer is fitted on all rows, i.e. before the train/test split below.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['reviews.text'])

In [None]:
# Placeholder targets: every review gets a label drawn at random from `labels`,
# so a model trained on `y` cannot be expected to beat chance.
# A seeded generator keeps the draw identical across runs.
rng = np.random.default_rng(42)
y = rng.choice(labels, size=len(df))

In [None]:
# Map the label names to integer codes; `le` is kept to translate codes back to names.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Linear-kernel SVM with probability estimates enabled.
# random_state pins the internal shuffling used for the probability estimates,
# so predict_proba gives the same numbers on every run.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the BERT
# encoder; get_sentence_embeddings must not be called after this cell.
model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Class-probability estimates for the held-out rows (one column per class).
probabilities = model.predict_proba(X_test)

In [None]:
# Column position of the most probable class for every test row.
highest_prob_indices = np.argmax(probabilities, axis=1)
# predict_proba columns follow model.classes_, so translate column positions to
# the encoded class values first; the two only coincide when every class
# occurred in the training split.
predicted_labels = le.inverse_transform(model.classes_[highest_prob_indices])

In [None]:
# Compare predicted and true label names. The score is stored in `accuracy`
# but not displayed by this cell.
true_labels = le.inverse_transform(y_test)
accuracy = accuracy_score(true_labels, predicted_labels)

# 2024-10-21

In [None]:
pip install scikit-learn pandas numpy nltk



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
# Download the 'punkt' resource
nltk.download('punkt') # This line is added to download the 'punkt' resource.

# Load the dataset; the cells below read its 'reviews.text' and 'label' columns
data = pd.read_csv('Sample dataset II.csv')
reviews = data['reviews.text']

# Show the actual column names so the selections can be verified
print(data.columns)

# Class labels as a 1-D Series. Selecting with double brackets would produce a
# one-column DataFrame, which LabelEncoder only accepts with a conversion warning.
labels = data['label']

# Preprocess reviews
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case, tokenize and lemmatize one review.

    Only purely alphabetic tokens that are not English stop words are kept.

    Args:
        text: Raw review text. Non-string values are accepted.

    Returns:
        The kept tokens joined by single spaces, or '' when `text` is not a string.
    """
    # A non-string cell has no .lower(); treat it as an empty document.
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Store the cleaned text next to the raw review.
data['processed_review'] = data['reviews.text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Index(['reviews.text', 'label'], dtype='object')


In [None]:
# Sanity check: list the distinct values found in the 'label' column.
print("Unique labels:", data['label'].unique())

Unique labels: [nan]


In [None]:
# Map each distinct label value to an integer code.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)  # Numerical encoding of labels

  y = column_or_1d(y, warn=True)


In [None]:
# Hold out 20% of the cleaned reviews for testing; fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(data['processed_review'], y, test_size=0.2, random_state=42)

In [None]:
# Fit the TF-IDF vocabulary (at most 5000 terms) on the training text only,
# then apply the same transformation to the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Wrap a linear SVM in a probability calibrator and fit it on the TF-IDF features.
# NOTE(review): the recorded run failed here with "The number of classes has to
# be greater than one", and the preceding check printed only NaN as label value;
# the 'label' column presumably needs real labels before this cell can succeed.
svm_model = SVC(kernel='linear', probability=True)  # Use a linear kernel for text classification
clf = CalibratedClassifierCV(svm_model)  # Calibrating for probability estimates
clf.fit(X_train_tfidf, y_train)

ValueError: The number of classes has to be greater than one; got 1 class

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load your dataset
data = pd.read_csv('Sample dataset II.csv')
reviews = data['reviews.text']

# Preprocess reviews
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case, tokenize and lemmatize one review.

    Only purely alphabetic tokens that are not English stop words are kept.
    Returns '' when `text` is not a string (e.g. a missing review).
    """
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

data['processed_review'] = data['reviews.text'].apply(preprocess)

# Labels live in the single 'label' column; rows without a label are ignored.
labels = data['label'].dropna()
if labels.empty:
    # Fail with an actionable message instead of an IndexError further down.
    raise ValueError(
        "Column 'label' in 'Sample dataset II.csv' contains no labels; "
        "add labels to the dataset before training."
    )

def _to_label_list(value):
    """Return the labels of one row as a list (strings are split on ',')."""
    if isinstance(value, str):
        return [part.strip() for part in value.split(',') if part.strip()]
    return [value]

# One list of labels per row, as MultiLabelBinarizer requires.
y = [_to_label_list(value) for value in labels]

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

# Use only the reviews that have labels so features and targets stay aligned.
X_labelled = data.loc[labels.index, 'processed_review']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_labelled, y, test_size=0.2, random_state=42)

# Create TF-IDF vectors (vocabulary is learned from the training text only)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Build and train the model (OneVsRest with calibrated SVC)
classifier = OneVsRestClassifier(CalibratedClassifierCV(SVC()))
classifier.fit(X_train_tfidf, y_train)

# Predict probabilities: one row per test review, one column per entry of mlb.classes_
y_pred_probs = classifier.predict_proba(X_test_tfidf)

# Get predicted labels (highest probability for each sample; ties go to the first class)
y_pred = [mlb.classes_[best] for best in np.argmax(y_pred_probs, axis=1)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


IndexError: single positional indexer is out-of-bounds