<a href="https://colab.research.google.com/github/raghavlaad-89/AARSAAR/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at the given directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import zipfile
import os

# Source archive (under the Drive mount) and the local directory it is unpacked into
zip_path = '/content/drive/MyDrive/Colab Notebooks/Problem Statement 2/dataset.zip'
extract_dir = '/content/dataset/'

# Extract every member of the archive into extract_dir.
# ZipFile opens in read mode by default; the context manager closes it afterwards.
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_dir)


In [None]:
# Import necessary libraries
import pandas as pd
import json
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from textblob import TextBlob
from transformers import pipeline
import seaborn as sns
import matplotlib.pyplot as plt

# Load the datasets
def load_json_file(file_path, nrows=None):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the JSON Lines file to read.
        nrows: Optional maximum number of lines to read. ``None`` (the default)
            reads the entire file, which is the original behaviour. Pass a
            number to sample the head of a file that is too large to load in
            full.

    Returns:
        pandas.DataFrame with one row per JSON line that was read.
    """
    return pd.read_json(file_path, lines=True, nrows=nrows)

# Read each extracted dataset file into its own DataFrame.
# The targets on the left line up one-to-one, in order, with the paths below.
business_df, checkin_df, review_df, tip_df, user_df = map(
    load_json_file,
    (
        '/content/dataset/dataset/yelp_academic_dataset_business.json',
        '/content/dataset/dataset/yelp_academic_dataset_checkin.json',
        '/content/dataset/dataset/yelp_academic_dataset_review.json',
        '/content/dataset/dataset/yelp_academic_dataset_tip.json',
        '/content/dataset/dataset/yelp_academic_dataset_user.json',
    ),
)

In [None]:



# Data Preprocessing
def preprocess_data(review_df, business_df):
    """Join reviews with their business name/categories and lowercase the review text.

    Args:
        review_df: DataFrame with at least ``business_id``, ``stars`` and ``text``.
        business_df: DataFrame with at least ``business_id``, ``name`` and
            ``categories``.

    Returns:
        A new DataFrame with columns ``business_id``, ``name``, ``categories``,
        ``stars`` and ``text``, one row per review. Reviews whose business is
        not present in ``business_df`` are kept (left join) with missing
        ``name``/``categories``. Neither input frame is modified.
    """
    business_cols = business_df[['business_id', 'name', 'categories']]
    merged = review_df.merge(business_cols, on='business_id', how='left')

    # Vectorised lowercasing. Missing review text becomes '' and non-string values are
    # stringified, instead of raising AttributeError from calling .lower() on None/NaN.
    merged['text'] = merged['text'].fillna('').astype(str).str.lower()

    # Return an explicit copy so callers can add columns without writing to a slice.
    return merged[['business_id', 'name', 'categories', 'stars', 'text']].copy()

# Working table: one row per review, produced by preprocess_data
df = preprocess_data(review_df=review_df, business_df=business_df)

# spaCy pipeline whose noun chunks are used for aspect extraction
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

# Aspect Extraction using noun phrases
def extract_aspects(review):
    """Return the multi-word noun phrases of a review as a list of strings.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A noun
    chunk is kept only when its text splits into more than one
    whitespace-separated token; results are in document order.
    """
    multiword_phrases = []
    for noun_chunk in nlp(review).noun_chunks:
        phrase = noun_chunk.text
        # Single-word chunks are dropped
        if len(phrase.split()) > 1:
            multiword_phrases.append(phrase)
    return multiword_phrases

# One list of aspect phrases per review (extract_aspects runs once per row)
df['aspects'] = df['text'].map(extract_aspects)

# Sentiment Analysis using TextBlob
def get_sentiment(text):
    """Classify text as 'positive', 'neutral' or 'negative' from its TextBlob polarity.

    Polarity above zero is 'positive', exactly zero is 'neutral', and anything
    else is 'negative'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

# TextBlob-based sentiment label for every review
df['sentiment'] = df['text'].map(get_sentiment)

# Sentiment analysis with a pretrained transformer (optional, requires transformers).
# The checkpoint is named explicitly rather than relying on the library's implicit
# default for 'sentiment-analysis', so results do not change if that default changes
# and the "no model was supplied" warning is avoided.
# NOTE(review): this is the checkpoint the transformers docs list as the current default,
# a DistilBERT model fine-tuned on SST-2 that emits POSITIVE / NEGATIVE only. Confirm
# against the installed transformers version.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
)

def bert_sentiment_analysis(text):
    """Return the lower-cased label the module-level ``classifier`` pipeline assigns to text.

    Args:
        text: The review text to classify.

    Returns:
        The ``label`` field of the first prediction, lower-cased.
    """
    # truncation=True clips the tokenised input to the model's maximum sequence length.
    # Without it, reviews longer than that limit make the model's forward pass raise.
    top_prediction = classifier(text, truncation=True)[0]
    return top_prediction['label'].lower()

# Transformer-based sentiment label for every review (one classifier call per row)
df['bert_sentiment'] = df['text'].map(bert_sentiment_analysis)

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Evaluation Metrics for Aspect Extraction
def calculate_aspect_metrics(true_aspects, predicted_aspects):
    """Score predicted aspects against reference aspects using exact set overlap.

    Both inputs are converted to sets, so duplicates and ordering are ignored.

    Returns:
        A ``(precision, recall, f1)`` tuple. Precision is 0 when nothing was
        predicted, recall is 0 when there are no reference aspects, and F1 is
        0 when precision and recall sum to zero.
    """
    reference = set(true_aspects)
    predicted = set(predicted_aspects)
    n_hits = len(reference.intersection(predicted))

    precision = n_hits / len(predicted) if predicted else 0
    recall = n_hits / len(reference) if reference else 0

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator != 0 else 0
    return precision, recall, f1

# Placeholder reference labels (replace with real annotated aspects)
true_aspects = [
    ['food', 'service'],
    ['location', 'ambiance'],
    ['staff', 'cleanliness'],
]
# Extracted aspects for the first three reviews
pred_aspects = df['aspects'].head(3).tolist()

# NOTE(review): only the first reference/prediction pair is scored here.
precision, recall, f1 = calculate_aspect_metrics(
    true_aspects=true_aspects[0],
    predicted_aspects=pred_aspects[0],
)
print(f"Aspect Extraction - Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Score the TextBlob labels against ground truth derived from the star ratings
def _label_from_stars(stars):
    """Map a star rating to a class: above 3 is positive, below 3 is negative, otherwise neutral."""
    if stars > 3:
        return 'positive'
    if stars < 3:
        return 'negative'
    return 'neutral'


y_true = test_df['stars'].apply(_label_from_stars)
y_pred = test_df['sentiment']

# Keyword arguments shared by the support-weighted precision / recall / F1 calls
weighted_kwargs = {'average': 'weighted', 'labels': ['positive', 'neutral', 'negative']}

accuracy = accuracy_score(y_true, y_pred)
precision_sentiment = precision_score(y_true, y_pred, **weighted_kwargs)
recall_sentiment = recall_score(y_true, y_pred, **weighted_kwargs)
f1_sentiment = f1_score(y_true, y_pred, **weighted_kwargs)

print(f"Sentiment Classification - Accuracy: {accuracy:.2f}, Precision: {precision_sentiment:.2f}, Recall: {recall_sentiment:.2f}, F1 Score: {f1_sentiment:.2f}")

# Confusion matrix of the TextBlob predictions (rows: true class, columns: predicted class)
class_order = ['positive', 'neutral', 'negative']
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_order)

# heatmap returns the Axes it drew on; title and axis labels are set on it directly
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_order,
    yticklabels=class_order,
)
ax.set_title('Confusion Matrix for Sentiment Analysis')
ax.set_xlabel('Predicted Sentiment')
ax.set_ylabel('True Sentiment')
plt.show()

# Evaluate the transformer predictions against the same star-derived labels (y_true).
bert_y_pred = test_df['bert_sentiment']
bert_accuracy = accuracy_score(y_true, bert_y_pred)
# zero_division=0 keeps a class that is never predicted at a score of 0 without
# emitting an undefined-metric warning.
bert_f1_sentiment = f1_score(
    y_true,
    bert_y_pred,
    average='weighted',
    labels=['positive', 'neutral', 'negative'],
    zero_division=0,
)

print(f"BERT Sentiment Classification - Accuracy: {bert_accuracy:.2f}, F1 Score: {bert_f1_sentiment:.2f}")

# NOTE(review): y_true has three classes, but a two-class sentiment checkpoint never emits
# 'neutral', so every 3-star review counts as an error in the numbers above. Confirm which
# labels the loaded model produces. As a supplementary view, the predictions are also
# scored on reviews whose true label is positive or negative.
polar_mask = y_true != 'neutral'
if polar_mask.any():
    bert_polar_accuracy = accuracy_score(y_true[polar_mask], bert_y_pred[polar_mask])
    bert_polar_f1 = f1_score(
        y_true[polar_mask],
        bert_y_pred[polar_mask],
        average='weighted',
        labels=['positive', 'negative'],
        zero_division=0,
    )
    print(f"BERT Sentiment Classification (non-neutral reviews only) - Accuracy: {bert_polar_accuracy:.2f}, F1 Score: {bert_polar_f1:.2f}")