In [5]:
import os

In [6]:
%pwd

'c:\\Users\\91909\\Documents\\Term 3\\Machine Learning\\assignment\\senti\\Sentiment-Analysis-App\\research'

In [7]:
# The notebook starts inside the project's `research/` folder (see the %pwd
# output above). Step up to the project root only while still in that folder:
# the guard keeps this cell idempotent, so re-running it does not climb one
# directory further and break every relative path used below.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [8]:
%pwd

'c:\\Users\\91909\\Documents\\Term 3\\Machine Learning\\assignment\\senti\\Sentiment-Analysis-App'

In [9]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Stage directory; ConfigurationManager.get_data_transformation_config
    # passes it to create_directories before building this object.
    root_dir: Path
    # CSV file read by DataTransformation.add_aspects_based_sentiments.
    data_path: Path
    # CSV file written by DataTransformation.add_aspects_based_sentiments.
    data_out_path: Path


In [10]:
from sentimentAnalysisApp.constants import *
from sentimentAnalysisApp.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects.

    The configuration file is parsed with ``read_yaml`` and the directory named
    by its ``artifacts_root`` entry is created through ``create_directories``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the configuration file and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration file.
            params_filepath: Location of the parameters file. Kept for
                interface compatibility; it is currently not read.
        """
        self.config = read_yaml(config_filepath)
        # The parameters file is not loaded at the moment (read disabled below).
        # self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are ``pathlib.Path``
            objects.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        # NOTE(review): the raw config values are presumably plain strings from
        # the YAML file; they are wrapped in Path so the fields really hold the
        # type that DataTransformationConfig declares.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
            data_out_path=Path(stage.data_out_path),
        )


In [12]:
import os
from sentimentAnalysisApp.logging import logger
from transformers import BertTokenizer

# Assuming you've saved the tokenizer to a path specified by `save_path`
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


[2023-08-07 06:02:12,075: INFO: utils: NumExpr defaulting to 8 threads.]


#### Let's check if the conversion works

In [13]:
import pandas as pd

# Try the conversion on the first review only.
df = pd.read_csv('data/McDonald_s_Reviews.csv').iloc[:1]

# Label vocabulary: every aspect is paired with every sentiment.
aspects = ['price', 'anecdotes', 'food', 'ambience', 'service']
sentiments = ['positive', 'neutral', 'negative', 'conflict', 'none']

# "<aspect> - <sentiment>" labels, with the aspect varying slowest.
combinations = [
    f"{aspect} - {sentiment}"
    for aspect in aspects
    for sentiment in sentiments
]

# Attach the full label list to every review, unfold it into one row per
# label, and renumber the rows 0..N-1.
df = (
    df.assign(combinations=[combinations] * len(df))
      .explode('combinations')
      .reset_index(drop=True)
)

df

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,combinations
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,price - positive
1,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,price - neutral
2,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,price - negative
3,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,price - conflict
4,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,price - none
5,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,anecdotes - positive
6,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,anecdotes - neutral
7,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,anecdotes - negative
8,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,anecdotes - conflict
9,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,anecdotes - none


#### Yes, it does: the single review was expanded into 25 rows, one per aspect–sentiment combination, with the label stored in a new `combinations` column. Let's add this functionality to the pipeline.

In [14]:
class DataTransformation:
    """Expand reviews into one row per aspect-sentiment combination."""

    # Label vocabulary; every aspect is paired with every sentiment.
    ASPECTS = ('price', 'anecdotes', 'food', 'ambience', 'service')
    SENTIMENTS = ('positive', 'neutral', 'negative', 'conflict', 'none')

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage settings; ``data_path`` and ``data_out_path`` are used.

        The annotation is a forward reference so that defining this class does
        not depend on the config class having been evaluated first.
        """
        self.config = config

    def add_aspects_based_sentiments(self, n=None):
        """Write the first ``n`` reviews repeated once per label combination.

        Reads the CSV at ``config.data_path``, keeps its first ``n`` rows, adds
        a ``combinations`` column holding every "<aspect> - <sentiment>" label
        (one output row per label, aspects varying slowest) and saves the
        result, without the index, to ``config.data_out_path``.

        Args:
            n: Number of leading rows of the input CSV to process. ``None``
                (the default) processes every row.
        """
        df = pd.read_csv(self.config.data_path)[:n]

        combinations = [
            f"{aspect} - {sentiment}"
            for aspect in self.ASPECTS
            for sentiment in self.SENTIMENTS
        ]

        # assign() works on a copy, so the row slice taken above is never
        # written to. No index reset is needed: the index is not saved.
        expanded = (
            df.assign(combinations=[combinations] * len(df))
              .explode('combinations')
        )

        # Make sure the destination folder exists before writing.
        out_path = Path(self.config.data_out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        expanded.to_csv(out_path, index=False)
        

In [15]:
# Run the transformation stage end to end on the first review only.
# Errors are left to propagate as-is: catching an exception just to re-raise
# it unchanged adds nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.add_aspects_based_sentiments(1)

[2023-08-07 06:02:17,623: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-07 06:02:17,632: INFO: common: created directory at: artifacts]
[2023-08-07 06:02:17,634: INFO: common: created directory at: artifacts/data_transformation]


In [17]:
import pandas as pd
# Read the expanded CSV back in to inspect it.
# NOTE(review): presumably this is the file written through config.data_out_path
# by the pipeline cell — confirm against config.yaml.
df = pd.read_csv('artifacts/McDonald_s_Reviews_ABS.csv')

In [18]:
# Expect 25 rows: one review (n=1) x 5 aspects x 5 sentiments.
df.shape

(25, 11)

In [19]:
# With a single source review, each aspect-sentiment label should occur exactly once.
df.combinations.value_counts()

combinations
price - positive        1
food - conflict         1
service - conflict      1
service - negative      1
service - neutral       1
service - positive      1
ambience - none         1
ambience - conflict     1
ambience - negative     1
ambience - neutral      1
ambience - positive     1
food - none             1
food - negative         1
price - neutral         1
food - neutral          1
food - positive         1
anecdotes - none        1
anecdotes - conflict    1
anecdotes - negative    1
anecdotes - neutral     1
anecdotes - positive    1
price - none            1
price - conflict        1
price - negative        1
service - none          1
Name: count, dtype: int64