In [None]:
import re
import string

import numpy as np
import pandas as pd

In [None]:
# Colab-specific step: mount Google Drive at /content/drive, the prefix used by
# every CSV path in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds one review export per service.
REVIEWS_DIR = '/content/drive/MyDrive/AI_Reviews'

# Display name -> CSV file stem. Only ChatGPT's stem differs from its display
# name (the file on disk is "Chatgpt.csv").
_csv_stems = {
    'ChatGPT': 'Chatgpt',
    'Deepseek': 'Deepseek',
    'Gemini': 'Gemini',
    'Claude': 'Claude',
    'Siri': 'Siri',
    'Alexa': 'Alexa',
}

# Service display name -> absolute path of its review CSV.
file_paths = {
    service: f'{REVIEWS_DIR}/{stem}.csv'
    for service, stem in _csv_stems.items()
}

In [None]:
# Research question -> keywords whose presence in a review marks it as relevant
# to that question. Entries such as "discriminat", "customiz" and "personaliz"
# are deliberate stems so that several word endings match.
questions_keywords = {
    "Is it safe to use?": [
        "safe", "security", "privacy", "trust",
        "hack", "personal data", "information", "protect",
    ],
    "How accurate or reliable are the results?": [
        "accurate", "accuracy", "reliable", "reliability",
        "correct", "wrong", "error", "mistake",
        "hallucination", "fact", "false",
    ],
    "How much does it cost?": [
        "cost", "price", "expensive", "cheap",
        "subscription", "free", "money", "value",
        "pay", "purchase", "plan",
    ],
    "Do I need technical expertise to use it?": [
        "technical", "easy to use", "simple", "complex",
        "difficult", "hard", "user-friendly", "intuitive",
        "learning curve", "complicated",
    ],
    "What happens to my data?": [
        "data", "my information", "storage", "delete",
        "save", "collect", "ownership",
    ],
    "Is it ethical and unbiased?": [
        "bias", "biased", "ethical", "unethical",
        "fair", "unfair", "racist", "sexist",
        "discriminat", "political",
    ],
    "How fast are the responses?": [
        "fast", "slow", "speed", "quick",
        "lag", "delay", "wait", "loading",
        "timeout", "responsive",
    ],
    "Is there a limit to usage?": [
        "limit", "cap", "quota", "restriction",
        "message limit", "can't use", "throttle", "premium",
        "paywall",
    ],
    "How customizable is it?": [
        "customiz", "personaliz", "flexible", "control",
        "settings", "options", "tailor", "configure",
    ],
}

In [None]:
# Sentiment lexicon used by the custom scorer: a token found in positive_words
# adds one point, a token found in negative_words subtracts one.
# Every entry is a single word, so a whitespace split rebuilds the lists.
positive_words = (
    "good great excellent awesome perfect love best fast accurate helpful "
    "secure safe easy intuitive reliable amazing fantastic wonderful quick responsive"
).split()
negative_words = (
    "bad terrible poor slow wrong error bug crash hack expensive "
    "difficult complex unreliable useless broken worst horrible awful frustrating inaccurate"
).split()

In [None]:
# Accumulator for the output rows (one dict per sampled review).
# The processing cell below assigns a fresh empty list again before filling it.
master_data_list = []

In [None]:
def calculate_custom_sentiment(text, positive=None, negative=None):
    """
    Score a text against a positive/negative word lexicon.

    The text is lower-cased and split on whitespace. Each token has leading and
    trailing punctuation removed before lookup, so "Great!" and "good." count
    the same as "great" and "good". A token found in the positive lexicon adds
    1; otherwise a token found in the negative lexicon subtracts 1. The net
    score is divided by the number of tokens and multiplied by 10, i.e. it is
    expressed "per 10 words" so reviews of different lengths are comparable.

    Parameters
    ----------
    text : object
        The review text. Anything that is not a ``str`` (e.g. NaN) scores 0.
    positive, negative : collection of str, optional
        Lexicons to score against. When omitted, the module-level
        ``positive_words`` / ``negative_words`` are used.

    Returns
    -------
    int or float
        0 for non-string or empty input, otherwise the normalized score
        rounded to two decimals.
    """
    if not isinstance(text, str):
        return 0

    tokens = text.lower().split()
    if not tokens:
        return 0

    # Fall back to the notebook-wide lexicons only when none were supplied.
    positive = positive_words if positive is None else positive
    negative = negative_words if negative is None else negative

    score = 0
    for token in tokens:
        # Without this strip, "great!" or "bad," would never match the lexicon.
        word = token.strip(string.punctuation)
        if word in positive:
            score += 1
        elif word in negative:
            score -= 1

    # The denominator stays the raw token count, matching the original scaling.
    return round((score / len(tokens)) * 10, 2)


# Start from an empty accumulator so re-running this cell does not duplicate rows.
master_data_list = []

# Each export names its review-text and rating columns differently:
# service -> (text column, rating column).
# The leading space in Alexa's ' score' is kept exactly as in the original lookup.
SERVICE_COLUMNS = {
    'ChatGPT': ('content', 'score'),
    'Deepseek': ('content', 'score'),
    'Gemini': ('content', 'score'),
    'Claude': ('content', 'score'),
    'Siri': ('reviewDescription', 'ratingScore'),
    'Alexa': ('verified_reviews', ' score'),
}

# At most this many matching reviews are kept per (service, question) pair.
MAX_REVIEWS_PER_QUESTION = 100
# Fixed seed so the sampled reviews are the same on every run.
SAMPLE_SEED = 42

# One regex per question, built once instead of once per service.
# - re.escape keeps keywords literal even if one ever contains a regex
#   metacharacter.
# - The leading \b anchors each keyword at the start of a word, so "cap" no
#   longer matches "escape" and "lag" no longer matches "flag". There is no
#   trailing boundary on purpose: stems such as "customiz" or "discriminat"
#   and plurals such as "errors" must still match.
# - The group is non-capturing so pandas does not warn about match groups.
question_patterns = {
    question: r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + ')'
    for question, keywords in questions_keywords.items()
}

for service_name, file_path in file_paths.items():
    # Best-effort per service: a failure is reported and the loop moves on.
    try:
        print(f"Processing {service_name}...")

        df = pd.read_csv(file_path, low_memory=False)

        text_column, rating_column = SERVICE_COLUMNS.get(service_name, (None, None))

        # Validate before touching any column so a missing header produces the
        # explanatory warning below rather than a KeyError.
        if text_column is None or rating_column is None or text_column not in df.columns or rating_column not in df.columns:
            print(f"Warning: Could not find required columns in {service_name}. Skipping.")
            print(f"Columns found: {df.columns.tolist()}")
            continue

        # Missing reviews become empty strings for every service (not only
        # Siri); they match no keyword and score 0.
        df['text_lower'] = df[text_column].fillna('').astype(str).str.lower()
        df['custom_sentiment'] = df['text_lower'].apply(calculate_custom_sentiment)

        for question, pattern in question_patterns.items():
            matched_reviews = df[df['text_lower'].str.contains(pattern, na=False)]
            if matched_reviews.empty:
                continue

            sampled_reviews = matched_reviews.sample(
                n=min(MAX_REVIEWS_PER_QUESTION, len(matched_reviews)),
                random_state=SAMPLE_SEED,
            )

            # Walk only the three columns that are needed instead of
            # materialising a full Series per row with iterrows().
            master_data_list.extend(
                {
                    'Service': service_name,
                    'Question': question,
                    'Primary Data Source': review_text,
                    'Secondary Data Source': rating,
                    'Custom Sentiment Score': sentiment,
                    'Analysis Type': 'Keyword/Text Analysis'
                }
                for review_text, rating, sentiment in zip(
                    sampled_reviews[text_column],
                    sampled_reviews[rating_column],
                    sampled_reviews['custom_sentiment'],
                )
            )

        print(f"Finished processing: {service_name}")

    except Exception as e:
        print(f"An error occurred with {service_name}: {str(e)}")


Processing ChatGPT...
Finished processing: ChatGPT
Processing Deepseek...
Finished processing: Deepseek
Processing Gemini...
Finished processing: Gemini
Processing Claude...
Finished processing: Claude
Processing Siri...
Finished processing: Siri
Processing Alexa...
Finished processing: Alexa


In [None]:
# Build the master table in one step from the accumulated row dicts; the dict
# keys become the column names.
final_master_df = pd.DataFrame(master_data_list)

In [None]:
# Plain-text preview of the assembled table (pandas abbreviates long frames
# and long cell values in this view).
print(final_master_df)

      Service                 Question  \
0     ChatGPT       Is it safe to use?   
1     ChatGPT       Is it safe to use?   
2     ChatGPT       Is it safe to use?   
3     ChatGPT       Is it safe to use?   
4     ChatGPT       Is it safe to use?   
...       ...                      ...   
4864    Alexa  How customizable is it?   
4865    Alexa  How customizable is it?   
4866    Alexa  How customizable is it?   
4867    Alexa  How customizable is it?   
4868    Alexa  How customizable is it?   

                                    Primary Data Source  \
0     can still follow the rules but sometimes we yo...   
1                  Best information gide,thaks chat GPT   
2                     nice app. need more informations.   
3     Very nice app give all information and also gi...   
4     I think will take so long to a great, interest...   
...                                                 ...   
4864  Pros- plenty loud for a large room with high c...   
4865  Love this product

In [None]:
# Destination of the combined, filtered review table on the mounted Drive.
output_path = '/content/drive/MyDrive/AI_Reviews/Filtered_AI_Reviews_Master.csv'

# index=False: the positional row index is not written to the file.
final_master_df.to_csv(output_path, index=False)

# Short run summary, one line per message.
for summary_line in (
    "Processing complete!",
    f"Final master CSV saved to: {output_path}",
    f"Total number of rows in the new file: {len(final_master_df)}",
):
    print(summary_line)

Processing complete!
Final master CSV saved to: /content/drive/MyDrive/AI_Reviews/Filtered_AI_Reviews_Master.csv
Total number of rows in the new file: 4869
