In [1]:
import pandas as pd
import spacy
import nltk
from nltk import bigrams, trigrams
from nltk.corpus import stopwords
from collections import Counter



In [None]:
# Small English spaCy pipeline; later cells use it for entity labels and POS tags.
nlp = spacy.load('en_core_web_sm')

# Survey responses: one row per submitted review (comma-separated, the pandas default).
df = pd.read_csv('reviews.csv')

# Places to summarise. The spellings are compared for exact equality against the
# 'Place You Want to Review About' column, so they must match the CSV values.
places = [
    'Charminar',
    'Golconda Fort',
    'Wonderla',
    'Ramoji Film City',
    'Nehru Zoological Park',
    'Birla Science Museum',
    'Hussain Sagar Lake',
    'Birla mandir',
    'Cable Bridge',
    'NTR Garden',
]




In [None]:
# Build one summary record per place (average rating, most common company,
# top named places, top adjectives, top bigrams/trigrams) and collect the
# records into `df_result`.
#
# Inputs expected from earlier cells: `df` (survey DataFrame), `nlp` (spaCy
# pipeline) and `places` (list of place names).
# NOTE(review): nltk.word_tokenize and stopwords.words need the NLTK tokenizer
# and 'stopwords' data to be installed locally; this notebook does not download them.

# Generic adjectives that say little about a specific place; skipped when ranking adjectives.
exclude_adjectives = {'good', 'great', 'nice', 'many', 'more', 'best', 'better', 'main', 'available', 'big', 'most', 'other', 'full', 'free', 'same', 'few', 'small'}

# Entity strings to ignore when counting 'GPE' entities (the reviewed places
# themselves, the host city/state/country and common misspellings of them).
# The set never changes, so it is defined once instead of once per place.
exclude_words = {'Charminar', 'Hyderabad', 'India', 'Golconda', 'Telangana', 'West Bengal',
                 'Overall', 'hyderabad', 'Hyerabad', 'Hyderabad City',
                 'Ramoji Film City', 'Hyderbad', 'Telengana', 'Hussain Sagar Lake', 'Hussain',
                 'Bengaluru', 'Karnataka', 'Andhra', 'charminar', 'wonderla', 'Wonderla'}

# Survey column names, named once so a typo cannot silently select the wrong column.
PLACE_COL = 'Place You Want to Review About'
COMPANY_COL = 'Who Was Your Company During Your Visit'
RATING_COL = 'On a Scale of 1-5 Rate the Place'
REVIEW_COL = 'A Detailed Review of the Place'

# How many entries to keep in every "top ..." list.
TOP_N = 5

# Load the stopword list once and keep it as a set: the original evaluated
# stopwords.words('english') for every single token, re-reading the corpus
# and doing a linear list search each time.
english_stopwords = set(stopwords.words('english'))

# One dict per place; turned into a DataFrame after the loop.
data_per_place = []

for place in places:
    # All survey rows for this place (filtered once and reused below).
    place_reviews = df[df[PLACE_COL] == place]

    # value_counts() ignores missing answers, so it can be empty when the place
    # has no rows or nobody answered; idxmax() would raise ValueError then.
    partner_counts = place_reviews[COMPANY_COL].value_counts()
    likely_partner = partner_counts.idxmax() if not partner_counts.empty else None

    # Key order is kept as in the original because it fixes the column order of df_result.
    place_data = {
        'Place': place,
        'Avg_Rating': place_reviews[RATING_COL].mean(),
        'Likely_Partner': likely_partner,
    }

    # Missing reviews are read as NaN (a float), which neither nlp() nor
    # str.join() accepts; drop them and make sure the rest are strings.
    review_texts = place_reviews[REVIEW_COL].dropna().astype(str).tolist()

    # Run the spaCy pipeline once per review and feed both counters from the
    # same Doc (the original parsed every review twice).
    common_entities_per_place = Counter()
    common_adjectives_per_place = Counter()
    for review in review_texts:
        doc = nlp(review)
        for ent in doc.ents:
            if ent.label_ == 'GPE' and ent.text not in exclude_words:
                common_entities_per_place[ent.text] += 1
        for token in doc:
            if token.pos_ == 'ADJ':
                adjective = token.text.lower()
                if adjective not in exclude_adjectives:
                    common_adjectives_per_place[adjective] += 1

    # Excluded entities were never counted, so no second filter is needed here.
    place_data['Common_Names'] = [entity for entity, _ in common_entities_per_place.most_common(TOP_N)]
    place_data['Common_Adjectives'] = [adjective for adjective, _ in common_adjectives_per_place.most_common(TOP_N)]

    # N-grams are computed over all reviews of the place joined into one
    # lower-cased text, keeping only alphanumeric, non-stopword tokens.
    combined_text = ' '.join(review_texts)
    words = [word for word in nltk.word_tokenize(combined_text.lower())
             if word.isalnum() and word not in english_stopwords]

    bigram_counter = nltk.FreqDist(bigrams(words))
    place_data['Most_Frequent_Bigrams'] = [bigram for bigram, _ in bigram_counter.most_common(TOP_N)]

    trigram_counter = nltk.FreqDist(trigrams(words))
    place_data['Most_Frequent_Trigrams'] = [trigram for trigram, _ in trigram_counter.most_common(TOP_N)]

    data_per_place.append(place_data)

# One row per place, columns in the order the keys were inserted above.
df_result = pd.DataFrame(data_per_place)

In [None]:
# Write the per-place summary to disk; index=False leaves the positional row index out of the file.
output_path = 'data-analysis.csv'
df_result.to_csv(output_path, index=False)