In [8]:
import json
import re
from collections import defaultdict, Counter
import pandas as pd

In [2]:
# Load the original JSON data
with open('gg2013.json', 'r') as file:
    tweets = json.load(file)#[:170000]

# Function to identify award patterns dynamically
def find_award_patterns(tweets):
    award_patterns = []
    for tweet in tweets:
        text = tweet['text'].lower()
        matches = re.findall(r'best [\w\s\-]+', text)
        for match in matches:
            award_patterns.append(match.strip())
    return award_patterns

# Find award patterns dynamically from the tweets
award_patterns = find_award_patterns(tweets)

# Count the frequency of each award pattern
award_counter = Counter(award_patterns)

In [16]:
# Get the top 25 most frequent awards
top_50_awards = award_counter.most_common(200)

In [17]:
top_25_awards_dict = {award: count for award, count in top_50_awards}

In [18]:
df= pd.DataFrame([top_25_awards_dict.keys(), top_25_awards_dict.values()]).T
df.columns= ['award', 'values'] 

In [19]:
df.head()

Unnamed: 0,award,values
0,best motion picture,423
1,best original song,401
2,best director,379
3,best actor,346
4,best actor in a motion picture,280


In [20]:
awards=df['award'].values

In [21]:
awards

array(['best motion picture', 'best original song', 'best director',
       'best actor', 'best actor in a motion picture', 'best picture',
       'best supporting actress in a motion picture - anne hathaway - les miserables -',
       'best original song -', 'best supporting actress',
       'best screenplay', 'best actress',
       'best actress in a motion picture',
       'best motion picture - comedy or musical - les miserables -',
       'best supporting actress in a motion picture',
       'best director - ben affleck',
       'best actor in a motion picture - comedy or musical - hugh jackman',
       'best tv series',
       'best actress in a motion picture - comedy or musical - jennifer lawrence - silver linings playbook -',
       'best supporting actor', 'best dressed',
       'best supporting actor in a motion picture - christoph waltz - django unchained -',
       'best motion picture - drama - argo -',
       'best supporting actress in a tv movie', 'best speech ever',
 

In [27]:
import re

def clean_awards(awards):
    transformed_awards = set()
    
    for award in awards:
        # Normalize text by converting to lowercase and removing leading/trailing whitespace
        clean_award = award.lower().strip()
        
        # Remove extra details like names, movie titles, URLs, etc.
        clean_award = re.sub(r' - .*', '', clean_award)  # Remove everything after the first " - "
        clean_award = re.sub(r'http\S+', '', clean_award)  # Remove URLs
        clean_award = re.sub(r'\b(for|goes to|awarded to|win at|at the)\b.*', '', clean_award)  # Remove specific trailing phrases
        clean_award = re.sub(r'\bbest performance by an .*', '', clean_award)  # Remove detailed performer descriptions
        
        # Standardize phrases
        clean_award = re.sub(r'\b(tv series|television series)\b', 'television series', clean_award)
        clean_award = re.sub(r'\bmini-series\b', 'mini-series or motion picture made for television', clean_award)
        clean_award = re.sub(r'\bmovie\b', 'motion picture', clean_award)
        
        # Add the cleaned award to the set to ensure uniqueness
        transformed_awards.add(clean_award.strip())
    
    # Sort the set and return it as a list
    return sorted(transformed_awards)

In [28]:
res= clean_awards(awards)

In [30]:
res

['',
 'best acceptance speech ever',
 'best actor',
 'best actor at',
 'best actor in a comedy or musical',
 'best actor in a comedy or musical television series',
 'best actor in a drama wa',
 'best actor in a miniseries',
 'best actor in a miniseries or tv motion picture',
 'best actor in a motion picture',
 'best actor in a television series',
 'best actor in a tv motion picture or miniseries',
 'best actor television series',
 'best actress',
 'best actress award',
 'best actress drama winner at golden globes 2013',
 'best actress here',
 'best actress in a comedy',
 'best actress in a comedy or musical',
 'best actress in a comedy series',
 'best actress in a drama',
 'best actress in a mini-series or motion picture made for television',
 'best actress in a miniseries or tv motion picture',
 'best actress in a motion picture',
 'best actress in a motion picture drama is jessica chastain',
 'best actress in a television series',
 'best actress in a television series comedy or music