# In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import random



# Fetch the tokenizer and POS-tagger data that word_tokenize / pos_tag rely on.
# Newer NLTK releases (3.9+) load the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' resources instead of the legacy pickled
# ones, so both generations are requested to work with either NLTK version.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)



# Hugging Face token-classification pipeline used by extract_location, which
# reads the 'word' and 'entity_group' keys of each returned entity.
ner_model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
ner = pipeline(
    task="ner",
    model=ner_model_name,
    aggregation_strategy="simple",
)


# Lower-case vocabulary of disaster types recognised by this script.
disaster_words = {
    "earthquake",
    "flood",
    "cyclone",
    "landslide",
    "drought",
    "heatwave",
    "thunderstorm",
    "cloudburst",
    "tornado",
    "tsunami",
}


# City names substituted into the synthetic tweet templates.
indian_cities = [
    "Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai",
    "Kolkata", "Pune", "Ahmedabad", "Jaipur", "Lucknow",
    "Kanpur", "Nagpur", "Indore", "Thane", "Bhopal",
    "Visakhapatnam", "Pimpri-Chinchwad", "Patna", "Vadodara", "Ghaziabad",
    "Ludhiana", "Agra", "Nashik", "Faridabad", "Meerut",
    "Rajkot", "Kalyan-Dombivli", "Vasai-Virar", "Varanasi", "Srinagar",
    "Aurangabad", "Dhanbad", "Amritsar", "Navi Mumbai", "Allahabad",
    "Ranchi", "Howrah", "Coimbatore", "Jabalpur", "Gwalior",
    "Vijayawada", "Jodhpur", "Madurai", "Raipur", "Kota",
    "Guwahati", "Chandigarh", "Solapur", "Hubballi-Dharwad", "Tiruchirappalli",
]

def generate_indian_disaster_tweets(num_tweets=1000, *, disaster_ratio=0.7,
                                    disasters=None, cities=None, rng=None):
    """Generate synthetic tweets about Indian cities.

    Each tweet is built from a randomly chosen template. With probability
    ``disaster_ratio`` a disaster template is filled with a capitalised
    disaster word and a city; otherwise a non-disaster template is filled
    with a city only.

    Args:
        num_tweets: Number of tweets to generate.
        disaster_ratio: Probability that a tweet describes a disaster
            (0.0 yields none, 1.0 yields only disaster tweets).
        disasters: Iterable of disaster words to draw from. Defaults to the
            module-level ``disaster_words``.
        cities: Iterable of city names to draw from. Defaults to the
            module-level ``indian_cities``.
        rng: Object providing ``random()`` and ``choice()`` (for example a
            seeded ``random.Random``). Defaults to the ``random`` module.

    Returns:
        A list of ``num_tweets`` tweet strings.
    """
    disaster_templates = [
        "Massive {} hits {}! People struggling to cope.",
        "Breaking: {} strikes {} causing widespread panic.",
        "{} in {} leaves thousands stranded. Govt. agencies on high alert.",
        "Unexpected {} catches {} off guard. Emergency services overwhelmed.",
        "{} warning issued for {}. Residents advised to stay indoors.",
        "{}. {} reels under nature's fury. CM announces relief measures.",
        "{}. Chaos in {} as authorities scramble to respond.",
        "{} wreaks havoc in {}. Schools and offices closed.",
        "{}. {} faces worst disaster in decades. PM assures all help.",
        "Terrifying {} hits {}. Social media flooded with rescue requests.",
    ]

    non_disaster_templates = [
        "{} gears up for festive season. Markets buzzing with activity.",
        "New metro line inaugurated in {}. Commuters rejoice!",
        "{} hosts tech summit. Startups showcase cutting-edge innovations.",
        "Annual food festival kicks off in {}. Foodies flock in large numbers.",
        "{} sets new record in cleanliness drive. Mayor lauds citizen effort.",
        "Cultural extravaganza in {} draws tourists from across the globe.",
        "{}. Students shine in board exams. Parents and teachers proud.",
        "Traffic woes in {} as new flyover construction begins.",
        "{}. Local team clinches victory in inter-city cricket tournament.",
        "Art exhibition in {} showcases rich cultural heritage.",
    ]

    # Build the sampling pools once instead of on every iteration. The
    # disaster vocabulary is sorted because set iteration order changes
    # between interpreter runs (string hash randomisation), which would
    # otherwise make output irreproducible even with a seeded rng.
    disaster_pool = sorted(disaster_words if disasters is None else disasters)
    city_pool = list(indian_cities if cities is None else cities)
    rng = random if rng is None else rng

    tweets = []
    for _ in range(num_tweets):
        if rng.random() < disaster_ratio:
            template = rng.choice(disaster_templates)
            disaster = rng.choice(disaster_pool).capitalize()
            city = rng.choice(city_pool)
            tweets.append(template.format(disaster, city))
        else:
            template = rng.choice(non_disaster_templates)
            city = rng.choice(city_pool)
            tweets.append(template.format(city))

    return tweets

# Synthetic background corpus; passed to extract_disaster_type for TF-IDF fitting.
corpus = generate_indian_disaster_tweets(num_tweets=5000)


def extract_location(tweet):
    """Return the first location entity the NER pipeline finds in ``tweet``.

    An entity counts as a location when its ``entity_group`` is 'LOC' or
    'GPE'. Returns the entity's ``word`` text, or None when no such entity
    is present.
    """
    location_labels = ('LOC', 'GPE')
    for span in ner(tweet):
        if span['entity_group'] in location_labels:
            return span['word']
    return None

def extract_disaster_type(tweet, corpus):
    """Pick the word in ``tweet`` that best names its disaster type.

    The tweet is lower-cased, tokenized and POS-tagged; only nouns and verbs
    are kept as candidates. Candidates are ranked by their TF-IDF weight in
    the tweet, computed by fitting a vectorizer on ``corpus`` plus the tweet.

    Args:
        tweet: The tweet text to analyse.
        corpus: Iterable of background documents used for the IDF statistics.

    Returns:
        The highest-scoring candidate that is in ``disaster_words`` if there
        is one; otherwise the highest-scoring candidate overall; ``None``
        when the tweet contains no noun or verb tokens. Ties go to the word
        that appears first in the tweet.
    """
    # Tokenize and POS tag, keeping only nouns and verbs.
    tagged = pos_tag(word_tokenize(tweet.lower()))
    candidates = [word for word, tag in tagged if tag.startswith(('NN', 'VB'))]
    if not candidates:
        # Nothing to rank, so skip the (expensive) TF-IDF fit entirely.
        return None

    # Fit TF-IDF with the tweet appended as the last document.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(list(corpus) + [tweet])

    # Densify only the tweet's own row; converting the whole matrix would
    # allocate len(corpus) + 1 rows just to read the last one.
    tweet_row = tfidf_matrix[-1].toarray().ravel()
    vocabulary = vectorizer.vocabulary_

    def tfidf_score(word):
        # Tokens the vectorizer did not keep (e.g. single characters) score 0.
        column = vocabulary.get(word)
        return 0 if column is None else tweet_row[column]

    # Known disaster words always win over other tokens, so rank them on
    # their own when present. (A uniform score boost for disaster words would
    # not change their relative order, hence none is applied.)
    disaster_candidates = [word for word in candidates if word in disaster_words]
    return max(disaster_candidates or candidates, key=tfidf_score)

# Sample tweets (disaster and non-disaster) used to exercise both extractors.
test_tweets = [
    "Massive earthquake hits Mumbai! People struggling to cope.",
    "Breaking: Cyclone strikes Chennai causing widespread panic.",
    "Flood in Patna leaves thousands stranded. Govt. agencies on high alert.",
    "Unexpected landslide catches Shimla off guard. Emergency services overwhelmed.",
    "Drought warning issued for Marathwada. Residents advised to conserve water.",
    "Bangalore gears up for festive season. Markets buzzing with activity.",
    "New metro line inaugurated in Delhi. Commuters rejoice!",
    "Ahmedabad hosts tech summit. Startups showcase cutting-edge innovations.",
    "Heatwave. Nagpur reels under nature's fury. CM announces relief measures.",
    "Cloudburst wreaks havoc in Uttarakhand. Schools and offices closed.",
]

# Report the detected disaster type and location for every sample tweet,
# leaving a blank line between tweets.
for tweet in test_tweets:
    disaster_type = extract_disaster_type(tweet, corpus)
    location = extract_location(tweet)
    print(
        f"Tweet: {tweet}",
        f"Disaster Type: {disaster_type}",
        f"Location: {location}\n",
        sep="\n",
    )