## Task 3 Part c)

In [1]:
import os
import re
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import librosa
import librosa.display
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Download necessary NLTK resources (each call is a no-op when the package is
# already up to date).
# 'punkt_tab' is the tokenizer data word_tokenize() loads on NLTK >= 3.8.2,
# where it replaced the pickled 'punkt' models; 'punkt' is still fetched so the
# notebook keeps working on older NLTK installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rishita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rishita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Paths to data folders
# Directory that load_anthem_texts() scans for "*.txt" files; each file name
# (minus the extension) is used as the country name. The path is relative, so
# it is resolved against the current working directory.
ANTHEM_TEXT_DIR = "national_anthems"  # Folder containing anthem text files

In [None]:
# Create results directory
# Every artifact written by the analysis (word cloud and PCA images, TF-IDF,
# cluster and topic CSVs) is saved under this relative path.
RESULTS_DIR = "anthem_analysis_results"
# Runs as soon as this cell/module executes; exist_ok=True makes re-running it
# harmless when the directory already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

#################################################################
# PART C: TEXTUAL ANALYSIS
#################################################################

def load_anthem_texts(text_dir=None):
    """Load all anthem text files into a dictionary.

    Args:
        text_dir: Folder to scan for ``*.txt`` files. When omitted, the
            module-level ``ANTHEM_TEXT_DIR`` is used.

    Returns:
        dict mapping country name (the file name without its extension) to
        the full file contents decoded as UTF-8. Entries are inserted in
        sorted path order. Files that cannot be read or decoded are reported
        on stdout and skipped.
    """
    if text_dir is None:
        text_dir = ANTHEM_TEXT_DIR

    anthem_texts = {}

    # glob yields files in filesystem-dependent order; sorting keeps the row
    # order of everything derived from this dict stable between runs/machines.
    for file_path in sorted(glob.glob(os.path.join(text_dir, "*.txt"))):
        # splitext drops only the final extension, whereas str.replace would
        # also delete a ".txt" that appears in the middle of the name.
        country_name = os.path.splitext(os.path.basename(file_path))[0]
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                anthem_texts[country_name] = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: one unreadable file should not abort the whole load.
            print(f"Error reading {file_path}: {e}")

    print(f"Loaded {len(anthem_texts)} anthem text files.")
    return anthem_texts

def preprocess_text(text):
    """Preprocess text: lowercase, remove punctuation, remove stopwords, lemmatize.

    Args:
        text: Raw anthem text.

    Returns:
        A single space-joined string of lemmatized tokens. Tokens that are
        English stopwords or have two characters or fewer are dropped.
    """
    # Lowercase
    text = text.lower()

    # Remove punctuation and numbers. "_" is matched explicitly because \w
    # treats it as a word character, so [^\w\s] alone would leave underscores
    # (e.g. "_chorus_") glued to the tokens.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Drop stopwords / very short tokens and lemmatize the rest in one pass.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    )

def _plot_wordcloud(all_text):
    """Render the combined corpus as a word cloud and save it under RESULTS_DIR."""
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Most Common Words in National Anthems')
    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, 'anthem_wordcloud.png'))


def _plot_text_pca(dense_tfidf, countries):
    """Scatter the TF-IDF rows on two principal components, labelled by country, and save the plot."""
    pca_result = PCA(n_components=2).fit_transform(dense_tfidf)

    plt.figure(figsize=(12, 8))
    plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)

    # Add country labels to points
    for country, (x, y) in zip(countries, pca_result):
        plt.annotate(country, (x, y), fontsize=8, alpha=0.8)

    plt.title('PCA of National Anthem Texts')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, 'anthem_text_pca.png'))


def _cluster_texts(tfidf_matrix, countries, n_clusters):
    """K-means the TF-IDF rows, print up to five members per cluster and save the assignments."""
    clusters = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(tfidf_matrix)
    cluster_df = pd.DataFrame({'Country': countries, 'Cluster': clusters})

    print("\nAnthem text clusters:")
    for cluster_id in range(n_clusters):
        members = cluster_df.loc[cluster_df['Cluster'] == cluster_id, 'Country'].tolist()
        print(f"Cluster {cluster_id}: {', '.join(members[:5])}{'...' if len(members) > 5 else ''}")

    cluster_df.to_csv(os.path.join(RESULTS_DIR, 'anthem_text_clusters.csv'), index=False)


def _model_topics(tfidf_matrix, feature_names, countries, n_topics, n_top_words):
    """Fit NMF topics, print the highest-weighted words per topic and save the document-topic weights."""
    nmf = NMF(n_components=n_topics, random_state=42)
    nmf_result = nmf.fit_transform(tfidf_matrix)

    print("\nTop words for each topic:")
    for topic_idx, topic in enumerate(nmf.components_):
        # argsort is ascending, so reverse it to list the heaviest words first.
        top_words = [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        print(f"Topic {topic_idx}: {', '.join(top_words)}")

    nmf_df = pd.DataFrame(nmf_result, index=countries)
    nmf_df.to_csv(os.path.join(RESULTS_DIR, 'anthem_topics.csv'))


def analyze_anthem_texts(anthem_texts, n_clusters=5, n_topics=5, n_top_words=10):
    """Perform textual analysis on anthem texts.

    Prints length statistics, the 15 most common words, cluster membership and
    topic keywords, and writes these files into ``RESULTS_DIR``:
    ``anthem_wordcloud.png``, ``anthem_tfidf.csv``, ``anthem_text_pca.png``,
    ``anthem_text_clusters.csv`` and ``anthem_topics.csv``.

    Args:
        anthem_texts: dict mapping country name to raw anthem text.
        n_clusters: Number of K-means clusters requested. Reduced to the
            number of anthems when fewer anthems than clusters are supplied.
        n_topics: Number of NMF topics.
        n_top_words: How many top-weighted words to print per topic.

    Returns:
        Tuple ``(processed_texts, tfidf_matrix, tfidf_df)``: the preprocessed
        text per country, the sparse TF-IDF matrix, and the same matrix as a
        DataFrame indexed by country with one column per term.

    Raises:
        ValueError: If ``anthem_texts`` is empty or no tokens survive
            preprocessing.
    """
    # Fail early with a clear message instead of the cryptic
    # "min() arg is an empty sequence" the statistics below would produce.
    if not anthem_texts:
        raise ValueError("anthem_texts is empty; there are no anthems to analyze.")

    print("\n--- TEXTUAL ANALYSIS ---")

    # Preprocess texts
    processed_texts = {country: preprocess_text(text) for country, text in anthem_texts.items()}
    countries = list(processed_texts)
    documents = list(processed_texts.values())

    # Basic statistics
    text_lengths = {country: len(text.split()) for country, text in processed_texts.items()}
    avg_length = np.mean(list(text_lengths.values()))
    print(f"Average anthem length (after preprocessing): {avg_length:.2f} words")

    # Find shortest and longest anthems
    shortest = min(text_lengths.items(), key=lambda x: x[1])
    longest = max(text_lengths.items(), key=lambda x: x[1])
    print(f"Shortest anthem: {shortest[0]} ({shortest[1]} words)")
    print(f"Longest anthem: {longest[0]} ({longest[1]} words)")

    # Create a single corpus for overall analysis
    all_text = ' '.join(documents)
    words = all_text.split()
    if not words:
        raise ValueError("No tokens remain after preprocessing; nothing to analyze.")

    # Most common words across all anthems
    print("\nMost common words across all anthems:")
    for word, count in Counter(words).most_common(15):
        print(f"{word}: {count}")

    _plot_wordcloud(all_text)

    # TF-IDF Analysis
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Densify once and share the array between the DataFrame and PCA.
    dense_tfidf = tfidf_matrix.toarray()
    tfidf_df = pd.DataFrame(dense_tfidf, index=countries, columns=feature_names)
    tfidf_df.to_csv(os.path.join(RESULTS_DIR, 'anthem_tfidf.csv'))

    # Dimensionality reduction for visualization. Two components need at
    # least two documents and two terms.
    if min(dense_tfidf.shape) >= 2:
        _plot_text_pca(dense_tfidf, countries)
    else:
        print("\nSkipping PCA plot: at least 2 anthems and 2 terms are required.")

    # Clustering analysis. K-means cannot form more clusters than there are
    # documents, so cap the requested count.
    effective_clusters = min(n_clusters, len(countries))
    if effective_clusters < n_clusters:
        print(f"\nOnly {len(countries)} anthems available; using {effective_clusters} clusters.")
    _cluster_texts(tfidf_matrix, countries, effective_clusters)

    # Topic modeling with NMF
    _model_topics(tfidf_matrix, feature_names, countries, n_topics, n_top_words)

    # Sentiment analysis could be added here with TextBlob or VADER

    return processed_texts, tfidf_matrix, tfidf_df


def main():
    """Run the textual analysis (Part C) end to end.

    Loads the anthem texts, runs ``analyze_anthem_texts`` on them and reports
    where the results were written. When no anthem files could be loaded the
    function prints a message and returns without running the analysis.
    """
    print("Starting National Anthem Multimodal Analysis...")

    # Part C: Textual Analysis
    anthem_texts = load_anthem_texts()
    if not anthem_texts:
        # The analysis cannot work on an empty corpus; stop here with an
        # actionable message rather than failing deep inside the statistics.
        print(f"No anthem text files found in '{ANTHEM_TEXT_DIR}'; nothing to analyze.")
        return

    analyze_anthem_texts(anthem_texts)

    print(f"\nAnalysis complete. Results saved to {RESULTS_DIR}")


if __name__ == "__main__":
    main()