## 1. Setup: Clone Repository and Install Dependencies

In [None]:
# Clone the repository (if running in Colab)
import os
if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab - cloning repository...")
    !git clone https://github.com/YOUR_USERNAME/Project_NLP.git
    os.chdir('Project_NLP')
else:
    print("Running locally")
    # Adjust path if needed
    os.chdir('..')

In [None]:
# Install required packages
!pip install -q conllu pandas numpy nltk scikit-learn matplotlib seaborn

In [None]:
# Download NLTK data
import nltk

# nltk.download() signals failure through its boolean return value rather
# than by raising, so collect the resources that could not be fetched and
# only report success when every download worked.
nltk_resources = ('punkt', 'wordnet', 'averaged_perceptron_tagger')
failed_resources = [
    resource for resource in nltk_resources
    if not nltk.download(resource, quiet=True)
]

if failed_resources:
    print(f"✗ Could not download NLTK data: {', '.join(failed_resources)}")
else:
    print("✓ NLTK data downloaded")

## 2. Download Albanian Corpus (if not present)

In [None]:
# Check if data exists, if not download it
import os
from pathlib import Path

data_file = Path('data/sq_tsa-ud-test.conllu')
if not data_file.exists():
    print("Downloading Albanian corpus...")
    os.makedirs('data', exist_ok=True)
    !curl -L "https://raw.githubusercontent.com/UniversalDependencies/UD_Albanian-TSA/master/sq_tsa-ud-test.conllu" -o data/sq_tsa-ud-test.conllu
    print("✓ Albanian corpus downloaded")
else:
    print("✓ Albanian corpus already present")

## 3. Import Modules

In [None]:
# Add the src/ directory to the module search path. The path is relative,
# so it resolves against the current working directory (the setup cell is
# expected to have changed into the repository root — confirm if imports fail).
import sys
sys.path.append('src')

# Project-local modules from the src package
from src.ud_loader import load_conllu_file, extract_sentence_data
from src.corpus_statistics import (compute_corpus_statistics, compute_pos_distribution,
                                    create_statistics_summary, get_top_frequent_words,
                                    get_top_frequent_lemmas)
from src.preprocessor import TextPreprocessor
from src.similarity import SimilarityAnalyzer
from src.visualizer import (plot_pos_distribution, plot_similarity_distribution,
                             plot_sentence_length_distribution, plot_top_frequent_words)
# Third-party libraries used directly by the cells below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("✓ Modules imported successfully")

## 4. Load Albanian Corpus

In [None]:
# Corpus location and the language label passed to the plotting helpers below
LANGUAGE = "Albanian"
CONLLU_FILE = "data/sq_tsa-ud-test.conllu"

# Read the CoNLL-U file; the result is a sized collection of sentences
print(f"Loading {LANGUAGE} corpus from {CONLLU_FILE}...")
sentences = load_conllu_file(CONLLU_FILE)
print(f"✓ Loaded {len(sentences)} sentences")

# Build the keyed structure (tokens, lemmas, PoS tags, ...) that the
# remaining cells index into
corpus_data = extract_sentence_data(sentences)
print("✓ Extracted tokens, lemmas, and PoS tags")

## 5. Corpus Statistics

In [None]:
# Compute the corpus statistics, then turn them into a summary table
stats = compute_corpus_statistics(corpus_data)
stats_df = create_statistics_summary(stats)

# Heading and rule are emitted on two lines, followed by the rendered table
print("Albanian Corpus Statistics:", "=" * 60, sep="\n")
display(stats_df)

## 6. Part-of-Speech Distribution

In [None]:
# Part-of-speech distribution computed over every tag in the corpus
pos_df = compute_pos_distribution(corpus_data['all_pos_tags'])

# Show only the first ten rows of the table
print("Albanian PoS Tag Distribution:", "=" * 60, sep="\n")
display(pos_df.head(10))

# Visualize (the outputs/ directory is created on first use)
os.makedirs('outputs', exist_ok=True)
plot_pos_distribution(
    pos_df,
    language=LANGUAGE,
    save_path='outputs/pos_distribution.png',
)
print("\n✓ Visualization saved to outputs/pos_distribution.png")

## 7. Sentence Length Distribution

In [None]:
# One length per sentence: the number of tokens it contains
sent_lengths = list(map(len, corpus_data['sentence_tokens']))

plot_sentence_length_distribution(
    sent_lengths,
    language=LANGUAGE,
    save_path='outputs/sentence_length_distribution.png',
)
print("✓ Sentence length distribution saved")

## 8. Most Frequent Words and Lemmas

In [None]:
# Twenty most frequent surface forms
top_words = get_top_frequent_words(corpus_data['all_tokens'], top_n=20)
print("Top 20 Most Frequent Albanian Words:", "=" * 60, sep="\n")
display(top_words)

# Twenty most frequent lemmas
top_lemmas = get_top_frequent_lemmas(corpus_data['all_lemmas'], top_n=20)
print("\nTop 20 Most Frequent Albanian Lemmas:", "=" * 60, sep="\n")
display(top_lemmas)

## 9. Text Preprocessing Example

In [None]:
# Example Albanian sentence run through the preprocessing pipeline
test_sentence = "Përpunimi i gjuhës natyrore është shumë interesant."

preprocessor = TextPreprocessor()
result = preprocessor.process_sentence(test_sentence)

# The result is indexed by 'original', 'tokens', 'lemmas' and 'stems'
print("Text Preprocessing Example:", "=" * 60, sep="\n")
print(
    f"Original: {result['original']}",
    f"Tokens:   {result['tokens']}",
    f"Lemmas:   {result['lemmas']}",
    f"Stems:    {result['stems']}",
    sep="\n",
)

## 10. TF-IDF Vectorization

In [None]:
# Work on at most the first 1000 sentences of the corpus
SUBSET_SIZE = min(1000, len(corpus_data['sentence_texts']))
subset_sentences = corpus_data['sentence_texts'][:SUBSET_SIZE]

# Unigram + bigram features, capped at 5000, with no stop-word filtering
analyzer = SimilarityAnalyzer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words=None
)

tfidf_matrix = analyzer.fit_transform(subset_sentences)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {tfidf_matrix.shape[1]}")
print(f"Number of sentences: {tfidf_matrix.shape[0]}")

# Sample TF-IDF scores. Sentence index 5 only exists when the subset holds
# at least six sentences, so guard it the same way the similarity cell
# guards its hard-coded sentence indices.
if len(subset_sentences) > 5:
    sample_tfidf = analyzer.get_sentence_tfidf(5, top_n=10)
    print("\nTop TF-IDF terms for sentence 5:")
    display(sample_tfidf)

## 11. Similarity Analysis

In [None]:
# Pairwise similarity / distance matrices over the fitted sentences
cosine_sim = analyzer.compute_cosine_similarity()
euclidean_dist = analyzer.compute_euclidean_distance()

print("Similarity Analysis:", "=" * 60, sep="\n")

# Summary statistics of the cosine similarities, one per line
sim_stats = analyzer.get_similarity_statistics(cosine_sim)
print("\nCosine Similarity Statistics:")
for key, value in sim_stats.items():
    print(f"  {key.capitalize()}: {value:.4f}")

# Example comparison; skipped when sentence index 25 does not exist
if len(subset_sentences) > 25:
    comparison = analyzer.compare_sentences(10, 25, cosine_sim, euclidean_dist)
    print(
        "\nExample: Sentence 10 vs Sentence 25",
        f"  Cosine Similarity: {comparison['cosine_similarity']:.4f}",
        f"  Euclidean Distance: {comparison['euclidean_distance']:.4f}",
        sep="\n",
    )

## 12. Most Similar Sentence Pairs

In [None]:
# Ten highest-scoring sentence pairs; each entry unpacks as
# (index 1, index 2, similarity, sentence 1, sentence 2)
most_similar = analyzer.find_most_similar_pairs(subset_sentences, cosine_sim, top_n=10)

print("Top 10 Most Similar Albanian Sentence Pairs:", "=" * 80, sep="\n")

for rank, (idx1, idx2, similarity, sent1, sent2) in enumerate(most_similar, start=1):
    # One record per pair: score line, both sentences, then a separator rule
    print(
        f"\nRank {rank} - Similarity: {similarity:.4f}",
        f"  [{idx1}] {sent1}",
        f"  [{idx2}] {sent2}",
        "-" * 80,
        sep="\n",
    )

## 13. Visualize Similarity Distribution

In [None]:
# Indices strictly above the main diagonal (k=1), so self-similarities are
# left out and each pair is selected once
upper_triangle = np.triu_indices(cosine_sim.shape[0], k=1)
similarity_values = cosine_sim[upper_triangle]

plot_similarity_distribution(
    similarity_values,
    save_path='outputs/similarity_distribution.png',
)
print("✓ Similarity distribution plot saved")

## 14. Save Results

In [None]:
# Create reports directory
os.makedirs('reports', exist_ok=True)

# Save statistics: each table goes to its own CSV without the index column
for table, csv_path in (
    (stats_df, 'reports/corpus_statistics.csv'),
    (pos_df, 'reports/pos_distribution.csv'),
    (top_words, 'reports/top_frequent_words.csv'),
    (top_lemmas, 'reports/top_frequent_lemmas.csv'),
):
    table.to_csv(csv_path, index=False)

# Save similar pairs: one row per pair, ranked from 1 in list order
similar_results = [
    {
        'Rank': position,
        'Sentence_1_Index': first_idx,
        'Sentence_2_Index': second_idx,
        'Similarity': score,
        'Sentence_1': first_text,
        'Sentence_2': second_text,
    }
    for position, (first_idx, second_idx, score, first_text, second_text)
    in enumerate(most_similar, start=1)
]

similar_df = pd.DataFrame(similar_results)
similar_df.to_csv('reports/most_similar_pairs.csv', index=False)

# Summary of everything written by this notebook
print(
    "✓ All results saved to reports/ directory",
    "\nGenerated files:",
    "  - reports/corpus_statistics.csv",
    "  - reports/pos_distribution.csv",
    "  - reports/top_frequent_words.csv",
    "  - reports/top_frequent_lemmas.csv",
    "  - reports/most_similar_pairs.csv",
    "  - outputs/pos_distribution.png",
    "  - outputs/sentence_length_distribution.png",
    "  - outputs/similarity_distribution.png",
    sep="\n",
)

## 15. Display Visualizations

In [None]:
from IPython.display import Image, display

# Show each saved figure under its caption, in the order it was generated
for caption, image_path in (
    ("Albanian PoS Distribution:", 'outputs/pos_distribution.png'),
    ("\nSentence Length Distribution:", 'outputs/sentence_length_distribution.png'),
    ("\nSimilarity Distribution:", 'outputs/similarity_distribution.png'),
):
    print(caption)
    display(Image(image_path))