# Medium Articles NLP Analysis

This notebook performs Natural Language Processing on a large dataset of Medium articles. Due to the size of the dataset, we'll implement techniques to handle memory constraints.

## 1. Import Libraries

In [2]:
# Data manipulation libraries
import pandas as pd
import numpy as np
import json
from datetime import datetime
import os
import gc
import pickle

# NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from bs4 import BeautifulSoup
from collections import Counter

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# For memory usage tracking
import psutil

# # Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aswat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aswat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aswat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Helper Functions for Memory Management

In [3]:
def get_memory_usage():
    """Get current memory usage in MB"""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024 * 1024)  # Convert to MB

def print_memory_usage(message=""):
    """Print memory usage with optional message"""
    memory_mb = get_memory_usage()
    print(f"{message} - Memory usage: {memory_mb:.2f} MB")
    
# Create directory for intermediate results if it doesn't exist
if not os.path.exists('intermediate_data'):
    os.makedirs('intermediate_data')

## 2. Load the Dataset

In [4]:
# We'll use chunking to handle large files

def load_dataset_in_chunks(file_path, chunk_size=10000):
    """Load large Csv dataset in chunks"""
    print(f"Loading dataset from {file_path}...")
    
    chunks = []
    total_rows = 0
    
    # Use chunksize to load in batches
    for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
        print(f"Loaded chunk {i+1} with {len(chunk)} rows")
        chunks.append(chunk)
        total_rows += len(chunk)
        print_memory_usage()
    
    print(f"Total rows loaded: {total_rows}")
    return chunks

# Replace 'medium_articles.json' with your actual file path
file_path = 'medium_articles.csv'  
data_chunks = load_dataset_in_chunks(file_path)

Loading dataset from medium_articles.csv...
Loaded chunk 1 with 10000 rows
 - Memory usage: 341.80 MB
Loaded chunk 2 with 10000 rows
 - Memory usage: 455.39 MB
Loaded chunk 3 with 10000 rows
 - Memory usage: 562.93 MB
Loaded chunk 4 with 10000 rows
 - Memory usage: 674.85 MB
Loaded chunk 5 with 10000 rows
 - Memory usage: 784.25 MB
Loaded chunk 6 with 10000 rows
 - Memory usage: 895.07 MB
Loaded chunk 7 with 10000 rows
 - Memory usage: 1002.96 MB
Loaded chunk 8 with 10000 rows
 - Memory usage: 1100.00 MB
Loaded chunk 9 with 10000 rows
 - Memory usage: 1198.41 MB
Loaded chunk 10 with 10000 rows
 - Memory usage: 1295.29 MB
Loaded chunk 11 with 10000 rows
 - Memory usage: 1392.66 MB
Loaded chunk 12 with 10000 rows
 - Memory usage: 1489.96 MB
Loaded chunk 13 with 10000 rows
 - Memory usage: 1585.64 MB
Loaded chunk 14 with 10000 rows
 - Memory usage: 1685.90 MB
Loaded chunk 15 with 10000 rows
 - Memory usage: 1801.27 MB
Loaded chunk 16 with 10000 rows
 - Memory usage: 1919.40 MB
Loaded chun

## 3. Data Exploration

In [5]:
# Examine the first chunk to understand the data structure
sample_df = data_chunks[0]

# Display basic information
print("Sample data structure:")
sample_df.head()

Sample data structure:


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [6]:
# Display dataframe info
print("\nDataframe info:")
sample_df.info()


Dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9999 non-null   object
 1   text       10000 non-null  object
 2   url        10000 non-null  object
 3   authors    10000 non-null  object
 4   timestamp  10000 non-null  object
 5   tags       10000 non-null  object
dtypes: object(6)
memory usage: 468.9+ KB


In [7]:
# Display statistics for numeric columns (if any)
print("\nDescriptive statistics:")
sample_df.describe(include='all')


Descriptive statistics:


Unnamed: 0,title,text,url,authors,timestamp,tags
count,9999,10000,10000,10000,10000,10000
unique,9976,9912,10000,5652,9988,9842
top,Our FAQs,in In Fitness And In Health,https://medium.com/brian-berg/myfitnesspal-re-...,[],2020-11-19 01:16:58.476000+00:00,['Startup']
freq,8,21,1,691,8,9


In [8]:
# Check for column data types and null values across all chunks
def analyze_all_chunks(chunks):
    total_rows = 0
    null_counts = {}
    
    for i, chunk in enumerate(chunks):
        total_rows += len(chunk)
        
        # Count nulls in this chunk
        chunk_nulls = chunk.isnull().sum()
        
        # Update total null counts
        for col in chunk.columns:
            if col not in null_counts:
                null_counts[col] = 0
            null_counts[col] += chunk_nulls[col]
    
    print(f"Total rows across all chunks: {total_rows}")
    print("\nNull values per column:")
    for col, count in null_counts.items():
        percentage = (count / total_rows) * 100
        print(f"{col}: {count} nulls ({percentage:.2f}%)")

analyze_all_chunks(data_chunks)

Total rows across all chunks: 192368

Null values per column:
title: 5 nulls (0.00%)
text: 0 nulls (0.00%)
url: 0 nulls (0.00%)
authors: 0 nulls (0.00%)
timestamp: 2 nulls (0.00%)
tags: 0 nulls (0.00%)


## 4. Data Type Conversion

In [9]:
import ast  # Add this import at the top

def convert_data_types(df):
    """Convert data types for timestamp, authors, and tags"""
    df_copy = df.copy()
    
    # Convert timestamp to datetime
    if 'timestamp' in df_copy.columns:
        df_copy['timestamp'] = pd.to_datetime(df_copy['timestamp'], errors='coerce')

    # Helper function to safely evaluate strings into lists
    def safe_parse_list(x):
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except (ValueError, SyntaxError):
                return []
        elif isinstance(x, list):
            return x
        else:
            return []

    # Ensure authors and tags are proper lists
    if 'authors' in df_copy.columns:
        df_copy['authors'] = df_copy['authors'].apply(safe_parse_list)

    if 'tags' in df_copy.columns:
        df_copy['tags'] = df_copy['tags'].apply(safe_parse_list)
    
    return df_copy

# Process each chunk and save as intermediate file
for i, chunk in enumerate(data_chunks):
    print(f"Converting data types for chunk {i+1}...")
    converted_chunk = convert_data_types(chunk)
    
    # Save intermediate result
    intermediate_file = f'intermediate_data/converted_chunk_{i}.pkl'
    converted_chunk.to_pickle(intermediate_file)
    print(f"Saved converted chunk to {intermediate_file}")
    print_memory_usage()

# Clear memory
del data_chunks
gc.collect()
print_memory_usage("After clearing data chunks")

Converting data types for chunk 1...
Saved converted chunk to intermediate_data/converted_chunk_0.pkl
 - Memory usage: 2369.83 MB
Converting data types for chunk 2...
Saved converted chunk to intermediate_data/converted_chunk_1.pkl
 - Memory usage: 2428.20 MB
Converting data types for chunk 3...
Saved converted chunk to intermediate_data/converted_chunk_2.pkl
 - Memory usage: 2479.93 MB
Converting data types for chunk 4...
Saved converted chunk to intermediate_data/converted_chunk_3.pkl
 - Memory usage: 2530.85 MB
Converting data types for chunk 5...
Saved converted chunk to intermediate_data/converted_chunk_4.pkl
 - Memory usage: 2582.06 MB
Converting data types for chunk 6...
Saved converted chunk to intermediate_data/converted_chunk_5.pkl
 - Memory usage: 2632.87 MB
Converting data types for chunk 7...
Saved converted chunk to intermediate_data/converted_chunk_6.pkl
 - Memory usage: 2684.05 MB
Converting data types for chunk 8...
Saved converted chunk to intermediate_data/converted_

## 5. Data Cleaning

In [10]:
def clean_data(df):
    """Clean data by removing nulls, filling empty lists, and cleaning text"""
    df_clean = df.copy()
    
    # Remove rows with null values in critical columns
    critical_cols = ['title', 'text', 'url']
    df_clean = df_clean.dropna(subset=critical_cols)
    
    # Replace empty lists
    if 'authors' in df_clean.columns:
        df_clean['authors'] = df_clean['authors'].apply(lambda x: ['Unknown'] if not x else x)
    
    if 'tags' in df_clean.columns:
        df_clean['tags'] = df_clean['tags'].apply(lambda x: ['Untagged'] if not x else x)
    
    # Clean text columns
    def clean_text(text):
        if not isinstance(text, str):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()    
        
        # Remove newline characters
        text = re.sub(r'\n+', ' ', text)
        
        return text
    
    # Apply text cleaning to title and text columns
    if 'title' in df_clean.columns:
        df_clean['clean_title'] = df_clean['title'].apply(clean_text)
    
    if 'text' in df_clean.columns:
        df_clean['clean_text'] = df_clean['text'].apply(clean_text)
    
    return df_clean

# Process each converted chunk and save as intermediate file
num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('converted_chunk_')])

for i in range(num_chunks):
    # Load the converted chunk
    file_path = f'intermediate_data/converted_chunk_{i}.pkl'
    print(f"Loading {file_path}...")
    chunk = pd.read_pickle(file_path)
    
    # Clean the chunk
    print(f"Cleaning chunk {i+1}...")
    cleaned_chunk = clean_data(chunk)
    
    # Save cleaned chunk
    cleaned_file = f'intermediate_data/cleaned_chunk_{i}.pkl'
    cleaned_chunk.to_pickle(cleaned_file)
    print(f"Saved cleaned chunk to {cleaned_file}")
    print_memory_usage()
    
    # Free memory
    del chunk, cleaned_chunk
    gc.collect()

print_memory_usage("After cleaning all chunks")

Loading intermediate_data/converted_chunk_0.pkl...
Cleaning chunk 1...
Saved cleaned chunk to intermediate_data/cleaned_chunk_0.pkl
 - Memory usage: 734.79 MB
Loading intermediate_data/converted_chunk_1.pkl...
Cleaning chunk 2...
Saved cleaned chunk to intermediate_data/cleaned_chunk_1.pkl
 - Memory usage: 726.70 MB
Loading intermediate_data/converted_chunk_2.pkl...
Cleaning chunk 3...
Saved cleaned chunk to intermediate_data/cleaned_chunk_2.pkl
 - Memory usage: 713.14 MB
Loading intermediate_data/converted_chunk_3.pkl...
Cleaning chunk 4...
Saved cleaned chunk to intermediate_data/cleaned_chunk_3.pkl
 - Memory usage: 718.69 MB
Loading intermediate_data/converted_chunk_4.pkl...
Cleaning chunk 5...
Saved cleaned chunk to intermediate_data/cleaned_chunk_4.pkl
 - Memory usage: 717.50 MB
Loading intermediate_data/converted_chunk_5.pkl...
Cleaning chunk 6...
Saved cleaned chunk to intermediate_data/cleaned_chunk_5.pkl
 - Memory usage: 714.77 MB
Loading intermediate_data/converted_chunk_6.pk

## 6. NLP Preprocessing

In [11]:
def preprocess_text_for_nlp(df):
    """Apply NLP preprocessing: tokenization, remove stopwords, lemmatization, stemming"""
    df_nlp = df.copy()
    
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    
    def process_text(text):
        if not isinstance(text, str) or not text:
            return [], [], []
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stopwords
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        
        # Lemmatize
        lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]
        
        # Stem
        stemmed = [stemmer.stem(word) for word in filtered_tokens]
        
        return filtered_tokens, lemmatized, stemmed
    
    # Process text and title
    if 'clean_text' in df_nlp.columns:
        print("Processing text column...")
        # Process in batches to avoid memory issues
        batch_size = 1000
        tokens_list = []
        lemmatized_list = []
        stemmed_list = []
        
        for i in range(0, len(df_nlp), batch_size):
            print(f"Processing batch {i//batch_size + 1}...")
            batch = df_nlp['clean_text'].iloc[i:i+batch_size]
            
            batch_results = [process_text(text) for text in batch]
            
            # Unpack results
            batch_tokens, batch_lemmatized, batch_stemmed = zip(*batch_results)
            
            tokens_list.extend(batch_tokens)
            lemmatized_list.extend(batch_lemmatized)
            stemmed_list.extend(batch_stemmed)
            
            # Print memory usage after each batch
            if (i//batch_size) % 10 == 0:
                print_memory_usage(f"After processing batch {i//batch_size + 1}")
        
        # Add results to dataframe
        df_nlp['tokens'] = tokens_list
        df_nlp['lemmatized'] = lemmatized_list
        df_nlp['stemmed'] = stemmed_list
    
    # Process title (simpler as titles are shorter)
    if 'clean_title' in df_nlp.columns:
        print("Processing title column...")
        title_results = [process_text(title) for title in df_nlp['clean_title']]
        title_tokens, title_lemmatized, title_stemmed = zip(*title_results)
        
        df_nlp['title_tokens'] = list(title_tokens)
        df_nlp['title_lemmatized'] = list(title_lemmatized)
        df_nlp['title_stemmed'] = list(title_stemmed)
    
    return df_nlp

# Process each cleaned chunk for NLP
num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('cleaned_chunk_')])

for i in range(num_chunks):
    # Load the cleaned chunk
    file_path = f'intermediate_data/cleaned_chunk_{i}.pkl'
    print(f"Loading {file_path}...")
    chunk = pd.read_pickle(file_path)
    
    # Preprocess for NLP
    print(f"Preprocessing chunk {i+1} for NLP...")
    nlp_chunk = preprocess_text_for_nlp(chunk)
    
    # Save NLP preprocessed chunk
    nlp_file = f'intermediate_data/nlp_chunk_{i}.pkl'
    nlp_chunk.to_pickle(nlp_file)
    print(f"Saved NLP preprocessed chunk to {nlp_file}")
    print_memory_usage()
    
    # Free memory
    del chunk, nlp_chunk
    gc.collect()

print_memory_usage("After NLP preprocessing all chunks")

Loading intermediate_data/cleaned_chunk_0.pkl...
Preprocessing chunk 1 for NLP...
Processing text column...
Processing batch 1...
After processing batch 1 - Memory usage: 800.41 MB
Processing batch 2...
Processing batch 3...
Processing batch 4...
Processing batch 5...
Processing batch 6...
Processing batch 7...
Processing batch 8...
Processing batch 9...
Processing batch 10...
Processing title column...
Saved NLP preprocessed chunk to intermediate_data/nlp_chunk_0.pkl
 - Memory usage: 1421.54 MB
Loading intermediate_data/cleaned_chunk_1.pkl...
Preprocessing chunk 2 for NLP...
Processing text column...
Processing batch 1...
After processing batch 1 - Memory usage: 900.59 MB
Processing batch 2...
Processing batch 3...
Processing batch 4...
Processing batch 5...
Processing batch 6...
Processing batch 7...
Processing batch 8...
Processing batch 9...
Processing batch 10...
Processing title column...
Saved NLP preprocessed chunk to intermediate_data/nlp_chunk_1.pkl
 - Memory usage: 1383.76 M

## 7. Exploratory Data Analysis

### 7.1 Word Count Analysis

In [None]:
# Calculate word count statistics across all chunks
def word_count_analysis():
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize stats
    title_lengths = []
    text_lengths = []
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Analyzing word counts in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Calculate lengths
        if 'tokens' in chunk.columns:
            chunk_text_lengths = chunk['tokens'].apply(len)
            text_lengths.extend(chunk_text_lengths)
        
        if 'title_tokens' in chunk.columns:
            chunk_title_lengths = chunk['title_tokens'].apply(len)
            title_lengths.extend(chunk_title_lengths)
        
        # Free memory
        del chunk
        gc.collect()
    
    # Convert to Series for analysis
    title_lengths = pd.Series(title_lengths)
    text_lengths = pd.Series(text_lengths)
    
    # Calculate statistics
    print("\nTitle word count statistics:")
    print(title_lengths.describe())
    
    print("\nArticle text word count statistics:")
    print(text_lengths.describe())
    
    # Create histograms
    plt.figure(figsize=(15, 6))
    
    plt.subplot(1, 2, 1)
    sns.histplot(title_lengths, kde=True)
    plt.title('Distribution of Title Word Counts')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    
    plt.subplot(1, 2, 2)
    sns.histplot(text_lengths.clip(upper=1000), kde=True)  # Clip to avoid extreme outliers
    plt.title('Distribution of Article Word Counts (capped at 1000)')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    
    plt.tight_layout()
    plt.savefig('word_count_distribution.png')
    plt.show()
    
    return title_lengths, text_lengths

title_lengths, text_lengths = word_count_analysis()

### 7.2 Most Common Words

In [None]:
def find_most_common_words(field='lemmatized', n=30):
    """Find most common words across all chunks for a given field"""
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize counter
    word_counter = Counter()
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Finding common words in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Count words
        if field in chunk.columns:
            # Flatten list of lists and count
            words = [word for word_list in chunk[field] for word in word_list if len(word) > 1]
            word_counter.update(words)
        
        # Free memory
        del chunk
        gc.collect()
    
    # Get most common words
    most_common = word_counter.most_common(n)
    
    # Plot results
    plt.figure(figsize=(12, 8))
    words, counts = zip(*most_common)
    sns.barplot(x=list(counts), y=list(words))
    plt.title(f'Top {n} Most Common Words')
    plt.xlabel('Frequency')
    plt.tight_layout()
    plt.savefig(f'most_common_{field}.png')
    plt.show()
    
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(dict(most_common))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud of Top Words')
    plt.savefig(f'wordcloud_{field}.png')
    plt.show()
    
    return most_common

# Find most common lemmatized words
most_common_words = find_most_common_words('lemmatized')

### 7.3 Most Common Tags

In [None]:
def find_most_common_tags(n=20):
    """Find most common tags across all chunks"""
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize counter
    tag_counter = Counter()
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Finding common tags in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Count tags
        if 'tags' in chunk.columns:
            # Flatten list of lists and count
            tags = [tag for tag_list in chunk['tags'] for tag in tag_list]
            tag_counter.update(tags)
        
        # Free memory
        del chunk
        gc.collect()
    
    # Get most common tags
    most_common = tag_counter.most_common(n)
    
    # Plot results
    plt.figure(figsize=(12, 8))
    tags, counts = zip(*most_common)
    sns.barplot(x=list(counts), y=list(tags))
    plt.title(f'Top {n} Most Common Tags')
    plt.xlabel('Frequency')
    plt.tight_layout()
    plt.savefig('most_common_tags.png')
    plt.show()
    
    return most_common

# Find most common tags
most_common_tags = find_most_common_tags()

### 7.4 Authors with Most Articles

In [None]:
def find_most_prolific_authors(n=20):
    """Find authors with the most articles across all chunks"""
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize counter
    author_counter = Counter()
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Analyzing authors in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Count authors
        if 'authors' in chunk.columns:
            # Flatten list of lists and count
            authors = [author for author_list in chunk['authors'] for author in author_list]
            author_counter.update(authors)
        
        # Free memory
        del chunk
        gc.collect()
    
    # Get most prolific authors
    most_prolific = author_counter.most_common(n)
    
    # Plot results
    plt.figure(figsize=(12, 10))
    authors, counts = zip(*most_prolific)
    sns.barplot(x=list(counts), y=list(authors))
    plt.title(f'Top {n} Most Prolific Authors')
    plt.xlabel('Number of Articles')
    plt.tight_layout()
    plt.savefig('most_prolific_authors.png')
    plt.show()
    
    return most_prolific

# Find most prolific authors
most_prolific_authors = find_most_prolific_authors()

### 7.5 Most Frequent N-grams

In [None]:
def generate_ngrams(tokens_list, n=2):
    """Generate n-grams from a list of tokens"""
    ngrams = []
    for tokens in tokens_list:
        if len(tokens) >= n:
            # Generate n-grams
            grams = [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
            ngrams.extend(grams)
    return ngrams

def find_most_common_ngrams(n_gram=2, top_n=20):
    """Find most common n-grams across all chunks"""
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize counter
    ngram_counter = Counter()
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Analyzing {n_gram}-grams in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Generate and count n-grams
        if 'lemmatized' in chunk.columns:
            chunk_ngrams = generate_ngrams(chunk['lemmatized'], n=n_gram)
            ngram_counter.update(chunk_ngrams)
        
        # Free memory
        del chunk
        gc.collect()
    
    # Get most common n-grams
    most_common = ngram_counter.most_common(top_n)
    
    # Plot results
    plt.figure(figsize=(12, 8))
    ngrams, counts = zip(*most_common)
    sns.barplot(x=list(counts), y=list(ngrams))
    plt.title(f'Top {top_n} Most Common {n_gram}-grams')
    plt.xlabel('Frequency')
    plt.tight_layout()
    plt.savefig(f'most_common_{n_gram}grams.png')
    plt.show()
    
    return most_common

# Find most common bi-grams and tri-grams
most_common_bigrams = find_most_common_ngrams(n_gram=2)
most_common_trigrams = find_most_common_ngrams(n_gram=3)

### 7.6 Publication Trends Over Time

In [None]:
def analyze_publication_trends():
    """Analyze publication trends over time"""
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Initialize lists to store timestamps
    timestamps = []
    
    for i in range(num_chunks):
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Analyzing timestamps in {file_path}...")
        chunk = pd.read_pickle(file_path)
        
        # Extract timestamps
        if 'timestamp' in chunk.columns:
            timestamps.extend(chunk['timestamp'].dropna())
        
        # Free memory
        del chunk
        gc.collect()
    
    # Convert to DataFrame for easier analysis
    timestamps_df = pd.DataFrame({'timestamp': timestamps})
    
    # Extract date components
    timestamps_df['year'] = timestamps_df['timestamp'].dt.year
    timestamps_df['month'] = timestamps_df['timestamp'].dt.month
    timestamps_df['day'] = timestamps_df['timestamp'].dt.day
    timestamps_df['hour'] = timestamps_df['timestamp'].dt.hour
    timestamps_df['weekday'] = timestamps_df['timestamp'].dt.weekday
    
    # Create year-month column for trend analysis
    timestamps_df['year_month'] = timestamps_df['timestamp'].dt.to_period('M')
    
    # Monthly publication counts
    monthly_counts = timestamps_df['year_month'].value_counts().sort_index()
    monthly_counts = monthly_counts.reset_index()
    monthly_counts.columns = ['Month', 'Count']
    
    # Plot monthly trends
    plt.figure(figsize=(15, 6))
    plt.plot(monthly_counts['Month'].astype(str), monthly_counts['Count'])
    plt.title('Monthly Publication Trends')
    plt.xlabel('Month')
    plt.ylabel('Number of Articles')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('monthly_publication_trends.png')
    plt.show()
    
    # Publication by day of week
    weekday_counts = timestamps_df['weekday'].value_counts().sort_index()
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    plt.figure(figsize=(10, 6))
    plt.bar(weekday_names, weekday_counts)
    plt.title('Publication by Day of Week')
    plt.xlabel('Day of Week')
    plt.ylabel('Number of Articles')
    plt.tight_layout()
    plt.savefig('weekday_publication_trends.png')
    plt.show()
    
    # Publication by hour of day
    hour_counts = timestamps_df['hour'].value_counts().sort_index()
    
    plt.figure(figsize=(12, 6))
    plt.bar(hour_counts.index, hour_counts.values)
    plt.title('Publication by Hour of Day')
    plt.xlabel('Hour (24-hour format)')
    plt.ylabel('Number of Articles')
    plt.xticks(range(0, 24))
    plt.tight_layout()
    plt.savefig('hourly_publication_trends.png')
    plt.show()
    
    return timestamps_df

# Analyze publication trends
time_analysis = analyze_publication_trends()

## 8. Topic Modeling

In [None]:
# This section requires additional memory and processing power
# Uncomment and run if your system can handle it


# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def perform_topic_modeling(n_topics=10, n_top_words=15):
    """Perform topic modeling using LDA"""
    # This requires loading all data into memory
    # Consider using a subset if memory is limited
    
    # Concatenate all lemmatized tokens into documents
    documents = []
    
    num_chunks = len([f for f in os.listdir('intermediate_data') if f.startswith('nlp_chunk_')])
    
    # Use a subset if data is too large
    max_docs = 10000  # Adjust based on memory constraints
    doc_count = 0
    
    for i in range(num_chunks):
        if doc_count >= max_docs:
            break
            
        # Load chunk
        file_path = f'intermediate_data/nlp_chunk_{i}.pkl'
        print(f"Loading {file_path} for topic modeling...")
        chunk = pd.read_pickle(file_path)
        
        # Convert lemmatized tokens to documents
        if 'lemmatized' in chunk.columns:
            chunk_docs = [' '.join(tokens) for tokens in chunk['lemmatized']]
            documents.extend(chunk_docs[:max_docs-doc_count])
            doc_count += len(chunk_docs[:max_docs-doc_count])
        
        # Free memory
        del chunk
        gc.collect()
    
    print(f"Performing topic modeling on {len(documents)} documents...")
    
    # Create document-term matrix
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=10000)
    dtm = vectorizer.fit_transform(documents)
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Fit LDA model
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        max_iter=5,  # Reduce for memory constraints
        n_jobs=-1
    )
    
    lda.fit(dtm)
    
    # Print topics
    for topic_idx, topic in enumerate(lda.components_):
        print(f"Topic #{topic_idx+1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))
        print()
    
    return lda, vectorizer, feature_names

# Perform topic modeling
lda_model, vectorizer, feature_names = perform_topic_modeling()

## 9. Save Final Results

In [None]:
# Save analysis results
results = {
    'most_common_words': most_common_words,
    'most_common_tags': most_common_tags,
    'most_prolific_authors': most_prolific_authors,
    'most_common_bigrams': most_common_bigrams,
    'most_common_trigrams': most_common_trigrams
}

# Save results as pickle
with open('analysis_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print("Analysis results saved to 'analysis_results.pkl'")

## 10. Summary

In [None]:
print("NLP Analysis of Medium Articles - Summary")
print("==========================================\n")

# Load results
with open('analysis_results.pkl', 'rb') as f:
    results = pickle.load(f)

print("Top 10 Most Common Words:")
for word, count in results['most_common_words'][:10]:
    print(f"  {word}: {count}")
print()

print("Top 10 Most Common Tags:")
for tag, count in results['most_common_tags'][:10]:
    print(f"  {tag}: {count}")
print()

print("Top 10 Most Prolific Authors:")
for author, count in results['most_prolific_authors'][:10]:
    print(f"  {author}: {count} articles")
print()

print("Top 10 Most Common Bigrams:")
for bigram, count in results['most_common_bigrams'][:10]:
    print(f"  {bigram}: {count}")
print()

print("Top 10 Most Common Trigrams:")
for trigram, count in results['most_common_trigrams'][:10]:
    print(f"  {trigram}: {count}")
print()

print("Analysis completed successfully!")

In [12]:
# Directory where your intermediate chunks are stored
import os
intermediate_folder = 'intermediate_data'

# Identify all NLP processed chunk files
chunk_files = sorted([
    f for f in os.listdir(intermediate_folder)
    if f.startswith('nlp_chunk_') and f.endswith('.pkl')
])

print(f"Found {len(chunk_files)} NLP chunks to merge.")

# Initialize list to collect DataFrames
all_chunks = []

# Load and append each chunk
for i, file in enumerate(chunk_files):
    file_path = os.path.join(intermediate_folder, file)
    print(f"Loading {file_path}...")

    chunk_df = pd.read_pickle(file_path)
    chunk_df.drop(columns=['title', 'text', 'tokens', 'title_tokens', 'title_lemmatized','lemmatized'], inplace=True)
    all_chunks.append(chunk_df)

    # Clear memory from last chunk
    del chunk_df
    gc.collect()

# Concatenate all chunks
final_df = pd.concat(all_chunks, ignore_index=True)
print(f"Final merged DataFrame shape: {final_df.shape}")

# Optional: drop duplicate articles if needed
# final_df.drop_duplicates(subset=['title', 'url'], inplace=True)

# Save the final dataset as a pickle (fast & preserves Python objects)
final_df.to_pickle('final_nlp_data.pkl')
print("Saved final merged data to 'final_nlp_data.pkl'.")

# Optionally also save as CSV (if you want to inspect or use outside Python)
final_df.to_csv('final_nlp_data.csv', index=False)

Found 20 NLP chunks to merge.
Loading intermediate_data\nlp_chunk_0.pkl...
Loading intermediate_data\nlp_chunk_1.pkl...
Loading intermediate_data\nlp_chunk_10.pkl...
Loading intermediate_data\nlp_chunk_11.pkl...
Loading intermediate_data\nlp_chunk_12.pkl...
Loading intermediate_data\nlp_chunk_13.pkl...
Loading intermediate_data\nlp_chunk_14.pkl...
Loading intermediate_data\nlp_chunk_15.pkl...
Loading intermediate_data\nlp_chunk_16.pkl...
Loading intermediate_data\nlp_chunk_17.pkl...
Loading intermediate_data\nlp_chunk_18.pkl...
Loading intermediate_data\nlp_chunk_19.pkl...
Loading intermediate_data\nlp_chunk_2.pkl...
Loading intermediate_data\nlp_chunk_3.pkl...
Loading intermediate_data\nlp_chunk_4.pkl...
Loading intermediate_data\nlp_chunk_5.pkl...
Loading intermediate_data\nlp_chunk_6.pkl...
Loading intermediate_data\nlp_chunk_7.pkl...
Loading intermediate_data\nlp_chunk_8.pkl...
Loading intermediate_data\nlp_chunk_9.pkl...
Final merged DataFrame shape: (192363, 8)
Saved final merged

## 11. Cleanup

In [None]:
# Uncomment to clean up intermediate files
'''
import shutil

def cleanup_intermediate_files():
    """Remove intermediate files to free up disk space"""
    print("Cleaning up intermediate files...")
    if os.path.exists('intermediate_data'):
        shutil.rmtree('intermediate_data')
        print("Intermediate files removed.")

# cleanup_intermediate_files()
'''