# 🧠 CORD-19 Research Metadata Analysis
Author: **Santos Paul Amiani**

This notebook explores and visualizes metadata from the CORD-19 dataset.

In [None]:

# --- PART 1: Import Libraries and Load Data ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Read the raw CORD-19 metadata table (path is relative to the working directory).
metadata_path = 'metadata.csv'
df = pd.read_csv(metadata_path)

print("✅ Dataset Loaded Successfully!")
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

# Last expression of the cell: rich preview of the first rows.
df.head()


In [None]:

# --- PART 2: Basic Exploration ---
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

# Per-column count of missing values, largest first.
print("\nMissing Values (Top 15 Columns):")
print(df.isnull().sum().sort_values(ascending=False).head(15))

# describe() with default arguments summarises the numeric columns.
print("\nSummary Statistics:")
print(df.describe())


In [None]:

# --- PART 3: Data Cleaning and Preparation ---
# Keep only rows that have both a title and a publication date. The explicit
# .copy() makes df_clean an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df_clean = df.dropna(subset=['title', 'publish_time']).copy()

# errors='coerce' turns unparseable date strings into NaT instead of raising;
# 'year' is NaN for any such row (which makes the column float).
df_clean['publish_time'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')
df_clean['year'] = df_clean['publish_time'].dt.year

# Whitespace-delimited word count per abstract; a missing abstract counts as 0.
df_clean['abstract_word_count'] = df_clean['abstract'].fillna('').apply(lambda x: len(x.split()))

print("✅ Cleaned Data Summary:")
# info() prints its own report and returns None; wrapping it in print()
# would emit an extra "None" line.
df_clean.info()


In [None]:

# --- PART 4: Publications by Year ---
# 'year' can be float (NaN for rows whose publish_time failed to parse),
# which would label the bars "2019.0", "2020.0", ... Drop the NaN entries
# and cast to int so the axis shows plain years.
year_counts = (
    df_clean['year']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()  # chronological order on the x-axis
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=year_counts.index, y=year_counts.values, ax=ax)
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Publications')
# Tilt the year labels so neighbouring ticks do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:

# --- PART 5: Top Journals Publishing COVID-19 Research ---
# Ten most frequent values of the 'journal' column (value_counts sorts
# descending and ignores missing entries).
top_journals = df_clean['journal'].value_counts().head(10)

# Horizontal bars: journal names on the y-axis, paper counts on the x-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index, ax=ax)
ax.set(
    title='Top 10 Journals Publishing COVID-19 Research',
    xlabel='Number of Publications',
    ylabel='Journal',
)
plt.show()


In [None]:

# --- PART 6: Word Cloud of Paper Titles ---
# Concatenate every non-missing title into one space-separated string.
titles = df_clean['title'].dropna()
text = ' '.join(titles)

# generate() returns the WordCloud instance itself, so `wordcloud` is the
# fitted cloud object that imshow renders below.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the cloud is an image; axes and ticks carry no meaning
ax.set_title('Most Frequent Words in Paper Titles')
plt.show()


In [None]:

# --- PART 7: Paper Counts by Source ---
# Tally the 'source_x' column and keep the ten largest groups.
source_counts = df_clean['source_x'].value_counts().head(10)

# Source labels run down the y-axis; bar length is the number of papers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=source_counts.values, y=source_counts.index, ax=ax)
ax.set_title('Top 10 Sources of Papers')
ax.set_xlabel('Number of Papers')
ax.set_ylabel('Source')
plt.show()


In [None]:

# --- PART 8: Save Cleaned Dataset ---
# Write the cleaned frame to CSV (path relative to the working directory);
# index=False leaves the row index out of the file.
output_path = 'metadata_cleaned.csv'
df_clean.to_csv(output_path, index=False)
print("✅ Cleaned dataset saved as 'metadata_cleaned.csv'")



## 🧩 Reflections

- Publications increased sharply from 2020 onward, driven by the global COVID-19 response.
- Preprint servers such as *medRxiv* and *bioRxiv* contributed heavily alongside traditional journals.
- Frequent keywords include 'COVID-19', 'SARS-CoV-2', and 'infection'.  
- Handling missing data and cleaning publication dates were key challenges.  
- This project strengthened my understanding of data cleaning, visualization, and storytelling.
