# Data Quality: IS 477 Final Project


Sarah Wheeler & Megan Sia

## References:

Google. (2025). Google Colaboratory. Retrieved December 7, 2025, from https://colab.research.google.com/

Mathieu, E., Ritchie, H., Ortiz-Ospina, E. et al. A global database of COVID-19 vaccinations. Natural Humanities Behavior (2024). https://doi.org/10.1038/s41562-021-01122-8

Thakur, N. (2024). Five Years of COVID-19 Discourse on Instagram: A Labeled Instagram Dataset of Over Half a Million Posts for Multilingual Sentiment Analysis [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13896353

Thakur N., “Five Years of COVID-19 Discourse on Instagram: A Labeled Instagram Dataset of Over Half a Million Posts for Multilingual Sentiment Analysis”, Proceedings of the 7th International Conference on Machine Learning and Natural Language Processing (MLNLP 2024), Chengdu, China, October 18-20, 2024 (Paper accepted for publication, Preprint available at: https://arxiv.org/abs/2410.03293)

In [1]:

import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive') #We uploaded the datasets (find in the Box foler linked in our GitHub repo) to our drive, and accessed them from there.

Mounted at /content/drive


In [15]:
insta_df = pd.read_excel("/content/drive/My Drive/Dataset.xlsx") # (Thakur, 2025)
vaccine_df = pd.read_csv("/content/drive/My Drive/owid-covid-data.csv") # (Mathieu et al., 2024)
merged_df = pd.read_csv("/content/drive/My Drive/merged_df.csv")
global_daily = pd.read_csv("/content/drive/My Drive/global_daily.csv")

In [6]:
#Uncomment if running in an IDE other than Colab
#from pathlib import Path

#DATA_DIR = Path("data")

#insta_df = pd.read_excel(DATA_DIR / "Dataset.xlsx")
#vaccine_df = pd.read_csv(DATA_DIR / "owid-covid-data.csv")

In [8]:
#merged_df = pd.read_csv(DATA_DIR / "merged_df.csv")
#global_daily = pd.read_csv(DATA_DIR / "global_daily.csv")
#To find the merged_df.csv and global_daily.csv, go to the output_data folder and download "merged_df.csv" Place these files in the "data" subfolder

In [16]:
merged_df["Sentiment"].value_counts(normalize = True)

Unnamed: 0_level_0,proportion
Sentiment,Unnamed: 1_level_1
neutral,0.434008
positive,0.398245
negative,0.167746


In [17]:
merged_df["Full Language"].value_counts(normalize = True)

Unnamed: 0_level_0,proportion
Full Language,Unnamed: 1_level_1
English,0.749056
Hindi,0.036351
Spanish,0.036005
Tamil,0.024891
German,0.017557
...,...
Divehi,0.000003
Sundanese,0.000003
Kazakh,0.000003
Turkmen,0.000003


In [18]:
merged_df["new_vaccinations"].describe()

Unnamed: 0,new_vaccinations
count,363507.0
mean,12456450.0
std,13600170.0
min,0.0
25%,15687.0
50%,5524114.0
75%,23989740.0
max,47114180.0


In [19]:
daily_sentiment_counts = merged_df.groupby(['Date', 'Sentiment']).size().unstack(fill_value=0)
daily_sentiment_counts['total_tweets'] = daily_sentiment_counts.sum(axis=1)
daily_sentiment_counts['positive_proportion'] = daily_sentiment_counts['positive'] / daily_sentiment_counts['total_tweets']

daily_sentiment_proportions = daily_sentiment_counts.reset_index()

display(daily_sentiment_proportions.head())

Sentiment,Date,negative,neutral,positive,total_tweets,positive_proportion
0,2020-01-21,0,0,1,1,1.0
1,2020-01-22,6,0,1,7,0.142857
2,2020-01-23,2,1,1,4,0.25
3,2020-01-24,1,2,2,5,0.4
4,2020-01-25,7,11,5,23,0.217391


In [20]:
comparison_df = pd.merge(daily_sentiment_proportions, global_daily, left_on='Date', right_on='date', how='inner')
comparison_df = comparison_df[['Date', 'positive_proportion', 'new_vaccinations']]

display(comparison_df.head())

Unnamed: 0,Date,positive_proportion,new_vaccinations
0,2020-01-21,1.0,0.0
1,2020-01-22,0.142857,0.0
2,2020-01-23,0.25,0.0
3,2020-01-24,0.4,0.0
4,2020-01-25,0.217391,0.0


In [21]:
df = vaccine_df.copy()

num_cols = ['total_vaccinations','people_vaccinated','total_boosters','new_vaccinations']
for c in num_cols:
    if c in df:
        df[c] = pd.to_numeric(df[c], errors='coerce')

if 'location' in df:
    bad_groups = {
        'World','Africa','Asia','Europe','European Union','High income','Upper middle income',
        'Lower middle income','Low income','North America','South America','Oceania'
    }
    df = df[~df['location'].isin(bad_groups)]

display(df.head())
display(df.info())

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


<class 'pandas.core.frame.DataFrame'>
Index: 346839 entries, 0 to 364169
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    346839 non-null  object 
 1   continent                                   346839 non-null  object 
 2   location                                    346839 non-null  object 
 3   date                                        346839 non-null  object 
 4   total_cases                                 308910 non-null  float64
 5   new_cases                                   337140 non-null  float64
 6   new_cases_smoothed                          335941 non-null  float64
 7   total_deaths                                287386 non-null  float64
 8   new_deaths                                  337191 non-null  float64
 9   new_deaths_smoothed                         336021 non-null  float64
 10  t

None

In [22]:
timestamp_col = 'report_timestamp'
if timestamp_col in df.columns:
    df = (df.sort_values(['location','date',timestamp_col])
            .drop_duplicates(['location','date'], keep='last'))
else:
    df = (df.groupby(['location','date'], as_index=False)
            .agg({
                'total_vaccinations':'max',
                'people_vaccinated':'max',
                'total_boosters':'max',
                'new_vaccinations':'max'
            }))

for c in ['total_vaccinations','people_vaccinated','total_boosters']:
    if c in df:
        df[c] = df.groupby('location')[c].cummax()

df = df.sort_values(['location','date'])
df['new_vacc_from_total'] = (
    df.groupby('location')['total_vaccinations'].diff().clip(lower=0)
)

if 'total_boosters' in df:
    df['new_boosters_from_total'] = (
        df.groupby('location')['total_boosters'].diff().clip(lower=0)
    )

display(df.head())
display(df.info())

Unnamed: 0,location,date,total_vaccinations,people_vaccinated,total_boosters,new_vaccinations,new_vacc_from_total,new_boosters_from_total
0,Afghanistan,2020-01-03,,,,,,
1,Afghanistan,2020-01-04,,,,,,
2,Afghanistan,2020-01-05,,,,,,
3,Afghanistan,2020-01-06,,,,,,
4,Afghanistan,2020-01-07,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346839 entries, 0 to 346838
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   location                 346839 non-null  object 
 1   date                     346839 non-null  object 
 2   total_vaccinations       67684 non-null   float64
 3   people_vaccinated        64230 non-null   float64
 4   total_boosters           37010 non-null   float64
 5   new_vaccinations         53683 non-null   float64
 6   new_vacc_from_total      53683 non-null   float64
 7   new_boosters_from_total  30665 non-null   float64
dtypes: float64(6), object(2)
memory usage: 21.2+ MB


None

In [23]:
global_daily = (
    df.groupby('date', as_index=False)
      .agg(
          total_vaccinations=('total_vaccinations','sum'),
          people_vaccinated=('people_vaccinated','sum'),
          total_boosters=('total_boosters','sum'),
          new_vaccinations=('new_vacc_from_total','sum')
      )
)

display(global_daily.head())
display(global_daily.info())
display(global_daily.describe())

Unnamed: 0,date,total_vaccinations,people_vaccinated,total_boosters,new_vaccinations
0,2020-01-01,0.0,0.0,0.0,0.0
1,2020-01-02,0.0,0.0,0.0,0.0
2,2020-01-03,0.0,0.0,0.0,0.0
3,2020-01-04,0.0,0.0,0.0,0.0
4,2020-01-05,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                1449 non-null   object 
 1   total_vaccinations  1449 non-null   float64
 2   people_vaccinated   1449 non-null   float64
 3   total_boosters      1449 non-null   float64
 4   new_vaccinations    1449 non-null   float64
dtypes: float64(4), object(1)
memory usage: 56.7+ KB


None

Unnamed: 0,total_vaccinations,people_vaccinated,total_boosters,new_vaccinations
count,1449.0,1449.0,1449.0,1449.0
mean,4107612000.0,1402494000.0,406254300.0,7489611.0
std,3668006000.0,1142931000.0,451129900.0,11352180.0
min,0.0,0.0,0.0,0.0
25%,3668165.0,3588923.0,0.0,3469.0
50%,3292783000.0,1400179000.0,285474400.0,1298606.0
75%,8249514000.0,2375760000.0,844405700.0,10705490.0
max,10381190000.0,4396591000.0,1930041000.0,47114180.0


In [24]:
daily_sentiment_counts = merged_df.groupby(['Date', 'Sentiment']).size().unstack(fill_value=0)
daily_sentiment_counts['total_tweets'] = daily_sentiment_counts.sum(axis=1)
daily_sentiment_counts['positive_proportion'] = daily_sentiment_counts['positive'] / daily_sentiment_counts['total_tweets']

daily_sentiment_proportions = daily_sentiment_counts.reset_index()

display(daily_sentiment_proportions.head())
display(daily_sentiment_proportions.info())

Sentiment,Date,negative,neutral,positive,total_tweets,positive_proportion
0,2020-01-21,0,0,1,1,1.0
1,2020-01-22,6,0,1,7,0.142857
2,2020-01-23,2,1,1,4,0.25
3,2020-01-24,1,2,2,5,0.4
4,2020-01-25,7,11,5,23,0.217391


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 1428 non-null   object 
 1   negative             1428 non-null   int64  
 2   neutral              1428 non-null   int64  
 3   positive             1428 non-null   int64  
 4   total_tweets         1428 non-null   int64  
 5   positive_proportion  1428 non-null   float64
dtypes: float64(1), int64(4), object(1)
memory usage: 67.1+ KB


None

In [25]:
display(global_daily.head())
display(global_daily.tail())
display(global_daily.info())

Unnamed: 0,date,total_vaccinations,people_vaccinated,total_boosters,new_vaccinations
0,2020-01-01,0.0,0.0,0.0,0.0
1,2020-01-02,0.0,0.0,0.0,0.0
2,2020-01-03,0.0,0.0,0.0,0.0
3,2020-01-04,0.0,0.0,0.0,0.0
4,2020-01-05,0.0,0.0,0.0,0.0


Unnamed: 0,date,total_vaccinations,people_vaccinated,total_boosters,new_vaccinations
1444,2023-12-15,2310995000.0,1065453000.0,252736740.0,4047.0
1445,2023-12-16,2310995000.0,1065453000.0,252737162.0,498.0
1446,2023-12-17,2279397000.0,1055558000.0,244519171.0,162.0
1447,2023-12-18,2284120000.0,1055558000.0,244519251.0,102.0
1448,2023-12-19,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                1449 non-null   object 
 1   total_vaccinations  1449 non-null   float64
 2   people_vaccinated   1449 non-null   float64
 3   total_boosters      1449 non-null   float64
 4   new_vaccinations    1449 non-null   float64
dtypes: float64(4), object(1)
memory usage: 56.7+ KB


None

In [26]:
display(daily_sentiment_proportions.head())
display(daily_sentiment_proportions.tail())
display(daily_sentiment_proportions.info())

Sentiment,Date,negative,neutral,positive,total_tweets,positive_proportion
0,2020-01-21,0,0,1,1,1.0
1,2020-01-22,6,0,1,7,0.142857
2,2020-01-23,2,1,1,4,0.25
3,2020-01-24,1,2,2,5,0.4
4,2020-01-25,7,11,5,23,0.217391


Sentiment,Date,negative,neutral,positive,total_tweets,positive_proportion
1423,2023-12-14,47,206,69,322,0.214286
1424,2023-12-15,45,172,221,438,0.504566
1425,2023-12-16,50,122,76,248,0.306452
1426,2023-12-17,44,101,57,202,0.282178
1427,2023-12-18,71,184,223,478,0.466527


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 1428 non-null   object 
 1   negative             1428 non-null   int64  
 2   neutral              1428 non-null   int64  
 3   positive             1428 non-null   int64  
 4   total_tweets         1428 non-null   int64  
 5   positive_proportion  1428 non-null   float64
dtypes: float64(1), int64(4), object(1)
memory usage: 67.1+ KB


None

In [27]:
display(comparison_df.head())
display(comparison_df.tail())
display(comparison_df.info())

Unnamed: 0,Date,positive_proportion,new_vaccinations
0,2020-01-21,1.0,0.0
1,2020-01-22,0.142857,0.0
2,2020-01-23,0.25,0.0
3,2020-01-24,0.4,0.0
4,2020-01-25,0.217391,0.0


Unnamed: 0,Date,positive_proportion,new_vaccinations
1423,2023-12-14,0.214286,6181.0
1424,2023-12-15,0.504566,4047.0
1425,2023-12-16,0.306452,498.0
1426,2023-12-17,0.282178,162.0
1427,2023-12-18,0.466527,102.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 1428 non-null   object 
 1   positive_proportion  1428 non-null   float64
 2   new_vaccinations     1428 non-null   float64
dtypes: float64(2), object(1)
memory usage: 33.6+ KB


None

In [28]:
insta_df['Date'] = pd.to_datetime(insta_df['Date'], format='%m/%d/%Y')
global_daily['date'] = pd.to_datetime(global_daily['date'])
display(insta_df.info())
display(global_daily.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500153 entries, 0 to 500152
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   PostID            500153 non-null  object        
 1   Post Description  500153 non-null  object        
 2   Date              500153 non-null  datetime64[ns]
 3   Language Code     500153 non-null  object        
 4   Full Language     500153 non-null  object        
 5   Sentiment         500153 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 22.9+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                1449 non-null   datetime64[ns]
 1   total_vaccinations  1449 non-null   float64       
 2   people_vaccinated   1449 non-null   float64       
 3   total_boosters      1449 non-null   float64       
 4   new_vaccinations    1449 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 56.7 KB


None

In [29]:
global_daily['cumulative_new_vaccinations'] = global_daily['new_vaccinations'].cumsum()

display(global_daily[['date', 'total_vaccinations', 'new_vaccinations', 'cumulative_new_vaccinations']].head())
display(global_daily[['date', 'total_vaccinations', 'new_vaccinations', 'cumulative_new_vaccinations']].tail())

Unnamed: 0,date,total_vaccinations,new_vaccinations,cumulative_new_vaccinations
0,2020-01-01,0.0,0.0,0.0
1,2020-01-02,0.0,0.0,0.0
2,2020-01-03,0.0,0.0,0.0
3,2020-01-04,0.0,0.0,0.0
4,2020-01-05,0.0,0.0,0.0


Unnamed: 0,date,total_vaccinations,new_vaccinations,cumulative_new_vaccinations
1444,2023-12-15,2310995000.0,4047.0,10852450000.0
1445,2023-12-16,2310995000.0,498.0,10852450000.0
1446,2023-12-17,2279397000.0,162.0,10852450000.0
1447,2023-12-18,2284120000.0,102.0,10852450000.0
1448,2023-12-19,0.0,0.0,10852450000.0
