<a href="https://colab.research.google.com/github/slp22/data-engineering-project/blob/main/engineering_monkeypox_mvp_sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Data Engineering | MVP

# Monkeypox Tweets

## Imports

In [1]:
import json
import logging
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import os, shutil, itertools
import pandas as pd
import pathlib as Path
import pickle
import PIL
import random
import seaborn as sns
import sklearn as sk
import warnings
import zipfile

from sqlite3 import connect


## 1 | Research Design


* **Research Question:** How well can a neural network diagnose diabetic retinopathy from a retinal image?
* **Impact Hypothesis:** Accelerate the National Eye Institute’s research evaluation of retinal clinical trial data, and streamline publishing results.
* **Data source:** [Diabetic Retinopathy 2015 Data Colored Resized](https://www.kaggle.com/datasets/sovitrath/diabetic-retinopathy-2015-data-colored-resized) , n=35,126
* **Error metric:** Accuracy

* **Data Dictionary:**
  * Classes = 5 stages of diabetic retinopathy:
    * **Normal eye**
    * **Mild** Nonproliferative Retinopathy: Microaneurysms are visible, small areas of balloon-like swelling in the retina's tiny blood vessels.
    * **Moderate** Nonproliferative Retinopathy: Some blood vessels that nourish the retina are blocked.
    * **Severe** Nonproliferative Retinopathy: More blocked blood vessels, depriving several areas of the retina of blood supply; retina sends signals to the body to grow new blood vessels for nourishment.
    * **Proliferative** Retinopathy: Advanced stage; new blood vessels are abnormal and fragile; grow along the retina and along the surface of the clear, vitreous gel that fills the inside of the eye.


## 2 | Dataset: [Monkeypox Tweets](https://www.kaggle.com/datasets/aneeshtickoo/tweets-on-monkeypox)

### Data Download

In [2]:
# Mount Google Drive at /content/drive.
# NOTE(review): `google.colab` is presumably only importable when running on Colab — confirm
# before running this notebook in another environment.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Work from /content so the relative paths used in later cells
# (e.g. 'kaggle.json', 'tweets-on-monkeypox.zip') resolve there,
# then show the directory contents as the cell output.
os.chdir('/content')
os.listdir()

['.config',
 'default.db',
 'monkeypox.csv',
 '.ipynb_checkpoints',
 ':memory',
 'tweets-on-monkeypox.zip',
 'drive',
 'tweets.pkl',
 'kaggle.json',
 'sample_data']

In [4]:
# Parse kaggle.json (resolved against the current working directory) into a Python object.
# NOTE(review): `kaggle` is not referenced by any later cell visible here, and the file
# presumably holds API credentials — avoid displaying the variable in cell output.
with open('kaggle.json') as json_file:
    kaggle = json.load(json_file)

In [5]:
# Set KAGGLE_CONFIG_DIR to /content, the directory that contains kaggle.json
# (presumably so the kaggle CLI invoked below picks up the credentials from there).
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [6]:
# download dataset from kaggle
'chmod 600 /content/kaggle.json'
! kaggle datasets download -d aneeshtickoo/tweets-on-monkeypox

tweets-on-monkeypox.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
# unzip kaggle file
# The context manager closes the archive even if extraction raises part-way.
with zipfile.ZipFile('tweets-on-monkeypox.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

## 3 | Exploratory Data Analysis

In [8]:
df = pd.read_csv('/content/monkeypox.csv')

In [9]:
df.head(2)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1555815462201872385,1555815462201872385,2022-08-06 12:48:06 India Standard Time,2022-08-06,12:48:06,530,820113517613154304,thetenth2022,TheTenth,,...,,,,,,[],,,,
1,1555815458602831872,1555815458602831872,2022-08-06 12:48:05 India Standard Time,2022-08-06,12:48:05,530,196518052,ashemedai,Jeroen Ruigrok van der Werven,,...,,,,,,[],,,,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6859 entries, 0 to 6858
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               6859 non-null   int64  
 1   conversation_id  6859 non-null   int64  
 2   created_at       6859 non-null   object 
 3   date             6859 non-null   object 
 4   time             6859 non-null   object 
 5   timezone         6859 non-null   int64  
 6   user_id          6859 non-null   int64  
 7   username         6859 non-null   object 
 8   name             6859 non-null   object 
 9   place            2 non-null      object 
 10  tweet            6859 non-null   object 
 11  language         6859 non-null   object 
 12  mentions         6859 non-null   object 
 13  urls             6859 non-null   object 
 14  photos           6859 non-null   object 
 15  replies_count    6859 non-null   int64  
 16  retweets_count   6859 non-null   int64  
 17  likes_count   

In [11]:
df.describe()

Unnamed: 0,id,conversation_id,timezone,user_id,replies_count,retweets_count,likes_count,video,near,geo,source,user_rt_id,user_rt,retweet_id,retweet_date,translate,trans_src,trans_dest
count,6859.0,6859.0,6859.0,6859.0,6859.0,6859.0,6859.0,6859.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1.555764e+18,1.555192e+18,530.0,7.224884e+17,0.399038,0.691938,3.053944,0.095932,,,,,,,,,,
std,25414980000000.0,1.168003e+16,0.0,6.620987e+17,1.789475,11.879167,40.567536,0.29452,,,,,,,,,,
min,1.555725e+18,8.510359e+17,530.0,39893.0,0.0,0.0,0.0,0.0,,,,,,,,,,
25%,1.555742e+18,1.55572e+18,530.0,490658200.0,0.0,0.0,0.0,0.0,,,,,,,,,,
50%,1.55576e+18,1.555746e+18,530.0,8.977771e+17,0.0,0.0,0.0,0.0,,,,,,,,,,
75%,1.555784e+18,1.555774e+18,530.0,1.395422e+18,0.0,0.0,1.0,0.0,,,,,,,,,,
max,1.555815e+18,1.555815e+18,530.0,1.555798e+18,68.0,740.0,2180.0,1.0,,,,,,,,,,


In [12]:
# Collect the column labels of df into a plain Python list and display it
cols_list = df.columns.tolist()
cols_list

['id',
 'conversation_id',
 'created_at',
 'date',
 'time',
 'timezone',
 'user_id',
 'username',
 'name',
 'place',
 'tweet',
 'language',
 'mentions',
 'urls',
 'photos',
 'replies_count',
 'retweets_count',
 'likes_count',
 'hashtags',
 'cashtags',
 'link',
 'retweet',
 'quote_url',
 'video',
 'thumbnail',
 'near',
 'geo',
 'source',
 'user_rt_id',
 'user_rt',
 'retweet_id',
 'reply_to',
 'retweet_date',
 'translate',
 'trans_src',
 'trans_dest']

In [13]:
df['tweet']

0       So has anyone begun to compile 'here's the sta...
1       Getting some groceries, topic shifted to covid...
2       "Illinois Children's Daycare Worker Tests Posi...
3       Illinois daycare worker tests positive for mon...
4       @Natrone86 @JunotIsrael @elcavaqueen @thechicc...
                              ...                        
6854    @hurtmeknots With Monkeypox on the rise, maybe...
6855    @FITNESSSF can you start filling up the disinf...
6856       @politvidchannel 71% already have monkey pox .
6857                            RT if you have #monkeypox
6858    @POTUS @VP 🥰 Dems 🇺🇸💙 are having the best jobs...
Name: tweet, Length: 6859, dtype: object

* RangeIndex: 6859 entries, 0 to 6858
* Data columns (total 36 columns)

### Select corpus: `tweet_df`

In [14]:
tweet_df = df[['language','date','username','hashtags','tweet']]
tweet_df.head(2)

Unnamed: 0,language,date,username,hashtags,tweet
0,en,2022-08-06,thetenth2022,[],So has anyone begun to compile 'here's the sta...
1,en,2022-08-06,ashemedai,[],"Getting some groceries, topic shifted to covid..."


In [33]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6410 entries, 0 to 6858
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      6410 non-null   object
 1   username  6410 non-null   object
 2   hashtags  6410 non-null   object
 3   tweet     6410 non-null   object
dtypes: object(4)
memory usage: 250.4+ KB


In [34]:
tweet_df['username'].nunique()

5634

In [18]:
tweet_df['language'].unique()


array(['en', 'kn', 'de', 'in', 'tl', 'fr', 'pt', 'te', 'tr', 'it', 'qme',
       'bn', 'qht', 'pl', 'es', 'el', 'nl', 'cy', 'ta', 'hi', 'sv', 'ja',
       'th', 'mr', 'et', 'gu', 'da', 'ro', 'ml', 'zxx', 'und', 'pa', 'ur',
       'ko', 'am', 'fi', 'zh', 'lt', 'hu', 'ru', 'ar', 'si'], dtype=object)

In [19]:
# Report how many tweets carry each of the language codes of interest
for label, code in (('English entries:', 'en'),
                    ('Spanish entries:', 'es'),
                    ('Italian entries:', 'it')):
    print(label, (tweet_df['language'] == code).sum())


English entries: 6410
Spanish entries: 51
Italian entries: 10


In [20]:
# Keep English tweets only.
# Build the selection from the raw `df` instead of re-filtering `tweet_df`:
# once a later cell has dropped the `language` column, re-running
# `tweet_df[tweet_df['language'] == 'en']` raises KeyError. Deriving from `df`
# makes this cell give the same result no matter how often it is executed.
tweet_df = df.loc[df['language'] == 'en',
                  ['language', 'date', 'username', 'hashtags', 'tweet']].copy()
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6410 entries, 0 to 6858
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   language  6410 non-null   object
 1   date      6410 non-null   object
 2   username  6410 non-null   object
 3   hashtags  6410 non-null   object
 4   tweet     6410 non-null   object
dtypes: object(5)
memory usage: 300.5+ KB


In [21]:
tweet_df.head(2)

Unnamed: 0,language,date,username,hashtags,tweet
0,en,2022-08-06,thetenth2022,[],So has anyone begun to compile 'here's the sta...
1,en,2022-08-06,ashemedai,[],"Getting some groceries, topic shifted to covid..."


In [22]:
# Every remaining row is English, so the `language` column carries no information.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
tweet_df = tweet_df.drop(columns=['language'], errors='ignore')
tweet_df.head(2)

Unnamed: 0,date,username,hashtags,tweet
0,2022-08-06,thetenth2022,[],So has anyone begun to compile 'here's the sta...
1,2022-08-06,ashemedai,[],"Getting some groceries, topic shifted to covid..."


In [36]:
# Persist the selected corpus (tweet_df) in two formats: a pickle file and a
# CSV written without the index column.
tweet_df.to_pickle('/content/tweets.pkl')
tweet_df.to_csv(r'/content/tweets.csv', index=False)

## DF_to_SQL

## df_to_sql tweet_df #1

In [24]:
# from sqlite3 import connect
# conn = connect(':memory') 

In [25]:
# # 1. read in csv 
# df = pd.read_csv('/content/drive/MyDrive/tweet_df.csv')
# df.head(2)

In [26]:
# # 2. df_to_sql
# sql = df.to_sql('mpox_db', conn, if_exists='replace')
# sql

In [27]:
# # 3. read_sql 
# mpox = pd.read_sql('SELECT * FROM mpox_db', conn) 
# mpox.head(2)

In [28]:
# type(mpox)

## df_to_sql tweet_df #2

In [29]:
# https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

def pd_to_sqlDB(input_df: pd.DataFrame,
                table_name: str,
                db_name: str = 'default.db',
                if_exists: str = 'fail') -> None:

    '''Take a Pandas dataframe `input_df` and upload it to `table_name` SQLITE table
    Args:
        input_df (pd.DataFrame): Dataframe containing data to upload to SQLITE
        table_name (str): Name of the SQLITE table to upload to. It is interpolated
                          into the SQL text verbatim, so it must come from trusted code.
        db_name (str, optional): Name of the SQLITE Database in which the table is created.
                                 Defaults to 'default.db'.
        if_exists (str, optional): What to do when `table_name` already exists.
                                   'fail' (default) issues a plain CREATE TABLE, which raises;
                                   'replace' drops the existing table first;
                                   'append' keeps the existing table and adds the rows to it.
    Raises:
        ValueError: If `if_exists` is not 'fail', 'replace' or 'append'.
        sqlite3.OperationalError: If the table already exists and `if_exists` is 'fail',
                                  or if SQLite rejects any of the statements.
    '''

    if if_exists not in ('fail', 'replace', 'append'):
        raise ValueError(f"if_exists must be 'fail', 'replace' or 'append', got {if_exists!r}")

    # Step 1: Setup local logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # Step 2: Find columns in the dataframe
    # Column names are double-quoted (embedded quotes doubled) so that names with
    # spaces or SQL keywords such as "order" are still valid identifiers.
    cols = [str(col) for col in input_df.columns]
    cols_string = ','.join('"' + col.replace('"', '""') + '"' for col in cols)
    val_wildcard_string = ','.join(['?'] * len(cols))

    # Step 3: Connect to a DB file if it exists, else create a new file
    con = sqlite3.connect(db_name)
    try:
        cur = con.cursor()
        logging.info(f'SQL DB {db_name} created')

        # Step 4: Create Table
        if if_exists == 'replace':
            cur.execute(f"""DROP TABLE IF EXISTS {table_name};""")
        create_clause = 'CREATE TABLE IF NOT EXISTS' if if_exists == 'append' else 'CREATE TABLE'
        sql_string = f"""{create_clause} {table_name} ({cols_string});"""
        cur.execute(sql_string)
        logging.info(f'SQL Table {table_name} created with {len(cols)} columns')

        # Step 5: Upload the dataframe
        rows_to_upload = input_df.to_dict(orient='split')['data']
        sql_string = f"""INSERT INTO {table_name} ({cols_string}) VALUES ({val_wildcard_string});"""
        cur.executemany(sql_string, rows_to_upload)
        logging.info(f'{len(rows_to_upload)} rows uploaded to {table_name}')

        # Step 6: Commit the changes
        con.commit()
    finally:
        # Always release the connection, including when a statement above raised
        # (e.g. the table already exists); otherwise the handle stays open.
        con.close()

In [30]:
#  https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

def sql_query_to_pd(sql_query_string: str, db_name: str ='mpox.db') -> pd.DataFrame:
    '''Execute an SQL query and return the results as a pandas dataframe
    Args:
        sql_query_string (str): SQL query string to execute
        db_name (str, optional): Name of the SQLITE Database to execute the query in.
                                 Defaults to 'mpox.db'.
    Returns:
        pd.DataFrame: Results of the SQL query in a pandas dataframe. A statement
                      that produces no result set yields an empty dataframe.
    Raises:
        sqlite3.Error: If SQLite rejects the statement (e.g. unknown table).
    '''
    # Step 1: Connect to the SQL DB
    con = sqlite3.connect(db_name)
    try:
        # Step 2: Execute the SQL query
        cursor = con.execute(sql_query_string)

        # Step 3: Fetch the data and column names
        result_data = cursor.fetchall()
        # cursor.description is None for statements that return no rows
        # (DDL/DML); treat that as "no columns" instead of failing on iteration.
        description = cursor.description or ()
        cols = [column[0] for column in description]
    finally:
        # Step 4: Close the connection, including when the query raised
        con.close()

    # Step 5: Return as a dataframe
    return pd.DataFrame(result_data, columns=cols)

In [39]:
# https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

# Step 1: Read the csv file into a dataframe
input_df = pd.read_csv('/content/tweets.csv')

# Step 2: Upload the dataframe to a SQL Table
# Drop any `tweets` table left behind by an earlier run first: by default
# pd_to_sqlDB issues a plain CREATE TABLE, so re-executing this cell would
# otherwise fail with "table tweets already exists".
con = sqlite3.connect('monkeypox.db')
try:
    con.execute('DROP TABLE IF EXISTS tweets')
    con.commit()
finally:
    con.close()

pd_to_sqlDB(input_df,
            table_name='tweets',
            db_name='monkeypox.db')

# Step 3: Write the SQL query in a string variable
sql_query_string = """
    SELECT *
    FROM tweets
"""
# Step 4: Execute the SQL query
result_df = sql_query_to_pd(sql_query_string, db_name='monkeypox.db')
result_df

Unnamed: 0,date,username,hashtags,tweet
0,2022-08-06,thetenth2022,[],So has anyone begun to compile 'here's the sta...
1,2022-08-06,ashemedai,[],"Getting some groceries, topic shifted to covid..."
2,2022-08-06,democracymotion,[],"""Illinois Children's Daycare Worker Tests Posi..."
3,2022-08-06,thegoogle93,[],Illinois daycare worker tests positive for mon...
4,2022-08-06,bufflosouljah1,[],@Natrone86 @JunotIsrael @elcavaqueen @thechicc...
...,...,...,...,...
6405,2022-08-06,arkcowgirl62,[],"@hurtmeknots With Monkeypox on the rise, maybe..."
6406,2022-08-06,speehanagram,"['monkeypox', 'besafe']",@FITNESSSF can you start filling up the disinf...
6407,2022-08-06,pritzkertoilet,[],@politvidchannel 71% already have monkey pox .
6408,2022-08-06,thumbressler,['monkeypox'],RT if you have #monkeypox


database = monkeypox.db
table1 = tweets