# Imports

In [1]:
import mysql.connector
import pandas as pd
from os import getenv
from sqlalchemy import create_engine

# Helper functions

In [2]:
def connect_to_database(
    user_fill: str, database_fill: str, password_fill: str, host_fill: str
) -> mysql.connector.connection.MySQLConnection:
    """
    Open a MySQL connection with the supplied credentials.

    Args:
        user_fill (str): Username used to log in.
        database_fill (str): Name of the database to select.
        password_fill (str): Password belonging to the user.
        host_fill (str): Address of the database host.

    Returns:
        mysql.connector.connection.MySQLConnection: The connection object
            produced by ``mysql.connector.connect``.

    """

    # Collect the keyword arguments first so the call itself stays short.
    # connect_timeout bounds how long the connection attempt may take.
    connection_settings = {
        "user": user_fill,
        "database": database_fill,
        "password": password_fill,
        "host": host_fill,
        "connect_timeout": 10,
    }
    return mysql.connector.connect(**connection_settings)

def check_given_var(env_var_str: str) -> str:
    """
    Check if the given environment variable is set and return its value.

    A variable that is set to the empty string counts as set and is returned
    unchanged.

    Args:
        env_var_str (str): The name of the environment variable to check.

    Returns:
        str: The value of the environment variable.

    Raises:
        AssertionError: If the environment variable is not found.
    """

    env_var = getenv(env_var_str)
    if env_var is None:
        # Raised explicitly instead of via an `assert` statement: asserts are
        # removed when Python runs with -O, which would let None slip through.
        raise AssertionError(
            f"{env_var_str} is required but not found in environment variables"
        )
    return env_var


def check_env_vars() -> "tuple[str, str, str, str]":
    """
    Read the four database connection settings from the environment.

    The variables are looked up in the order DBL_USER, DBL_DATABASE,
    DBL_PASSWORD, DBL_HOST; the first one that is missing aborts the lookup.

    Returns:
        tuple[str, str, str, str]: The user, database, password and host
            values, in that order.

    Raises:
        AssertionError: If any of the four variables is not set (raised by
            ``check_given_var``).
    """
    # The annotation is written as a string so it stays valid on Python
    # versions where the built-in ``tuple`` cannot be subscripted.
    user = check_given_var("DBL_USER")
    database = check_given_var("DBL_DATABASE")
    password = check_given_var("DBL_PASSWORD")
    host = check_given_var("DBL_HOST")
    return user, database, password, host

# Constants

In [3]:
# All four settings are first read from the DBL_* environment variables.
USER, DATABASE, PASSWORD, HOST = check_env_vars()
# Then point user and database at the test database instead: it runs far
# faster than the full one, although it is still slow.
USER = DATABASE = "nezox2um_test"
# Selects the listed profile columns from Users together with the listed
# columns from Tweets, matched on user_id. Because this is an INNER JOIN, users
# without any tweet and tweets without a matching user do not appear.
# Both creation_time columns are aliased so they stay distinguishable.
query = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Users.verified,
    Users.followers_count,
    Users.friends_count,
    Users.statuses_count,
    Users.default_profile,
    Users.default_profile_image,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id,
    Tweets.full_text,
    Tweets.lang,
    Tweets.country_code,
    Tweets.favorite_count,
    Tweets.retweet_count,
    Tweets.possibly_sensitive,
    Tweets.replied_tweet_id,
    Tweets.reply_count,
    Tweets.quoted_status_id,
    Tweets.quote_count,
    Tweets.category
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;

"""

# Loading

In [4]:
def fetch_data(query) -> pd.DataFrame:
    """
    Run a SQL query against the configured MySQL database.

    Connection settings come from the module-level USER, PASSWORD, HOST and
    DATABASE constants; the port is fixed at 3306.

    Args:
        query: The SQL statement to execute.

    Returns:
        pd.DataFrame: The rows returned by the query.
    """
    # Imported locally because only this function needs it.
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # in a password are not mistaken for URL delimiters. safe="" makes sure
    # "/" is encoded as well.
    user = quote(USER, safe="")
    password = quote(PASSWORD, safe="")
    engine = create_engine(f"mysql://{user}:{password}@{HOST}:3306/{DATABASE}")
    try:
        return pd.read_sql_query(query, engine)
    finally:
        # The engine is created per call and never reused, so release its
        # pooled connections even when the query fails.
        engine.dispose()
# Run the join once and keep the result; the cells below all work on this frame.
test_data = fetch_data(query)

In [5]:
# Might be handy for splitting the data
# def fetch_data_by_columns(table):
#     # Prepare list to store each column DataFrame
#     engine = create_engine(f"mysql://{USER}:{PASSWORD}@{HOST}:3306/{DATABASE}")
#     column_data_frames = []
#     column_names = pd.read_sql_query(f"SHOW COLUMNS FROM {table};", engine)
    
#     for column in column_names["Field"]:
#         # Fetch each column data as a DataFrame
#         query = f"SELECT {column} FROM {table};"
#         column_data_frames.append(pd.read_sql_query(query, engine))

#     return column_data_frames

# # Example usage
# df = fetch_data_by_columns("Users")

In [6]:
# Preview the loaded frame (the rendered output is truncated to the first and last rows).
test_data

Unnamed: 0,user_id,user_creation_time,verified,followers_count,friends_count,statuses_count,default_profile,default_profile_image,tweet_creation_time,tweet_id,...,lang,country_code,favorite_count,retweet_count,possibly_sensitive,replied_tweet_id,reply_count,quoted_status_id,quote_count,category
0,1000002812087099394,2018-05-25 13:17:04,0,111,77,9938,1,0,2019-06-07 14:31:48,1137004232559800320,...,en,,0,0,0,,0,1136861799075201024,0,retweet
1,1000003777070444544,2018-05-25 13:20:54,0,52,49,303,1,0,2019-05-24 13:57:03,1131922058940567553,...,ja,,0,0,0,,0,,0,tweet
2,1000004316550320128,2018-05-25 13:23:03,0,2644,1967,4846,1,0,2019-05-24 17:26:15,1131974705286733824,...,en,,0,0,0,,0,,0,retweet
3,1000004765336600582,2018-05-25 13:24:50,0,520,703,880,1,0,2019-06-07 19:08:24,1137073843447783424,...,en,,0,0,0,,0,1136861799075201024,0,retweet
4,1000005433426407426,2018-05-25 13:27:29,0,344,435,39978,1,0,2019-06-08 13:03:15,1137344337141018624,...,pt,,0,0,0,,0,1136861799075201024,0,retweet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,999997331880710144,2018-05-25 12:55:17,0,220,257,50855,0,0,2019-06-07 12:26:17,1136972644581072896,...,fr,,0,0,0,,0,1136861799075201024,0,retweet
399996,999998048251105280,2018-05-25 12:58:08,0,44,367,7381,0,0,2019-06-07 20:02:26,1137087439888093187,...,en,,0,0,0,,0,1136861799075201024,0,retweet
399997,999998128572043264,2018-05-25 12:58:27,0,268,366,13280,1,0,2019-06-07 16:16:51,1137030670532980736,...,en,,0,0,0,,0,1136861799075201024,0,retweet
399998,999999082314764290,2018-05-25 13:02:15,0,171,267,16782,1,0,2019-06-07 10:41:24,1136946252766154752,...,en,,0,0,0,,0,1136861799075201024,0,retweet


In [7]:
# Bytes used per column. deep=True also counts the Python objects held by
# object-dtype columns; index=False leaves the index out of the result.
test_data.memory_usage(index=False, deep=True)

user_id                   27458835
user_creation_time         3200000
verified                   3200000
followers_count            3200000
friends_count              3200000
statuses_count             3200000
default_profile            3200000
default_profile_image      3200000
tweet_creation_time        3200000
tweet_id                  30399659
full_text                111283140
lang                      23609494
country_code              22823156
favorite_count             3200000
retweet_count              3200000
possibly_sensitive         3200000
replied_tweet_id          14753546
reply_count                3200000
quoted_status_id          15626294
quote_count                3200000
category                  25206850
dtype: int64

In [8]:
# Total bytes used by all columns together (index excluded).
test_data.memory_usage(index=False, deep=True).sum()

312760974

In [9]:
# Column dtypes as inferred on load, before any conversion.
test_data.dtypes

user_id                          object
user_creation_time       datetime64[ns]
verified                          int64
followers_count                   int64
friends_count                     int64
statuses_count                    int64
default_profile                   int64
default_profile_image             int64
tweet_creation_time      datetime64[ns]
tweet_id                         object
full_text                        object
lang                             object
country_code                     object
favorite_count                    int64
retweet_count                     int64
possibly_sensitive                int64
replied_tweet_id                 object
reply_count                       int64
quoted_status_id                 object
quote_count                       int64
category                         object
dtype: object

In [10]:
# Print the extreme string lengths of the object-dtype columns.
# The printed values and their order are kept exactly as before.

# ID columns: longest, then shortest, followed by a blank separator line.
for column in ("tweet_id", "user_id"):
    lengths = test_data[column].str.len()
    print(lengths.max())
    print(lengths.min())
    print("")

# Text columns: longest, then shortest.
for column in ("full_text", "lang"):
    lengths = test_data[column].str.len()
    print(lengths.max())
    print(lengths.min())

# From here on the shortest length is printed first.
lengths = test_data["country_code"].str.len()
print(lengths.min())
print(lengths.max())

# These columns may hold missing values; replace them with "" so every entry
# has a length. The result is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained in-place update,
# which pandas warns about and which does not modify the frame under
# Copy-on-Write.
for column in ("replied_tweet_id", "quoted_status_id"):
    test_data[column] = test_data[column].fillna("")
    lengths = test_data[column].str.len()
    print(lengths.min())
    print(lengths.max())

# Anomaly check: print every user_id that is 20 or more characters long.
overlong_user_ids = test_data.loc[test_data["user_id"].str.len() >= 20, "user_id"]
for user_id in overlong_user_ids:
    print(user_id)
test_data["user_id"]

19
10

19
3

943
1
3
2
0
2
0
19
0
19


0         1000002812087099394
1         1000003777070444544
2         1000004316550320128
3         1000004765336600582
4         1000005433426407426
                 ...         
399995     999997331880710144
399996     999998048251105280
399997     999998128572043264
399998     999999082314764290
399999     999999669001510912
Name: user_id, Length: 400000, dtype: object

In [11]:
# Summary statistics for every column; include="all" covers the non-numeric columns too.
test_data.describe(include="all")

Unnamed: 0,user_id,user_creation_time,verified,followers_count,friends_count,statuses_count,default_profile,default_profile_image,tweet_creation_time,tweet_id,...,lang,country_code,favorite_count,retweet_count,possibly_sensitive,replied_tweet_id,reply_count,quoted_status_id,quote_count,category
count,400000.0,400000,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000,400000.0,...,400000,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000
unique,240934.0,,,,,,,,,400000.0,...,51,138.0,,,,69298.0,,7495.0,,3
top,22536055.0,,,,,,,,,1.1370042325598004e+18,...,en,,,,,,,,,retweet
freq,10101.0,,,,,,,,,1.0,...,304357,388422.0,,,,300888.0,,284106.0,,203425
mean,,2013-07-17 11:09:29.400167424,0.089512,91857.93,5927.002,116984.9,0.472417,0.0,2019-06-02 00:30:40.585792256,,...,,,16.17031,5.948103,0.007777,,1.273427,,0.0,
min,,2006-05-24 15:51:44,0.0,0.0,0.0,1.0,0.0,0.0,2009-06-22 10:47:25,,...,,,0.0,0.0,0.0,,0.0,,0.0,
25%,,2010-08-04 03:27:41,0.0,100.0,164.0,1893.0,0.0,0.0,2019-05-29 14:25:27.750000128,,...,,,0.0,0.0,0.0,,0.0,,0.0,
50%,,2013-02-09 14:13:02,0.0,372.0,409.0,9622.0,0.0,0.0,2019-06-04 18:17:26,,...,,,0.0,0.0,0.0,,0.0,,0.0,
75%,,2016-07-22 00:27:35,0.0,1561.0,1134.0,40854.75,1.0,0.0,2019-06-07 19:26:51,,...,,,0.0,0.0,0.0,,0.0,,0.0,
max,,2019-06-09 16:41:03,1.0,60516180.0,4541583.0,3113732.0,1.0,0.0,2019-06-09 17:20:24,,...,,,1254216.0,448031.0,1.0,,169843.0,,0.0,


In [12]:
# Memory optimisation (this should eventually happen while parsing instead).
# The flag columns are converted from int64 to bool, one column at a time.
for flag_column in (
    "verified",
    "possibly_sensitive",
    "default_profile",
    "default_profile_image",
):
    test_data[flag_column] = test_data[flag_column].astype(bool)

# category is stored as a pandas categorical instead of Python strings.
test_data["category"] = test_data["category"].astype("category")

In [13]:
# Per-column memory again, to compare against the figures from before the dtype conversions.
test_data.memory_usage(index=False, deep=True)

user_id                   27458835
user_creation_time         3200000
verified                    400000
followers_count            3200000
friends_count              3200000
statuses_count             3200000
default_profile             400000
default_profile_image       400000
tweet_creation_time        3200000
tweet_id                  30399659
full_text                111283140
lang                      23609494
country_code              22823156
favorite_count             3200000
retweet_count              3200000
possibly_sensitive          400000
replied_tweet_id          24682850
reply_count                3200000
quoted_status_id          25001792
quote_count                3200000
category                    400296
dtype: int64