In [1]:
# import statements, don't know if I'll need all of them

import os
import csv
import json
import numpy
import datetime
import pandas as pd
import urllib.request
import pprint

import pymongo
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

In [2]:
host_name = "localhost"
host_ip = "127.0.0.1"
port = "3306"
user_id = "root2"
pwd = "123456789"

src_dbname = "dp1updated"
dst_dbname = "dp1mongoupdates"

### Define Functions for Getting Data From and Setting Data Into Databases

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

In [4]:
try:
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    sqlEngine.execute(f"DROP DATABASE IF EXISTS `{dst_dbname}`;")
    sqlEngine.execute(f"CREATE DATABASE `{dst_dbname}`;")
    sqlEngine.execute(f"USE {dst_dbname};")
except:
    print("There was an error creating a sql engine from the information provided in the program")

### CSV to SQL conversion

In [5]:
try:
    data_dir = os.path.join(os.getcwd(), 'data')
    data_file = os.path.join(data_dir, 'superbowl.csv')
    # df = pd.read_csv(data_file, header=0, index_col=0)
    df = pd.read_csv(data_file)
    total_rows = len(df.axes[0]) #===> Axes of 0 is for a row
    total_cols = len(df.axes[1]) #===> Axes of 1 is for a column
    print("Number of Rows: " + str(total_rows))
    print("Number of Columns: " + str(total_cols))
    df.head()
except:
    print("The file could not be read in correctly.")
    print("Make sure the data file is in the correct directory so it can be inserted properly.")

Number of Rows: 54
Number of Columns: 10


In [6]:
# Add Super Bowl 55
new_row = pd.DataFrame({'Date':'Feb 7 2021', 'SB':'LV (55)', 'Winner':'Tampa Bay Buccaneers', 'Winner Pts':'31',
                        'Loser':'Kansas City Chiefs', 'Loser Pts':'9', 'MVP':'Tom Brady',
                        'Stadium':'Raymond James Stadium', 'City':'Tampa', 'State':'Florida'}, index = [0])
df = pd.concat([new_row, df]).reset_index(drop = True)
df.head(5)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 7 2021,LV (55),Tampa Bay Buccaneers,31,Kansas City Chiefs,9,Tom Brady,Raymond James Stadium,Tampa,Florida
1,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida
2,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia
3,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota
4,Feb 5 2017,LI (51),New England Patriots,34,Atlanta Falcons,28,Tom Brady,NRG Stadium,Houston,Texas


In [7]:
# Add Super Bowl 56
new_row2 = pd.DataFrame({'Date':'Feb 13 2022', 'SB':'LVI (56)', 'Winner':'Los Angeles Rams', 'Winner Pts':'23', 
                          'Loser':'Cincinnati Bengals', 'Loser Pts':'20','MVP':'Cooper Kupp', 'Stadium':'SoFi Stadium', 
                            'City':'Inglewood', 'State':'California'}, index = [0])
df = pd.concat([new_row2, df]).reset_index(drop = True)

df.head(5)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 13 2022,LVI (56),Los Angeles Rams,23,Cincinnati Bengals,20,Cooper Kupp,SoFi Stadium,Inglewood,California
1,Feb 7 2021,LV (55),Tampa Bay Buccaneers,31,Kansas City Chiefs,9,Tom Brady,Raymond James Stadium,Tampa,Florida
2,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida
3,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia
4,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota


In [8]:
# drop useless column
df.drop('State', axis=1, inplace=True)
df.head(5)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,Feb 13 2022,LVI (56),Los Angeles Rams,23,Cincinnati Bengals,20,Cooper Kupp,SoFi Stadium,Inglewood
1,Feb 7 2021,LV (55),Tampa Bay Buccaneers,31,Kansas City Chiefs,9,Tom Brady,Raymond James Stadium,Tampa
2,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens
3,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta
4,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis


In [9]:
start_val = 1
df.insert(loc=0, column = 'surr_key', value = range(start_val, len(df) + start_val))
df.head(5)

Unnamed: 0,surr_key,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,1,Feb 13 2022,LVI (56),Los Angeles Rams,23,Cincinnati Bengals,20,Cooper Kupp,SoFi Stadium,Inglewood
1,2,Feb 7 2021,LV (55),Tampa Bay Buccaneers,31,Kansas City Chiefs,9,Tom Brady,Raymond James Stadium,Tampa
2,3,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens
3,4,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta
4,5,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis


In [10]:
set_dataframe(user_id, pwd, host_name, "dp1updated", df, "superbowls", "surr_key", "insert")

### Convert from SQL to MongoDB Database

In [11]:
# host_name = "localhost"
ports = {"mongo" : 27017, "mysql" : 3306}

# user_id = "root2"
# pwd = "123456789"
 
src_dbname = "dp1mongoupdates"
dst_dbname = "dp1updated"

In [12]:
def get_sql_dataframe(user_id, pwd, host_name, db_name, sql_query):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    conn = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, conn);
    conn.close()
    
    return dframe


def get_mongo_dataframe(user_id, pwd, host_name, port, db_name, collection, query):
    '''Create a connection to MongoDB, with or without authentication credentials'''
    if user_id and pwd:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db_name)
        client = pymongo.MongoClient(mongo_uri)
    else:
        conn_str = f"mongodb://{host_name}:{port}/"
        client = pymongo.MongoClient(conn_str)
    
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query)))
    dframe.drop(['_id'], axis=1, inplace=True)
    client.close()
    
    return dframe


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

#### RUN THIS ONLY ONCE (OR ELSE MONGO WILL HAVE DUPLICATE ENTRIES)

In [13]:
# only run this when you are going back through the project again after running the cell below once
port = ports["mongo"]
conn_str = f"mongodb://{host_name}:{port}/"
client = pymongo.MongoClient(conn_str)
db = client[src_dbname]

In [None]:
# NOTE FOR MYSELF: DO NOT RUN ANY MORE, ALREADY IN MONGO

port = ports["mongo"]
conn_str = f"mongodb://{host_name}:{port}/"
client = pymongo.MongoClient(conn_str)
db = client[src_dbname]

data_dir = os.path.join(os.getcwd(), 'data')

json_files = {"SBstats" : 'superbowl_json.json'
             }

for file in json_files:
    json_file = os.path.join(data_dir, json_files[file])
    with open(json_file, 'r') as openfile:
        json_object = json.load(openfile)
        file = db[file]
        result = file.insert_many(json_object)
        #print(f"{file} was successfully loaded.")

        
client.close()     

#### Transformations within MongoDB

In [14]:
query = {}
port = ports["mongo"]
collection = "SBstats"

df_mongo_orig = get_mongo_dataframe(None, None, host_name, port, src_dbname, collection, query)
df_mongo_orig.head(5)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 7 2016,50,Denver Broncos,24,Carolina Panthers,10,Von Miller,Levi's Stadium,Santa Clara,California
1,Jan 15 1967,I (1),Green Bay Packers,35,Kansas City Chiefs,10,Bart Starr+,Memorial Coliseum,Los Angeles,California
2,Jan 14 1968,II (2),Green Bay Packers,33,Oakland Raiders,14,Bart Starr+,Orange Bowl,Miami,Florida
3,Jan 12 1969,III (3),New York Jets,16,Baltimore Colts,7,Joe Namath+,Orange Bowl,Miami,Florida
4,Jan 11 1970,IV (4),Kansas City Chiefs,23,Minnesota Vikings,7,Len Dawson+,Tulane Stadium,New Orleans,Louisiana


In [15]:
sql_cowboys = "SELECT * FROM dp1updated.superbowls WHERE `Winner` = 'Dallas Cowboys';"
df_cowboys_sb = get_sql_dataframe(user_id, pwd, host_name, src_dbname, sql_cowboys)
df_cowboys_sb.head(5)

Unnamed: 0,surr_key,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,27,Jan 28 1996,XXX (30),Dallas Cowboys,27,Pittsburgh Steelers,17,Larry Brown,Sun Devil Stadium,Tempe
1,29,Jan 30 1994,XXVIII (28),Dallas Cowboys,30,Buffalo Bills,13,Emmitt Smith+,Georgia Dome,Atlanta
2,30,Jan 31 1993,XXVII (27),Dallas Cowboys,52,Buffalo Bills,17,Troy Aikman+,Rose Bowl,Pasadena
3,45,Jan 15 1978,XII (12),Dallas Cowboys,27,Denver Broncos,10,Harvey MartinRandy White+,Superdome,New Orleans
4,51,Jan 16 1972,VI (6),Dallas Cowboys,24,Miami Dolphins,3,Roger Staubach+,Tulane Stadium,New Orleans


In [16]:
db.list_collection_names()

['SBstats']

In [17]:
collection = "SBstats"

stats = db[collection]

In [18]:
# The SELECT list -----------------------------------------------
projection = {"_id": 0, "Date": 1, "SB": 1, "Winner": 1, "Winner Pts": 1, "Loser": 1, "Loser Pts": 1, "MVP": 1,
             "Stadium": 1, "City": 1, "State": 1}

# The WHERE clause ----------------------------------------------
conditions = {"Winner":{"$eq": "Dallas Cowboys"}}

# The ORDER BY clause -------------------------------------------
orderby = [("Date", -1)]

for title in stats.find(conditions, projection).sort(orderby):
    print(title)

{'Date': 'Jan 31 1993', 'SB': 'XXVII (27)', 'Winner': 'Dallas Cowboys', 'Winner Pts': 52, 'Loser': 'Buffalo Bills', 'Loser Pts': 17, 'MVP': 'Troy Aikman+', 'Stadium': 'Rose Bowl', 'City': 'Pasadena', 'State': 'California'}
{'Date': 'Jan 30 1994', 'SB': 'XXVIII (28)', 'Winner': 'Dallas Cowboys', 'Winner Pts': 30, 'Loser': 'Buffalo Bills', 'Loser Pts': 13, 'MVP': 'Emmitt Smith+', 'Stadium': 'Georgia Dome', 'City': 'Atlanta', 'State': 'Georgia'}
{'Date': 'Jan 28 1996', 'SB': 'XXX (30)', 'Winner': 'Dallas Cowboys', 'Winner Pts': 27, 'Loser': 'Pittsburgh Steelers', 'Loser Pts': 17, 'MVP': 'Larry Brown', 'Stadium': 'Sun Devil Stadium', 'City': 'Tempe', 'State': 'Arizona'}
{'Date': 'Jan 16 1972', 'SB': 'VI (6)', 'Winner': 'Dallas Cowboys', 'Winner Pts': 24, 'Loser': 'Miami Dolphins', 'Loser Pts': 3, 'MVP': 'Roger Staubach+', 'Stadium': 'Tulane Stadium', 'City': 'New Orleans', 'State': 'Louisiana'}
{'Date': 'Jan 15 1978', 'SB': 'XII (12)', 'Winner': 'Dallas Cowboys', 'Winner Pts': 27, 'Loser':

In [19]:
df_cowboys_sb_mongo = pd.DataFrame( list( stats.find(conditions, projection).sort(orderby) ) )
df_cowboys_sb_mongo.head(5)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Jan 31 1993,XXVII (27),Dallas Cowboys,52,Buffalo Bills,17,Troy Aikman+,Rose Bowl,Pasadena,California
1,Jan 30 1994,XXVIII (28),Dallas Cowboys,30,Buffalo Bills,13,Emmitt Smith+,Georgia Dome,Atlanta,Georgia
2,Jan 28 1996,XXX (30),Dallas Cowboys,27,Pittsburgh Steelers,17,Larry Brown,Sun Devil Stadium,Tempe,Arizona
3,Jan 16 1972,VI (6),Dallas Cowboys,24,Miami Dolphins,3,Roger Staubach+,Tulane Stadium,New Orleans,Louisiana
4,Jan 15 1978,XII (12),Dallas Cowboys,27,Denver Broncos,10,Harvey MartinRandy White+,Superdome,New Orleans,Louisiana


#### Do more things in MongoDB before pushing back to new SQL Schema from Modified SQL Schema

In [20]:
src_dbname2 = "dp1updated"
dst_dbname2 = "dp1mongoupdates"

In [None]:
# NOTE FOR MYSELF: DO NOT RUN ANY MORE, ALREADY IN MONGO

port = ports["mongo"]
conn_str = f"mongodb://{host_name}:{port}/"
client = pymongo.MongoClient(conn_str)
db2 = client[src_dbname2]

data_dir = os.path.join(os.getcwd(), 'data')

json_files = {"updated_sb_stats" : 'updated_superbowls_as_json.json'
             }

for file in json_files:
    json_file = os.path.join(data_dir, json_files[file])
    with open(json_file, 'r') as openfile:
        json_object = json.load(openfile)
        file = db2[file]
        result = file.insert_many(json_object)
        print(f"{file} was successfully loaded.")

        
client.close()

In [21]:
query = {}
port = ports["mongo"]
collection = "updated_sb_stats"

conn_str = f"mongodb://{host_name}:{port}/"
client = pymongo.MongoClient(conn_str)
db2 = client[src_dbname2]

try:
    df_mongo_updated = get_mongo_dataframe(None, None, host_name, port, src_dbname2, collection, query)
    df_mongo_updated.head(5)
except:
    print("There was an error getting the MongoDB dataframe.")
    print("Make sure the dataframe exists and other arguments are correct.")    

In [22]:
sql_cowboys2 = "SELECT * FROM dp1updated.superbowls WHERE `Winner` = 'Dallas Cowboys';"
df_cowboys_sb2 = get_sql_dataframe(user_id, pwd, host_name, dst_dbname2, sql_cowboys2)
df_cowboys_sb2.head(5)

Unnamed: 0,surr_key,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,27,Jan 28 1996,XXX (30),Dallas Cowboys,27,Pittsburgh Steelers,17,Larry Brown,Sun Devil Stadium,Tempe
1,29,Jan 30 1994,XXVIII (28),Dallas Cowboys,30,Buffalo Bills,13,Emmitt Smith+,Georgia Dome,Atlanta
2,30,Jan 31 1993,XXVII (27),Dallas Cowboys,52,Buffalo Bills,17,Troy Aikman+,Rose Bowl,Pasadena
3,45,Jan 15 1978,XII (12),Dallas Cowboys,27,Denver Broncos,10,Harvey MartinRandy White+,Superdome,New Orleans
4,51,Jan 16 1972,VI (6),Dallas Cowboys,24,Miami Dolphins,3,Roger Staubach+,Tulane Stadium,New Orleans


In [23]:
db2.list_collection_names()

['updated_sb_stats', 'SBstats']

In [24]:
collection2 = "updated_sb_stats"

stats2 = db2[collection2]

In [25]:
# The SELECT list -----------------------------------------------
projection2 = {"_id": 0, "surr_key": 1, "Date": 1, "SB": 1, "Winner": 1, "Winner Pts": 1, "Loser": 1,
               "Loser Pts": 1, "MVP": 1, "Stadium": 1, "City": 1}

# The WHERE clause ----------------------------------------------
conditions = {"Winner":{"$eq": "Dallas Cowboys"}}

# The ORDER BY clause -------------------------------------------
orderby = [("Date", 1)]

for title in stats2.find(conditions, projection2).sort(orderby):
    print(title)

{'surr_key': 17, 'SB': 'XII (12)', 'Date': 'Jan 15 1978', 'Winner': 'Dallas Cowboys', 'Winner Pts': 27, 'Loser': 'Denver Broncos', 'Loser Pts': 10, 'MVP': 'Harvey MartinRandy White+', 'Stadium': 'Superdome', 'City': 'New Orleans'}
{'surr_key': 12, 'SB': 'VI (6)', 'Date': 'Jan 16 1972', 'Winner': 'Dallas Cowboys', 'Winner Pts': 24, 'Loser': 'Miami Dolphins', 'Loser Pts': 3, 'MVP': 'Roger Staubach+', 'Stadium': 'Tulane Stadium', 'City': 'New Orleans'}
{'surr_key': 45, 'SB': 'XXX (30)', 'Date': 'Jan 28 1996', 'Winner': 'Dallas Cowboys', 'Winner Pts': 27, 'Loser': 'Pittsburgh Steelers', 'Loser Pts': 17, 'MVP': 'Larry Brown', 'Stadium': 'Sun Devil Stadium', 'City': 'Tempe'}
{'surr_key': 44, 'SB': 'XXVIII (28)', 'Date': 'Jan 30 1994', 'Winner': 'Dallas Cowboys', 'Winner Pts': 30, 'Loser': 'Buffalo Bills', 'Loser Pts': 13, 'MVP': 'Emmitt Smith+', 'Stadium': 'Georgia Dome', 'City': 'Atlanta'}
{'surr_key': 43, 'SB': 'XXVII (27)', 'Date': 'Jan 31 1993', 'Winner': 'Dallas Cowboys', 'Winner Pts': 

In [26]:
df_cowboys_sb_mongo_updated = pd.DataFrame( list( stats2.find(conditions, projection2).sort(orderby) ) )
df_cowboys_sb_mongo_updated.head(5)

Unnamed: 0,surr_key,SB,Date,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,17,XII (12),Jan 15 1978,Dallas Cowboys,27,Denver Broncos,10,Harvey MartinRandy White+,Superdome,New Orleans
1,12,VI (6),Jan 16 1972,Dallas Cowboys,24,Miami Dolphins,3,Roger Staubach+,Tulane Stadium,New Orleans
2,45,XXX (30),Jan 28 1996,Dallas Cowboys,27,Pittsburgh Steelers,17,Larry Brown,Sun Devil Stadium,Tempe
3,44,XXVIII (28),Jan 30 1994,Dallas Cowboys,30,Buffalo Bills,13,Emmitt Smith+,Georgia Dome,Atlanta
4,43,XXVII (27),Jan 31 1993,Dallas Cowboys,52,Buffalo Bills,17,Troy Aikman+,Rose Bowl,Pasadena


In [27]:
# insert into sql schema again

dataframe = df_cowboys_sb_mongo_updated
table_name = 'updated_sb_stats'
primary_key = 'surr_key'
db_operation = "insert"

try:
    set_dataframe(user_id, pwd, host_name, dst_dbname2, dataframe, table_name, primary_key, db_operation)
except:
    print("There was an error setting the new dataframe.")
    print("Make sure the destination database, primary key, and other arguments are correct.")    