# Util - Generate Training Dataset
* StellarAlgo Data Science
* Ryan Kazmerik & Nakisa Rad
* May 7, 2022

In [1]:
import getpass
import os

import pandas as pd
import pyodbc

### Create a connection to MSSQL PROD:

In [2]:
# Connection settings for the SQL Server instance. The password is typed in
# at run time so that it is never stored in the notebook itself.
SERVER = "34.206.73.189"
DATABASE = "datascience"
USERNAME = "dsAdminWrite"
PASSWORD = getpass.getpass(prompt="Enter your password")

# Assemble the ODBC connection string from its key=value parts and open
# the connection that the cells below reuse.
connection_parts = [
    "DRIVER={ODBC Driver 17 for SQL Server}",
    "SERVER=" + SERVER,
    "DATABASE=" + DATABASE,
    "UID=" + USERNAME,
    "PWD=" + PASSWORD,
]
CNXN = pyodbc.connect(";".join(connection_parts))

### Let's create a list of databases we would like to use to generate the dataset:

In [39]:
# One entry per team to include in the dataset.
#   mssql_dbname  - database that holds the team's stored procedure
#   lkupclientid  - client id handed to the stored procedure
#   clientcode    - short team code, used in progress messages
#   train_year    - latest year (inclusive) kept for training
#   test_year     - not read by the cells in this notebook
TEAMS = [
    {
        "mssql_dbname": "stlrMLS",
        "lkupclientid": "31",
        "clientcode": "sacfc",
        "train_year": 2021,
        "test_year": 2022,
    },
    {
        "mssql_dbname": "stlrUSLLocomotive",
        "lkupclientid": "99",
        "clientcode": "usllocomotive",
        "train_year": 2021,
        "test_year": 2022,
    },
]

### Now we can run our stored proc to get retention training and inference datasets:

In [40]:
# Pull each team's retention dataset from its own database and keep only
# the rows whose year is at or before that team's training year.
print("GETTING TEAM DATASETS:")

team_datasets = []
for team_cfg in TEAMS:

    db_cursor = CNXN.cursor()

    # The stored procedure is addressed through the team's database name
    # and receives the team's client id as its only argument.
    proc_call = "Exec {}.[ds].[getRetentionScoringModelData] {}".format(
        team_cfg["mssql_dbname"], team_cfg["lkupclientid"]
    )

    team_df = pd.read_sql(proc_call, CNXN)

    # Coerce "year" to a numeric dtype so it can be compared with train_year.
    team_df["year"] = pd.to_numeric(team_df["year"])
    within_training_window = team_df["year"] <= team_cfg["train_year"]
    team_df = team_df[within_training_window]

    print(" > ADDING TEAM TO DATASET: {}".format(team_cfg["clientcode"]))

    CNXN.commit()
    db_cursor.close()

    team_datasets.append(team_df)

print("TOTAL TEAMS IN DATASET: {}".format(len(team_datasets)))

GETTING TEAM DATASETS:
 > ADDING TEAM TO DATASET: sacfc
 > ADDING TEAM TO DATASET: usllocomotive
TOTAL TEAMS IN DATASET: 2


### Let's create one dataframe for all team datasets and see how much data we have in total and for each year:

In [41]:
# Stack every team's frame into a single dataset, then report its overall
# shape and the number of rows available for each year.
df_dataset = pd.concat(team_datasets)

rows_per_year = df_dataset["year"].value_counts()
print(df_dataset.shape)
print(rows_per_year)

(12836, 54)
2021    3544
2019    2447
2017    2440
2018    2292
2020    2096
2016      17
Name: year, dtype: int64


### Let's partition this dataframe into 4 equal-sized dataframes (any leftover rows are dropped):

In [42]:
# Split the combined dataset into four positional slices of equal length.
# The row count may not divide evenly by four; the leftover rows end up in
# a fifth, shorter slice, which is discarded so every batch has the same size.
batch_size = len(df_dataset) // 4
batch_starts = range(0, df_dataset.shape[0], batch_size)
df_batches = [
    df_dataset.iloc[start:start + batch_size] for start in batch_starts
][:4]

# Report the size of each of the four batches.
for batch_number, batch in enumerate(df_batches, start=1):
    print(f"BATCH {batch_number} SIZE: {len(batch)}")

BATCH 1 SIZE: 3209
BATCH 2 SIZE: 3209
BATCH 3 SIZE: 3209
BATCH 4 SIZE: 3209


### Now we can convert these dataframes to parquet files and write them to a directory:

In [38]:
# Write each batch to its own parquet file under ./exports.
# pandas does not create missing parent directories when writing a file,
# so make sure the target directory exists before the first write; this
# lets the notebook run end-to-end on a fresh checkout.
os.makedirs("./exports", exist_ok=True)

for idx, df_batch in enumerate(df_batches):

    # NOTE(review): to_parquet keeps the DataFrame index by default - confirm
    # whether the consumer of these files expects it (index=False drops it).
    df_batch.to_parquet(f"./exports/000{idx}_part_00.parquet")

### Done