## Importing Libraries

In [105]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
import time

## Postgres Configuration

In [106]:
%run config_psql.ipynb

## Settings Configuration

In [107]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Getting missing data from database

In [108]:
query = """SELECT id, name, full_name FROM dwh.player_info WHERE bowling_hand IS NULL LIMIT 7000"""

In [109]:
with engine.connect() as conn:
    df = pd.read_sql_query(query, con = engine)
len(df)

119

In [99]:
pw_file_path = os.path.abspath(os.path.join(os.getcwd(), "../../../config/"))
pw = pd.read_json(pw_file_path + '\\PasswordManager.json', typ = 'series')
GOOGLE_API_KEY = pw['google_api_key']

In [100]:
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-pro')

In [101]:
prompt_desc = f"Given below are details of few cricket players. The task is to mention the bowling hand style of that particular cricket player.\n"
prompt_desc += f"The output for each player should be either of 2 results - 'Left' or 'Right'. In case of unknown, answer them as 'Unknown'.\n"
prompt_desc += f"The input format is as follows. A single string input will be given consisting of n players each seperated by a comma(,). To correctly identify and understand the particular cricketer each player input will consist of three elements seperated by '-'. the first element is a player id.  the second element is player name, the third element is an alternate full name of the same player. Input example for 3 players is shown below:\n\n"
prompt_desc += f"123-Rohit Sharma-Rohit Gurunath Sharma,444-K Yadav-Kuldeep Yadav,619-DLPA-Dua Lipa\n\n"
prompt_desc += f"Provide the output strictly in the following manner - 'player1_id-player1_bowling_hand,player2_id-player2_bowling_hand,....'\n"
prompt_desc += f"For the above example input the expected output should be as follows : \n\n"
prompt_desc += f"123-Right,444-Left,619-Unknown\n\n"
prompt_desc += f"The Input is provided below : \n\n"

In [102]:
out = {}
batch_count = 1
for i in range(0, len(df), 30):
    tick = time.time()
    if i + 30 > len(df):
        batch = df.iloc[i:]
    else:
        batch = df.iloc[i:i+30]
    prompt = prompt_desc + ','.join(df.apply(lambda row: '-'.join(map(str, row)), axis=1))
    response = model.generate_content(prompt)
    tock = time.time()

    print("batch {} response received. batch length - {}. Time taken (s) - {}".format(batch_count, len(batch), tock-tick))
    print("storing entries now for batch {}".format(batch_count), end = '\n\n')

    for i in response.text.split(','):
        if 'Unknown' not in i:
            out[i.split("-")[0].strip()] = i.split("-")[-1].strip()
        else:
            out[i.split("-")[0].strip()] = np.nan

batch 1 response received. batch length - 30. Time taken (s) - 16.487510204315186
storing entries now for batch 1

batch 1 response received. batch length - 30. Time taken (s) - 15.267125129699707
storing entries now for batch 1

batch 1 response received. batch length - 30. Time taken (s) - 15.86735224723816
storing entries now for batch 1

batch 1 response received. batch length - 37. Time taken (s) - 14.738007307052612
storing entries now for batch 1



In [104]:
df_result = pd.DataFrame(list(out.items()), columns=['id', 'bowling_hand'])

query = df_result.to_sql("temp_table", con = engine, schema = "stg", method = "multi", if_exists="replace", index=False)
print(query)

query = """
UPDATE dwh.player_info pi
SET bowling_hand = tt.bowling_hand
FROM stg.temp_table tt
WHERE pi.id = tt.id::SMALLINT;
"""
with engine.connect() as conn:
    conn.execute("DELETE FROM stg.temp_table WHERE bowling_hand NOT IN ('Right', 'Left')")
    conn.execute(query)
    conn.execute("DROP TABLE IF EXISTS stg.temp_table")

127
