In [None]:
# Install the Groq Python SDK through uv (shell escape; needs to run only once per environment).
! uv add groq
# pip-based alternative, intentionally left disabled:
#!pip install Groq

In [None]:
import groq
from groq import Groq
import os
from getpass import getpass

In [None]:
# Show which version of the Groq SDK is installed, so runs can be reproduced or reported.
print(groq.__version__)

In [None]:
from pprint import pprint as pp


def pprint(data, width=80):
    """Pretty-print ``data`` to standard output.

    Thin wrapper around the standard library's ``pprint.pprint`` (imported
    above as ``pp``) that exposes only the line width.

    Args:
        data: Any object to render.
        width: Maximum number of characters per output line (default 80).

    Returns:
        None. The formatted text is written to stdout.
    """
    pp(data, width=width)


### Setting Up API Keys

In [None]:
# Resolve the Groq API key without ever hard-coding it in the notebook.
# Prefer the GROQ_API_KEY environment variable so that "Restart & Run All"
# and headless executions do not block on a prompt; fall back to a hidden
# interactive prompt when the variable is unset or empty.
api_key = os.environ.get("GROQ_API_KEY") or getpass('Enter your Groq API key: ')

In [None]:
# Shared Groq client; these settings apply to every request made through it.
client = Groq(
    api_key=api_key,
    max_retries=2,  # same value as the SDK default
    timeout=50.0,   # seconds per request (the SDK default is 1 minute)
)

In [None]:
# Ask the model to draft an example codemeta.json and print its reply.
# NOTE(review): Groq retires model ids over time; confirm "llama3-8b-8192"
# is still served before relying on this cell.
codemeta_messages = [
    {"role": "user", "content": "generate a sample codemeta.json file"},
]
chat_completion = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=codemeta_messages,
)
print(chat_completion.choices[0].message.content)

In [None]:
# Check that the schema generated above is valid using the website below, and fix any errors there before pasting it into codemeta.json.
# https://jsonlint.com/

### Exploratory Data Analysis 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read the 04_medals.csv file and do some analysis on the medals data.

In [None]:
# Load the medals dataset; the path is relative to the notebook's working directory.
medals_csv_path = '../01_data/04_medals.csv'
df = pd.read_csv(medals_csv_path)

# Report the (rows, columns) shape, then preview the first three records.
print(f"Shape: {df.shape}")
df.head(3)

### Display basic information about the dataset

In [None]:

# Text summary of the medals table: record count, cardinality of the key
# categorical columns, and the distribution of genders and medal types.
print("\n1. DATASET OVERVIEW")
print("-" * 30)
print(f"Total records: {len(df)}")
print(f"Unique disciplines: {df['discipline'].nunique()}")
print(f"Unique gender: {df['gender'].nunique()}")
# value_counts() returns a multi-line Series: give it its own (correct) label
# and print it on a separate line so the first row is not fused with the label.
print("Gender distribution:")
print(df['gender'].value_counts())
print(f"Countries with medals: {df['country'].nunique()}")
print("Medal distribution:")
print(df['medal_type'].value_counts())

### Display the country with the most medals and its medal count

In [None]:
# Tally rows per country (assumes each row of df is one medal — TODO confirm).
# groupby() sorts its keys, so the Series is ordered by country name.
medal_counts = df.groupby('country').size()

# Quick look at the last four countries in that ordering.
print(medal_counts.tail(4))

# Country holding the largest tally, together with the tally itself.
top_country, top_count = medal_counts.idxmax(), medal_counts.max()

print(f"The country with the most medals is {top_country} with {top_count} medals.")

### Display Top 5 Medal-Winning Countries

In [None]:
# The five countries with the largest medal totals, highest first.
top_5 = medal_counts.sort_values(ascending=False).head(5)

# Draw the bar chart on an explicit Axes rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_5.plot(kind='bar', color='gold', edgecolor='black', ax=ax)

ax.set_title('Top 5 Medal-Winning Countries', fontsize=16)
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Number of Medals', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

### Visualization of the top 5 countries with the most medals, by medal type

In [None]:
# Cross-tabulate the medals: one row per country, one column per medal type,
# each cell holding the number of matching rows (0 where there are none).
medal_pivot = df.pivot_table(index='country', columns='medal_type', aggfunc='size', fill_value=0)

# Keep only the five countries with the highest overall totals.
top_5 = medal_pivot.sum(axis=1).sort_values(ascending=False).head(5)
top_5_medals = medal_pivot.loc[top_5.index]

# Chart configuration: medal column names, their bar colours, and bar geometry.
medal_types = ['Gold Medal', 'Silver Medal', 'Bronze Medal']
colors = ['#FFD700', '#C0C0C0', '#CD7F32']
bar_width = 0.25
x = range(len(top_5_medals))

fig, ax = plt.subplots(figsize=(10, 6))

# One group of bars per country; each medal type is shifted right by one bar width.
for i, (medal, color) in enumerate(zip(medal_types, colors)):
    if medal in top_5_medals.columns:
        heights = top_5_medals[medal]
    else:
        # Medal type absent from the data: draw zero-height bars so the legend stays complete.
        heights = [0] * len(top_5_medals)
    positions = [p + i * bar_width for p in x]
    ax.bar(
        positions,
        heights,
        width=bar_width,
        label=medal.replace(' Medal', ''),
        color=color,
    )

# Labels, ticks (centred under the middle bar of each group), legend and grid.
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Number of Medals', fontsize=12)
ax.set_title('Top 5 Medal-Winning Countries by Medal Type', fontsize=16)
ax.set_xticks([p + bar_width for p in x])
ax.set_xticklabels(top_5_medals.index, rotation=45)
ax.legend(title='Medal Type')
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

### Top 3 Sports where women won the most medals

In [None]:
#| label: celllabel1
# Keep only the rows whose gender code is 'W' (women).
women_df = df[df['gender'] == 'W']

# value_counts() is sorted in descending order, so head(3) yields the three
# disciplines with the most medals.
top_sports = women_df['discipline'].value_counts().head(3)

# Bar chart of those three disciplines on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
top_sports.plot(kind='bar', color='hotpink', edgecolor='black', ax=ax)

ax.set_title('Top 3 Sports Where Women Won the Most Medals', fontsize=14)
ax.set_xlabel('Sport (Discipline)', fontsize=12)
ax.set_ylabel('Number of Medals', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()


### Display the sport with the most gold medals won by women

In [None]:
# Rows that are both women's results (gender code 'W') and gold medals.
gold_women_df = df[(df['gender'] == 'W') & (df['medal_type'] == 'Gold Medal')]

# Number of such gold medals per discipline, largest first.
gold_by_sport = gold_women_df['discipline'].value_counts()

# idxmax() on an empty Series raises an opaque ValueError. Raise the same
# exception type with an actionable message when the filter matches nothing
# (e.g. the dataset uses different gender or medal labels).
if gold_by_sport.empty:
    raise ValueError(
        "No rows match gender == 'W' and medal_type == 'Gold Medal'; "
        "check the gender and medal_type values used in the dataset."
    )

# Discipline with the most women's gold medals, and how many.
top_sport = gold_by_sport.idxmax()
top_count = gold_by_sport.max()

print(f"The sport with the most gold medals won by women is '{top_sport}' with {top_count} gold medals.")

In [None]:
# Prompt template for the Groq analysis request.
# This is a plain triple-quoted string (not an f-string), so `{data_summary}`
# remains a literal placeholder until it is substituted, e.g. with
# `instructions.format(data_summary=...)`, before the prompt is sent.
# NOTE(review): question 5 asks about countries while the template describes
# the data as women's gold-medal counts by sport — confirm the summary that is
# substituted in covers both.
instructions = '''
Analyze the following Olympic medals data showing women's gold medal counts by sport:

{data_summary}

Based on this data:
1. Which sport has the most gold medals won by women?
2. What is the exact count of gold medals for that sport?
3. List the top 3 sports where women won the most gold medals with their counts.
4. Provide any interesting insights about women's performance across these sports.
5. Name top 5 medal winning countries.

Please provide a clear, concise analysis focusing on the sport with the highest women's gold medal count per sport.

'''

In [None]:
# Holds the model's reply; stays an empty string if the request fails.
groq_response = ''

# `instructions` is a template. Fill its {data_summary} placeholder with the
# figures computed earlier in the notebook; sent as-is, the model would only
# see the literal text "{data_summary}" and have no data to analyse.
# The per-country totals are included because the prompt also asks for the
# top 5 medal-winning countries.
data_summary = (
    "Women's gold medals by sport:\n"
    f"{gold_by_sport.to_string()}\n\n"
    "Top 5 countries by total medals:\n"
    f"{medal_counts.sort_values(ascending=False).head(5).to_string()}"
)
prompt = instructions.format(data_summary=data_summary)

try:
    # NOTE(review): Groq retires model ids over time; confirm "llama3-8b-8192"
    # is still served before relying on this cell.
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        model="llama3-8b-8192",
        temperature=0.1,  # Low temperature for factual analysis
        max_tokens=500
    )
    print("=== GROQ API RESPONSE ===")
    groq_response = chat_completion.choices[0].message.content
    print(groq_response)
except groq.APIConnectionError as e:
    print("The server could not be reached")
    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
except groq.RateLimitError:
    print("A 429 status code was received; we should back off a bit.")
except groq.APIStatusError as e:
    # Listed after RateLimitError so the 429 case above is reported with its own message.
    print("Another non-200-range status code was received")
    print(e.status_code)
    print(e.response)

In [None]:
#| label: celllabel2
# Echo the stored reply in its own labelled cell; it is an empty string if the request above failed.
print(groq_response)

### Generate text for the citation.cff file for GitHub

In [None]:
# Ask the model to draft a citation.cff file and print its reply.
# NOTE(review): the prompt carries no details about the repository, so the
# result is presumably a generic template to be edited by hand — confirm.
# NOTE(review): Groq retires model ids over time; confirm "llama3-8b-8192"
# is still served before relying on this cell.
citation_messages = [
    {"role": "user", "content": "Generate a citation.cff file for this repository"},
]
chat_completion = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=citation_messages,
)
print(chat_completion.choices[0].message.content)

In [None]:
# Alternatively, generate it with this online tool: https://citation-file-format.github.io/cff-initializer-javascript/#/