In [None]:
!pip install transformers
!pip install gradio
!pip install matplotlib
!pip install plotly
!pip install pandas
!pip install numpy
!pip install wordcloud
!pip install -q accelerate
!pip install -q requests
!pip install -q protobuf
!pip install -q bitsandbytes
!pip install -q sentencepiece
!pip install -q safetensors

In [None]:
!pip install --upgrade torch

In [None]:
!pip install torchvision

In [None]:
# GitHub Profile Analyzer
# This notebook analyzes GitHub profiles
import os
import requests
import json
import gradio as gr
from datetime import datetime
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
import io
import plotly.express as px
import pandas as pd
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Define GitHub API functions
def get_user_info(username):
    """Fetch GitHub user profile information.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON profile when the API answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the
        failure (non-200 status or a transport-level problem), so callers
        can keep testing ``"error" in result``.
    """
    url = f"https://api.github.com/users/{username}"
    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Report connection/timeout problems through the same error channel
        # as HTTP failures instead of letting the exception escape.
        return {"error": f"Failed to fetch user data: {exc}"}

    if response.status_code == 200:
        return response.json()
    return {"error": f"Failed to fetch user data: {response.status_code}"}

def get_user_repos(username):
    """Fetch GitHub user repositories.

    Requests a single page of up to 100 repositories, most recently
    updated first (``per_page=100&sort=updated``); further pages are not
    followed.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON response when the API answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the
        failure (non-200 status or a transport-level problem).
    """
    url = f"https://api.github.com/users/{username}/repos?per_page=100&sort=updated"
    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Same error channel as HTTP failures, so callers need one check only.
        return {"error": f"Failed to fetch repositories: {exc}"}

    if response.status_code == 200:
        return response.json()
    return {"error": f"Failed to fetch repositories: {response.status_code}"}

def get_repo_languages(repo_url):
    """Fetch languages used in a repository.

    Args:
        repo_url: URL to request; it is used as given.

    Returns:
        The decoded JSON response when the request answers with HTTP 200,
        otherwise an empty dict. Transport-level failures (timeout,
        connection error) also yield an empty dict, matching the
        best-effort behaviour for non-200 responses.
    """
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(repo_url, timeout=10)
    except requests.RequestException:
        return {}

    if response.status_code == 200:
        return response.json()
    return {}

# Load model
def load_model():
    """Load the ``google/gemma-1.1-2b-it`` model and its tokenizer.

    The Hugging Face token is read from the ``HF_TOKEN`` environment
    variable (``None`` when unset) and passed to both ``from_pretrained``
    calls. The model is requested in float16 with ``low_cpu_mem_usage``
    and ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print("Loading model...")

    model_id = "google/gemma-1.1-2b-it"
    hf_token = os.environ.get("HF_TOKEN")

    # Detect Colab by probing for its helper package. Only a failed import
    # means "not in Colab"; anything else (e.g. KeyboardInterrupt) must not
    # be swallowed, hence no bare ``except``.
    try:
        import google.colab  # noqa: F401
    except ImportError:
        is_colab = False
    else:
        is_colab = True

    # First load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

    # Configure model loading with optimizations
    load_config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
        "device_map": "auto",
    }

    if is_colab:
        # Only a notice is printed here; no Colab-specific settings are
        # added to load_config.
        print("Running in Colab - applying additional optimizations")

    # Load the model with optimizations
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        **load_config,
    )

    print("Model loaded successfully!")
    return model, tokenizer

# Function to generate text with the loaded model
def generate_with_model(model, tokenizer, prompt, max_length=2048):
    """Sample a continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text the model should continue.
        max_length: Upper bound passed to ``generate`` as ``max_length``
            (it therefore counts the prompt tokens as well).

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.
    """
    # The model is loaded with device_map="auto", so follow the model's own
    # placement and fall back to the module-level device only when the model
    # does not expose one.
    target_device = getattr(model, "device", device)
    inputs = tokenizer(prompt, return_tensors="pt").to(target_device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate text
    with torch.no_grad():
        generated_ids = model.generate(
            # Unpacking forwards the attention mask together with the ids.
            **inputs,
            max_length=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.15,
        )

    # Drop the prompt at token level. Slicing the decoded string by the
    # decoded prompt's length is fragile because decoding does not always
    # reproduce the prompt text character for character.
    continuation_ids = generated_ids[0][prompt_token_count:]
    return tokenizer.decode(continuation_ids, skip_special_tokens=True)

# Create structured data from GitHub API results
def prepare_github_data(username):
    """Collect profile and repository data for ``username`` into one dict.

    Args:
        username: GitHub login to analyse.

    Returns:
        On success, a dict with two keys:

        * ``"profile"`` - username, name, bio, location, repo/follower
          counts, join date (``"Month YYYY"``), avatar and profile URLs.
        * ``"repositories"`` - ``total_count``, ``top_languages``
          (language -> number of repos), ``top_repos`` (10 most starred),
          ``topic_frequency`` (20 most common topics) and ``repos_info``
          (every kept repo, most starred first).

        On failure, a string of the form ``"Error: ..."``.
    """
    print(f"Fetching data for GitHub user: {username}")

    user_data = get_user_info(username)
    if "error" in user_data:
        return f"Error: {user_data['error']}"

    repos_data = get_user_repos(username)
    if isinstance(repos_data, dict) and "error" in repos_data:
        return f"Error: {repos_data['error']}"

    # Timestamp layout used by the API fields parsed below.
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    # Process repos data
    repos_info = []
    language_counter = Counter()
    topic_counter = Counter()
    for repo in repos_data:
        # Skip forks with fewer than 5 stars; the star count is the only
        # signal used here to decide whether a fork is worth listing.
        if repo.get("fork", False) and repo.get("stargazers_count", 0) < 5:
            continue

        language = repo["language"]
        topics = repo.get("topics") or []

        repos_info.append({
            "name": repo["name"],
            "description": repo.get("description") or "No description provided",
            "stars": repo["stargazers_count"],
            "forks": repo["forks_count"],
            "created_at": datetime.strptime(repo["created_at"], timestamp_format),
            "updated_at": datetime.strptime(repo["updated_at"], timestamp_format),
            "url": repo["html_url"],
            "topics": topics,
            "size": repo["size"],
            "language": language if language else "Not specified",
            "has_issues": repo["has_issues"],
            "open_issues": repo["open_issues_count"],
            "is_fork": repo["fork"],
        })

        # Only repos with a detected language contribute to the language tally.
        if language:
            language_counter[language] += 1
        topic_counter.update(topics)

    # Sort repos by stars
    repos_info.sort(key=lambda entry: entry["stars"], reverse=True)

    # Month and year the account was created, e.g. "March 2015"
    joined_date = datetime.strptime(user_data["created_at"], timestamp_format).strftime("%B %Y")

    # Combine all data
    github_data = {
        "profile": {
            "username": username,
            "name": user_data.get("name") or username,
            "bio": user_data.get("bio") or "No bio provided",
            "location": user_data.get("location") or "Location not specified",
            "public_repos": user_data["public_repos"],
            "followers": user_data["followers"],
            "following": user_data["following"],
            "joined_github": joined_date,
            "avatar_url": user_data["avatar_url"],
            "profile_url": user_data["html_url"],
        },
        "repositories": {
            "total_count": len(repos_info),
            # Plain dict so downstream code sees the same type as before.
            "top_languages": dict(language_counter),
            "top_repos": repos_info[:10],
            "topic_frequency": dict(topic_counter.most_common(20)),
            "repos_info": repos_info,
        },
    }

    return github_data

# Generate prompts for model
def create_analysis_prompt(github_data):
    """Build the text prompt that asks the model to analyse a profile.

    Args:
        github_data: Dict with ``"profile"`` and ``"repositories"``
            sections; the profile fields, ``total_count``,
            ``top_languages`` and ``top_repos`` entries are read from it.

    Returns:
        The full prompt string: profile facts, the five most used
        languages, details of up to five top repositories, and the
        formatting instructions for the answer.
    """
    profile = github_data["profile"]
    repos = github_data["repositories"]

    # Languages ordered by how many repositories use them, most used first
    ranked_languages = sorted(
        repos["top_languages"].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_langs_str = ", ".join(
        f"{language} ({repo_count} repos)"
        for language, repo_count in ranked_languages[:5]
    )

    # One bullet list per repository, for at most the first five entries
    top_repos_details = "".join(
        f"\nRepository #{rank}: {entry['name']}\n"
        f"- Description: {entry['description']}\n"
        f"- Stars: {entry['stars']}\n"
        f"- Forks: {entry['forks']}\n"
        f"- Language: {entry['language'] or 'Not specified'}\n"
        f"- Last updated: {entry['updated_at']}\n"
        for rank, entry in enumerate(repos["top_repos"][:5], 1)
    )

    return f"""You are a GitHub profile analyst. Based on the following GitHub profile data, provide a comprehensive analysis.

Profile Information:
- Username: {profile['username']}
- Name: {profile['name']}
- Bio: {profile['bio']}
- Location: {profile['location']}
- Public Repositories: {profile['public_repos']}
- Followers: {profile['followers']}
- Following: {profile['following']}
- Joined GitHub: {profile['joined_github']}

Repository Information:
- Total Repositories: {repos['total_count']}
- Top Languages: {top_langs_str}

Top Repositories:{top_repos_details}

Your task is to analyze this profile and provide:
🧾 Overview
An overview of the developer's GitHub presence in minumum 2-3 sentences

💻 Main techonologies
Main technologies and skills based on repositories

📊 Development patterns

💪 Strengths
List the standout strengths or qualities — such as code quality, language versatility, open-source involvement, or strong project portfolios.

🛠️ Key Technologies and Skills
Highlight the main programming languages, frameworks, or tools the developer uses, based on repositories and top languages.
Format your analysis in markdown with clear sections.
"""


# Format repository list as markdown with star counts
def format_repo_list(github_data):
    """Render the top repositories as a markdown table with star counts.

    Args:
        github_data: Dict whose ``["repositories"]["top_repos"]`` list holds
            repo dicts with ``name``, ``url``, ``description``, ``stars``
            and ``language`` keys.

    Returns:
        Markdown text: a heading, the table header and one row per
        repository (at most 15 rows).
    """
    header = (
        "## Top Repositories with Star Count\n\n"
        "| Repository | Description | Stars | Language |\n"
        "| ---------- | ----------- | ----- | -------- |\n"
    )

    rows = []
    for entry in github_data["repositories"]["top_repos"][:15]:
        description = entry["description"]
        if description:
            # A literal pipe would otherwise end the table cell early
            description = description.replace("|", "\\|")
        else:
            description = "No description"
        language = entry["language"] or "Not specified"
        rows.append(
            f"| [{entry['name']}]({entry['url']}) | {description} | {entry['stars']} ⭐ | {language} |\n"
        )

    return header + "".join(rows)

# Create visualizations
def word_cloud_output(github_data):
    """Draw a word cloud of repository topics on a matplotlib figure.

    Args:
        github_data: Dict whose ``["repositories"]["topic_frequency"]``
            maps topic name to its count.

    Returns:
        A matplotlib figure. It shows the word cloud when more than three
        distinct topics are available; otherwise it shows a short notice,
        so the caller always receives a figure instead of an exception.
    """
    figure_width_px = 800
    figure_height_px = 400

    topics = github_data["repositories"]["topic_frequency"]

    # Create the single figure that is returned (figsize is in inches).
    fig, ax = plt.subplots(figsize=(figure_width_px / 100, figure_height_px / 100))
    ax.axis('off')

    if topics and len(topics) > 3:
        # Create word cloud
        wordcloud = WordCloud(
            width=800,
            height=500,
            background_color='white',
            colormap='viridis',
            min_font_size=10,
            max_font_size=150,
            random_state=42
        ).generate_from_frequencies(topics)
        ax.imshow(wordcloud, interpolation='bilinear')
    else:
        # Too few topics: no cloud is generated, so draw a notice rather
        # than referencing an image that was never created.
        ax.text(
            0.5,
            0.5,
            "Not enough repository topics to build a word cloud",
            ha='center',
            va='center',
            transform=ax.transAxes,
        )

    fig.tight_layout(pad=0)
    return fig

def bar_chart(github_data):
    """Build a bar chart of star counts for the top repositories.

    Args:
        github_data: Dict whose ``["repositories"]["top_repos"]`` list holds
            repo dicts with at least ``name``, ``stars`` and ``language``.

    Returns:
        A Plotly figure: one bar per repository coloured by language, or an
        empty figure carrying a notice when there are no repositories.
    """
    top_repos_details = github_data["repositories"]["top_repos"]
    title = 'Top 10 Repositories by Stars'

    if not top_repos_details:
        # With no rows there is no 'name'/'stars' column for plotly express
        # to plot, so return a placeholder instead of raising.
        import plotly.graph_objects as go

        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        placeholder.add_annotation(
            text="No repositories to display",
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
        )
        return placeholder

    fig = px.bar(
        top_repos_details,
        x='name',
        y='stars',
        color='language',
        title=title,
        labels={'name': 'Repository Name', 'stars': 'Number of Stars'}
    )
    # Tilt the tick labels by 45 degrees
    fig.update_xaxes(tickangle=45)
    return fig

def pie_chart(github_data):
    """Build a donut chart of the primary language of each repository.

    Args:
        github_data: Dict whose ``["repositories"]["repos_info"]`` list
            holds repo dicts with a ``language`` entry.

    Returns:
        A Plotly figure: the language distribution, or an empty figure
        carrying a notice when there are no repositories.
    """
    repos_info = github_data["repositories"]["repos_info"]
    title = 'Repository Language Distribution'

    if not repos_info:
        # With no rows there is no 'language' column for plotly express to
        # plot, so return a placeholder instead of raising.
        import plotly.graph_objects as go

        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        placeholder.add_annotation(
            text="No repositories to display",
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
        )
        return placeholder

    fig_language = px.pie(
        repos_info,
        names='language',
        title=title,
        hole=0.3,
        color_discrete_sequence=px.colors.qualitative.Bold
    )
    # Put percentage and language name inside each slice
    fig_language.update_traces(textposition='inside', textinfo='percent+label')
    return fig_language

def time_line(github_data):
    """Build a line chart of how many repositories were created per month.

    Args:
        github_data: Dict whose ``["repositories"]["repos_info"]`` list
            holds repo dicts with a datetime ``created_at`` entry.

    Returns:
        A Plotly figure: repository count per ``YYYY-MM`` bucket, or an
        empty figure carrying a notice when there are no repositories.
    """
    repos_info = github_data["repositories"]["repos_info"]
    title = 'Repository Creation Timeline'

    if not repos_info:
        # An empty list yields a DataFrame without a 'created_at' column,
        # so return a placeholder instead of raising KeyError below.
        import plotly.graph_objects as go

        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        placeholder.add_annotation(
            text="No repositories to display",
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
        )
        return placeholder

    df = pd.DataFrame(repos_info)
    # Bucket creation dates by calendar month; 'YYYY-MM' strings sort
    # chronologically.
    df['year_month'] = df['created_at'].dt.strftime('%Y-%m')
    timeline_data = df.groupby('year_month').size().reset_index(name='count')
    timeline_data = timeline_data.sort_values('year_month')

    fig_timeline = px.line(
        timeline_data,
        x='year_month',
        y='count',
        markers=True,
        title=title,
        labels={'year_month': 'Year-Month', 'count': 'Number of Repositories Created'}
    )
    fig_timeline.update_xaxes(tickangle=45)
    return fig_timeline


# Function to analyze GitHub profile using model
def analyze_github_profile(username,model, tokenizer):
    """Run the full analysis for one GitHub user.

    Args:
        username: GitHub login typed by the user; surrounding whitespace is
            ignored.
        model: Language model handed to ``generate_with_model``.
        tokenizer: Tokenizer handed to ``generate_with_model``.

    Returns:
        On success, a 5-tuple ``(markdown_report, word_cloud_figure,
        bar_chart_figure, pie_chart_figure, time_line_figure)``.
        When the username is empty or the data could not be fetched, a
        single message string.
    """
    # Normalise first: a padded name would produce a bad API URL, and a
    # whitespace-only one would trigger a pointless request.
    username = (username or "").strip()
    if not username:
        return "Please enter a GitHub username."

    # Fetch and prepare data
    github_data = prepare_github_data(username)
    if isinstance(github_data, str):  # Error message
        return github_data

    # Create prompt for analysis
    prompt = create_analysis_prompt(github_data)

    # Generate analysis using model
    print("Generating analysis with Model...")
    analysis = generate_with_model(model, tokenizer, prompt)
    repo_table = format_repo_list(github_data)

    # Build the four figures shown next to the report
    word_cloud_output_figure = word_cloud_output(github_data)
    bar_chart_figure = bar_chart(github_data)
    pie_chart_figure = pie_chart(github_data)
    time_line_figure = time_line(github_data)

    # Format the final output
    profile = github_data["profile"]
    profile_url = profile["profile_url"]
    avatar_url = profile["avatar_url"]

    # Prepare the response with markdown
    response = f"""# GitHub Profile Analysis for [{profile["username"]}]({profile_url})

![Profile Avatar]({avatar_url})
{analysis}
{repo_table}

---
*Analysis generated using model*
"""
    return response, word_cloud_output_figure, bar_chart_figure, pie_chart_figure, time_line_figure

def create_gradio_interface(model, tokenizer):
    """Assemble the Gradio UI for the analyzer.

    The page has a username textbox, an "Analyze" button, a markdown area
    for the report and four plot areas (word cloud, bar chart, language
    distribution, creation timeline).

    Args:
        model: Language model forwarded to ``analyze_github_profile``.
        tokenizer: Tokenizer forwarded to ``analyze_github_profile``.

    Returns:
        The ``gr.Blocks`` interface, not yet launched.
    """
    def analyze_profile(username):
        """Click handler: always return one value per output component."""
        result = analyze_github_profile(username,model, tokenizer)
        if isinstance(result, str):
            # Error paths give back only a message, but the click event is
            # wired to five outputs; show the message and leave the four
            # plots empty.
            return result, None, None, None, None
        return result

    with gr.Blocks(css=".orange-button { background-color: orange !important; color: white !important; }",title="GitHub Profile Analyzer") as iface:
        gr.Markdown("## GitHub Profile Analyzer \n Enter a GitHub username to analyze their profile and repositories")

        with gr.Column():
            username_input = gr.Textbox(lines=1, placeholder="Enter GitHub username", label="GitHub Username")
            analyze_button = gr.Button("Analyze",elem_classes="orange-button")

        output = gr.Markdown(label="Analysis")

        # First row of plots: topic word cloud next to the star bar chart
        with gr.Row():
            with gr.Column():
                word_cloud = gr.Plot(label="Word Cloud")
            with gr.Column():
                bar_chart_figure = gr.Plot(label="Bar Chart")

        # Second row of plots: language share next to the creation timeline
        with gr.Row():
            with gr.Column():
                pie_chart_figure = gr.Plot(label="Language Distribution")
            with gr.Column():
                time_line_figure = gr.Plot(label="Repository Creation Timeline")

        analyze_button.click(fn=analyze_profile, inputs=username_input, outputs=[output, word_cloud, bar_chart_figure, pie_chart_figure, time_line_figure])

    return iface

# Main execution
if __name__ == "__main__":
    print("Starting GitHub Analyzer...")
    print("This will download and load the model (this might take a few minutes)...")

    # HF_TOKEN is needed for gated models; only print a hint when it is
    # missing, loading is still attempted afterwards.
    if os.environ.get("HF_TOKEN") is None:
        for hint in (
            "Note: HF_TOKEN is not set. You may need to set it to access the models.",
            "Run the following in a previous cell:",
            "import os",
            "os.environ['HF_TOKEN'] = 'your_huggingface_token_here'",
        ):
            print(hint)

    # Step 1: load the model and tokenizer
    model, tokenizer = load_model()

    # Step 2: build the Gradio interface and launch it
    iface = create_gradio_interface(model, tokenizer)
    iface.launch(share=True, debug=True)  # share=True creates a public link