In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder that is scanned for sub-directories further down the notebook.
# Set the LEETCODE_PROBLEMS_DIR environment variable to point somewhere else;
# when it is unset the original local checkout path is used unchanged.
sourcePath = os.environ.get("LEETCODE_PROBLEMS_DIR", r"C:\coding\leetcode-company-wise-problems")

In [3]:
# Number of leading directory entries dropped before processing.
# NOTE(review): presumably these are non-company folders; os.listdir returns
# entries in arbitrary order, so "the first four" is only stable when the
# platform lists them consistently — confirm which entries are meant.
SKIP_LEADING_DIRS = 4

# Collect every sub-directory of sourcePath. os.path.join is used instead of a
# hand-written "\\" separator so the paths are also valid outside Windows.
dirList = [
    entry_path
    for entry_path in (os.path.join(sourcePath, entry) for entry in os.listdir(sourcePath))
    if os.path.isdir(entry_path)
]
del dirList[:SKIP_LEADING_DIRS]
dirList[0]

'C:\\coding\\leetcode-company-wise-problems\\Accenture'

In [4]:
def processCompanyProblems(dirList):
    """
    Load the "5. All.csv" file of every company directory into one DataFrame.

    Parameters:
        dirList: iterable of directory paths; the last path component is used
            as the company name.

    Returns:
        A DataFrame with the rows of every CSV that was found, plus a
        'Company' column holding the directory name. An empty DataFrame is
        returned when no CSV could be loaded.

    Directories without a "5. All.csv" file are reported and skipped.
    """
    allData = []
    for path in dirList:
        companyName = os.path.basename(path)
        # os.path.join keeps the path valid on every platform.
        csvPath = os.path.join(path, "5. All.csv")
        try:
            tempDf = pd.read_csv(csvPath)
        except FileNotFoundError:
            print(f"File not found for {companyName}")
            continue
        tempDf['Company'] = companyName
        allData.append(tempDf)
        print(f"Processed {companyName}: {len(tempDf)} problems")

    if not allData:
        # Was pd.Dataframe(), which raises AttributeError instead of returning.
        return pd.DataFrame()

    combinedDf = pd.concat(allData, axis=0, ignore_index=True)
    print(f"Total problems before dedup: {len(combinedDf)}")
    return combinedDf

In [5]:
def create_final_result_with_difficulty_sort(combined_df):
    """
    Create final result with proper difficulty sorting (Easy -> Medium -> Hard)

    Each unique problem (by 'Title') is expanded into one row per topic listed
    in its comma-separated 'Topics' value. Problems without topics are skipped.

    Parameters:
        combined_df: DataFrame with the columns 'Title', 'Company', 'Topics',
            'Difficulty', 'Frequency', 'Acceptance Rate' and 'Link'.

    Returns:
        DataFrame with the columns Problem_Type, Companies, Company_List,
        Company_Count, Title, Difficulty, Frequency, Acceptance_Rate and Link,
        sorted by Problem_Type (ascending), difficulty (Easy, Medium, Hard;
        unrecognised values last), Company_Count (descending) and Frequency
        (descending). An empty DataFrame with those columns is returned when
        the input is empty or no problem has topics.
    """
    output_columns = ['Problem_Type', 'Companies', 'Company_List', 'Company_Count',
                      'Title', 'Difficulty', 'Frequency', 'Acceptance_Rate', 'Link']

    # Nothing to group: avoid KeyError on a frame that may not even have columns.
    if combined_df.empty:
        return pd.DataFrame(columns=output_columns)

    # Group by Title to get the alphabetically sorted companies of each problem
    problem_companies = (
        combined_df.groupby('Title')['Company']
        .apply(lambda names: sorted(set(names)))
        .reset_index()
    )
    problem_companies.columns = ['Title', 'Companies']

    # First occurrence of each title supplies the problem details
    unique_problems = combined_df.drop_duplicates(subset=['Title']).merge(
        problem_companies, on='Title', how='left'
    )

    # Difficulty ranking; matching ignores case and surrounding whitespace so
    # 'Easy' and 'EASY' rank the same. Unknown values become NaN (sorted last).
    difficulty_order = {'EASY': 1, 'MEDIUM': 2, 'HARD': 3}
    unique_problems['difficulty_rank'] = unique_problems['Difficulty'].map(
        lambda level: difficulty_order.get(str(level).strip().upper(), float('nan'))
    )

    # Expand every problem into one row per topic
    final_rows = []
    for row in unique_problems.to_dict('records'):
        if pd.isna(row['Topics']):
            continue
        for topic in row['Topics'].split(','):
            final_rows.append({
                'Problem_Type': topic.strip(),
                'Companies': ', '.join(row['Companies']),  # Join as string
                'Company_List': row['Companies'],  # Keep as list for analysis
                'Company_Count': len(row['Companies']),
                'Title': row['Title'],
                'Difficulty': row['Difficulty'],
                'difficulty_rank': row['difficulty_rank'],  # Helper for sorting
                'Frequency': row['Frequency'],
                'Acceptance_Rate': row['Acceptance Rate'],
                'Link': row['Link']
            })

    # pd.DataFrame([]) has no columns, so sorting it would raise KeyError.
    if not final_rows:
        return pd.DataFrame(columns=output_columns)

    finalDf = pd.DataFrame(final_rows).sort_values(
        ['Problem_Type', 'difficulty_rank', 'Company_Count', 'Frequency'],
        ascending=[True, True, False, False]
    )

    # Remove the helper column
    return finalDf.drop(columns='difficulty_rank')

In [6]:
print("Processing company directories...")
combined_df = processCompanyProblems(dirList)

# Single place for the export file name so the write and the message agree.
output_csv = 'leetcode_problems_by_type_and_company.csv'

if combined_df.empty:
    # Previously this case produced no output at all.
    print("No company problem files were loaded; nothing to export.")
else:
    print("Creating final result...")
    final_result = create_final_result_with_difficulty_sort(combined_df)

    print(f"Final result: {len(final_result)} rows")

    # Display sample results
    print("\nSample results:")
    print(final_result.head(10))

    # Save to CSV
    final_result.to_csv(output_csv, index=False)
    print(f"Results saved to '{output_csv}'")

Processing company directories...
Processed Accenture: 100 problems
Processed Accolite: 17 problems
Processed Acko: 4 problems
Processed Activision: 3 problems
Processed Adobe: 100 problems
Processed Affirm: 11 problems
Processed Agoda: 40 problems
Processed Airbnb: 58 problems
Processed Airbus SE: 5 problems
Processed Airtel: 6 problems
Processed Airwallex: 1 problems
Processed Akamai: 7 problems
Processed Akuna Capital: 23 problems
Processed Alibaba: 5 problems
Processed Altimetrik: 4 problems
Processed Amadeus: 4 problems
Processed Amazon: 100 problems
Processed AMD: 11 problems
Processed Amdocs: 7 problems
Processed American Express: 23 problems
Processed Analytics quotient: 2 problems
Processed Anduril: 32 problems
Processed Aon: 4 problems
Processed Apollo.io: 4 problems
Processed AppDynamics: 2 problems
Processed AppFolio: 4 problems
Processed Apple: 100 problems
Processed Applied Intuition: 12 problems
Processed AQR Capital Management: 3 problems
Processed Arcesium: 20 problems

NameError: name 'createFinalResult' is not defined

In [None]:
# Alternative version with better formatting for companies
def create_final_result_formatted(combined_df):
    """
    Create final result with better formatting

    Each unique problem (by 'Title') is expanded into one row per topic listed
    in its comma-separated 'Topics' value. Problems without topics are skipped.

    Parameters:
        combined_df: DataFrame with the columns 'Title', 'Company', 'Topics',
            'Difficulty', 'Frequency', 'Acceptance Rate' and 'Link'.

    Returns:
        DataFrame with the columns Problem_Type, Companies, Company_List,
        Company_Count, Title, Difficulty, Frequency, Acceptance_Rate and Link,
        sorted by Problem_Type (ascending), then Company_Count and Frequency
        (both descending). An empty DataFrame with those columns is returned
        when the input is empty or no problem has topics.
    """
    result_columns = ['Problem_Type', 'Companies', 'Company_List', 'Company_Count',
                      'Title', 'Difficulty', 'Frequency', 'Acceptance_Rate', 'Link']

    # Nothing to group: avoid KeyError on a frame that may not even have columns.
    if combined_df.empty:
        return pd.DataFrame(columns=result_columns)

    # Group by Title to get the alphabetically sorted companies of each problem
    problem_companies = (
        combined_df.groupby('Title')['Company']
        .apply(lambda names: sorted(set(names)))
        .reset_index()
    )
    problem_companies.columns = ['Title', 'Companies']

    # First occurrence of each title supplies the problem details
    unique_problems = combined_df.drop_duplicates(subset=['Title']).merge(
        problem_companies, on='Title', how='left'
    )

    # Expand every problem into one row per topic
    final_rows = []
    for _, row in unique_problems.iterrows():
        if pd.isna(row['Topics']):
            continue
        companies = row['Companies']
        for topic in row['Topics'].split(','):
            final_rows.append({
                'Problem_Type': topic.strip(),
                'Companies': ', '.join(companies),  # Join as string
                'Company_List': companies,  # Keep as list for analysis
                'Company_Count': len(companies),
                'Title': row['Title'],
                'Difficulty': row['Difficulty'],
                'Frequency': row['Frequency'],
                'Acceptance_Rate': row['Acceptance Rate'],
                'Link': row['Link']
            })

    # pd.DataFrame([]) has no columns, so sorting it would raise KeyError.
    if not final_rows:
        return pd.DataFrame(columns=result_columns)

    # Sort by Problem Type, then by Company Count and Frequency (descending)
    return pd.DataFrame(final_rows).sort_values(
        ['Problem_Type', 'Company_Count', 'Frequency'],
        ascending=[True, False, False]
    )

# Usage with formatted version: build the formatted table from combined_df,
# which is produced by the processCompanyProblems(dirList) call in an earlier cell.
final_result_formatted = create_final_result_formatted(combined_df)

In [None]:
# Analysis functions
def analyze_results(final_df):
    """
    Analyze the final results

    Prints the ten most frequent problem types, the ten problems used by the
    most companies, and overall totals.

    Parameters:
        final_df: DataFrame with one row per (problem, topic) pair and the
            columns 'Problem_Type', 'Title', 'Company_Count', 'Companies' and
            'Company_List' (an iterable of company names per row).

    Returns:
        None; everything is written to standard output.
    """
    print("=== ANALYSIS ===")

    if final_df.empty:
        print("\nNo results to analyze.")
        return

    # Top problem types by count
    print("\nTop Problem Types:")
    problem_type_counts = final_df['Problem_Type'].value_counts().head(10)
    print(problem_type_counts)

    # Problems used by most companies.
    # final_df repeats each problem once per topic, so collapse to one row per
    # title first; otherwise one popular problem fills several top-10 slots.
    print("\nProblems used by most companies:")
    per_problem = (
        final_df.groupby('Title', sort=False)
        .agg(
            Problem_Type=('Problem_Type', lambda topics: ', '.join(topics)),
            Company_Count=('Company_Count', 'max'),
            Companies=('Companies', 'first'),
        )
        .reset_index()
    )
    most_popular = per_problem.nlargest(10, 'Company_Count')[
        ['Title', 'Problem_Type', 'Company_Count', 'Companies']
    ]
    print(most_popular)

    # Company statistics
    unique_companies = {comp for comp_list in final_df['Company_List'] for comp in comp_list}
    print(f"\nTotal unique companies: {len(unique_companies)}")
    print(f"Total unique problems: {final_df['Title'].nunique()}")
    print(f"Total problem-type combinations: {len(final_df)}")

# Run analysis: print the summary statistics for the formatted result table.
analyze_results(final_result_formatted)

In [None]:
# Filter specific results
def filter_by_problem_type(final_df, problem_type):
    """
    Return the rows of final_df whose 'Problem_Type' equals problem_type.

    The selection is a new DataFrame ordered by 'Company_Count' and then
    'Frequency', both descending; final_df itself is left untouched.
    """
    type_mask = final_df['Problem_Type'] == problem_type
    selection = final_df.loc[type_mask].copy()
    ordered = selection.sort_values(
        by=['Company_Count', 'Frequency'],
        ascending=[False, False],
    )
    return ordered

# Example: pull every 'Array' problem and preview the key columns
array_problems = filter_by_problem_type(final_result_formatted, 'Array')
preview_columns = ['Title', 'Companies', 'Company_Count', 'Frequency', 'Acceptance_Rate']
array_preview = array_problems[preview_columns].head()
print("Array Problems:")
print(array_preview)