# Creating Random Datasets with Required Fields

In [2]:
import os
import random

import pandas as pd

### Defining the lists of values to sample from

In [19]:
# Pool of fictional company names; the generator cell picks one per row and
# also derives the e-mail domain from it.
company_random = [
    "Apex Solutions", "Bright Future Enterprises", "Summit Innovations",
    "Quantum Tech", "Stellar Networks", "Infinite Horizons",
    "Nexus Industries", "Titan Corp", "Skybound Ventures",
    "Horizon Dynamics", "Pinnacle Systems", "Velocity Ventures",
    "Global Edge Solutions", "Blue Wave Technologies", "Stratosphere Enterprises",
    "Alpha Omega Co.", "Radiant Ventures", "NextGen Solutions",
    "Fusion Enterprises", "OmniTech Industries", "Vertex Innovations",
    "SolarFlare Tech", "Cascade Dynamics", "Visionary Ventures",
    "Zenith Enterprises",
]

# Frequency labels to sample from. The spellings (including 'Quaterly' and
# 'Every Two Month') are kept verbatim because they end up as data values.
frequency_random = [
    'One Time', 'Daily', 'Every Two Month', 'Half Yearly', 'Monthly',
    'Monthly Twice', 'Quaterly', 'Weekly Thrice', 'Weekly Twice', 'Yearly',
]

# First names to sample from; the generator cell lower-cases the chosen name to
# form the local part of the e-mail address.
names = [
    "Ava", "Liam", "Emma", "Noah", "Olivia", "William", "Sophia", "James",
    "Isabella", "Benjamin", "Mia", "Lucas", "Amelia", "Mason", "Harper", "Elijah",
    "Evelyn", "Oliver", "Abigail", "Jacob", "Charlotte", "Alexander", "Scarlett", "Michael",
    "Aria", "Daniel", "Ella", "Henry", "Grace", "Jackson", "Lily", "Sebastian",
    "Zoey", "Aiden", "Hannah", "Matthew", "Chloe",
]

# Revenue figures to sample from. They are stored as strings, so the resulting
# 'revenue' column has object dtype rather than a numeric one.
revenue_random = [
    '45120', '58230', '32450', '67890', '74320', '21900', '89450',
    '52780', '63510', '39840', '94260', '47670', '526',
]

# Country names used for the 'region' field.
region_random = [
    'United States', 'China', 'India', 'Brazil', 'Japan',
    'Germany', 'United Kingdom', 'France', 'Canada', 'Australia',
]

# Industry labels used for the 'industry' field.
industry_random = [
    "Business Intelligence", "Finance and Banking", "Healthcare",
    "Retail and E-commerce", "Telecommunications", "Manufacturing",
    "Marketing and Advertising", "Education",
    "Transportation and Logistics", "Energy and Utilities",
]

# Integer values used for the 'company_size' field.
company_size_random = [50, 100, 500, 1000, 10000, 50000, 100000]

In [25]:
import random

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset.
SEED = 42
N_ROWS = 10_000

# Private generator: seeding it does not touch the global `random` state.
rng = random.Random(SEED)

# Draw 500 distinct spider ids out of the 501 integers 4000..4500 (inclusive).
spider_numbers = rng.sample(range(4000, 4501), 500)

# Initialize list to hold dictionaries
data_list = []

# Build one record (dict) per row.
for i in range(N_ROWS):
    name = rng.choice(names)
    company = rng.choice(company_random)
    frequency = rng.choice(frequency_random)
    spider = spider_numbers[i % len(spider_numbers)]  # Cycle through spider_numbers
    revenue = rng.choice(revenue_random)
    region = rng.choice(region_random)
    industry = rng.choice(industry_random)
    company_size = rng.choice(company_size_random)

    # Keep only letters and digits in the domain label. Removing just the spaces
    # turned "Alpha Omega Co." into the invalid domain "alphaomegaco..com".
    domain = ''.join(ch for ch in company.lower() if ch.isalnum())
    email = f"{name.lower()}@{domain}.com"

    data_dict = {
        "email": email,
        "company": company,
        "frequency": frequency,
        "spider": spider,
        "revenue": revenue,
        "region": region,
        "industry": industry,
        "company_size": company_size
    }

    data_list.append(data_dict)


In [26]:
# Turn the generated records into a DataFrame: one row per dict, one column per key.
df = pd.DataFrame(data=data_list)

# Print column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   email         10000 non-null  object
 1   company       10000 non-null  object
 2   frequency     10000 non-null  object
 3   spider        10000 non-null  int64 
 4   revenue       10000 non-null  object
 5   region        10000 non-null  object
 6   industry      10000 non-null  object
 7   company_size  10000 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 625.1+ KB


In [27]:
# Preview the first five generated rows.
df.head()

Unnamed: 0,email,company,frequency,spider,revenue,region,industry,company_size
0,oliver@vertexinnovations.com,Vertex Innovations,Weekly Twice,4167,21900,China,Energy and Utilities,500
1,isabella@fusionenterprises.com,Fusion Enterprises,Yearly,4453,63510,Canada,Finance and Banking,50
2,scarlett@globaledgesolutions.com,Global Edge Solutions,Every Two Month,4138,63510,Brazil,Finance and Banking,100
3,ava@brightfutureenterprises.com,Bright Future Enterprises,Every Two Month,4284,94260,United States,Healthcare,500
4,evelyn@omnitechindustries.com,OmniTech Industries,Quaterly,4403,74320,United Kingdom,Finance and Banking,50


# Data Processing

In [4]:
from opencage.geocoder import OpenCageGeocode
import re

In [None]:
# Load your dataset
df = pd.read_csv('your_dataset.csv')

# Name of the column whose values are sent to the geocoder.
address_column = 'street_address'

# Initialize the OpenCage geocoder with your API key.
# The key is read from the OPENCAGE_API_KEY environment variable so that it
# never has to be pasted into (and committed with) the notebook. The original
# placeholder is kept as the fallback for backward compatibility.
api_key = os.environ.get('OPENCAGE_API_KEY', 'YOUR_OPENCAGE_API_KEY')
geocoder = OpenCageGeocode(api_key)

# Function to get country from an address
def get_country(address):
    """Return the country the geocoder reports for ``address``, or None.

    Parameters
    ----------
    address : str or missing value
        Free-text address taken from one cell of the address column.

    Returns
    -------
    str or None
        The ``'country'`` entry of the first result's ``'components'`` mapping.
        None when the address is missing or blank, when the geocoder returns no
        results, when the first result has no country, or when the lookup raises.
    """
    # Missing or blank cells cannot be geocoded: skip them without spending an
    # API request or printing an error line for every empty row.
    if pd.isna(address) or (isinstance(address, str) and not address.strip()):
        return None

    try:
        results = geocoder.geocode(address)
        if results:
            return results[0]['components'].get('country')
        return None
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole column.
        print(f"Error: {e}")
        return None

# Geocode every value of the address column, one get_country() call per row,
# and store the returned country (or None) in a new 'country' column.
df['country'] = df[address_column].apply(get_country)







# One count: plain digits or digits with thousands separators ('1,001').
_EMPLOYEE_NUMBER = r'(\d{1,3}(?:,\d{3})+|\d+)'

# Either a range ('1K-5K', '501-1K', '51-200', '1,001-5,000') or a single figure
# ('10K+', '10,001+', '500'). The K suffix is matched case-insensitively.
_EMPLOYEE_PATTERN = re.compile(
    _EMPLOYEE_NUMBER + r'(K?)\s*-\s*' + _EMPLOYEE_NUMBER + r'?(K?)'
    + r'|' + _EMPLOYEE_NUMBER + r'(K?)(?:\+|\s|$)',
    re.IGNORECASE,
)


def _scaled_count(digits, k_suffix):
    """Convert captured digits (commas allowed) and an optional K suffix to an int."""
    return int(digits.replace(',', '')) * (1000 if k_suffix else 1)


# Function to parse the employee range and return an approximate number
def parse_employees(value):
    """Turn an employee-count label into a single approximate integer.

    Parameters
    ----------
    value : str, number or missing value
        Labels such as '1K-5K', '501-1K', '10K+', '51-200', '1,001-5,000' or
        '500'. Surrounding whitespace and the case of the K suffix are ignored.
        A numeric cell is treated as the count itself.

    Returns
    -------
    int or None
        For a range, the larger of the two ends; for a single figure, that
        figure. None for missing values and for text that matches no pattern.
    """
    if pd.isna(value):  # Check for None or NaN
        return None

    if not isinstance(value, str):
        # A column pandas parsed as numeric already holds head-counts; running
        # the regex on it would raise TypeError.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    match = _EMPLOYEE_PATTERN.match(value.strip())
    if match is None:
        # If no pattern matched, return None
        return None

    low, low_k, high, high_k, single, single_k = match.groups()

    if single:  # Handle '10K+' and bare-number cases
        return _scaled_count(single, single_k)

    low_value = _scaled_count(low, low_k)
    # An open-ended range such as '5K-' has no upper bound; treat it as 0 so
    # the lower bound wins below.
    high_value = _scaled_count(high, high_k) if high else 0

    # Return the higher end of the range as an estimate
    return max(low_value, high_value)

# Derive a 'mapped_employees' column by running parse_employees() on every
# value of the raw 'Number Of Employees' column.
df['mapped_employees'] = df['Number Of Employees'].apply(parse_employees)





# A number (optional thousands separators and decimals), an optional K/M scale,
# then 'follower' or 'followers'. Matched case-insensitively.
_FOLLOWERS_PATTERN = re.compile(
    r'((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)\s*([KM])?\s*followers?',
    re.IGNORECASE,
)

# Multiplier for each supported scale suffix.
_FOLLOWER_SCALES = {'K': 1000, 'M': 1000000}


# Function to parse the follower count
def parse_followers(value):
    """Turn a follower label into an integer count.

    Parameters
    ----------
    value : str, number or missing value
        Labels such as '3K followers', '384 followers', '1.5K followers',
        '2M followers', '1,234 followers' or '1 follower'. Surrounding
        whitespace and letter case are ignored. A numeric cell is treated as
        the count itself.

    Returns
    -------
    int or None
        The follower count, or None for missing values and for text that does
        not match the pattern.
    """
    if pd.isna(value):  # Check for None or NaN
        return None

    if not isinstance(value, str):
        # Numeric cells are already counts; the regex would raise TypeError.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    match = _FOLLOWERS_PATTERN.match(value.strip())
    if match is None:
        # If no pattern matched, return None
        return None

    number_text, scale = match.groups()
    number = float(number_text.replace(',', ''))  # float for cases like '1.5K'

    if scale:
        # round(), not int(): 2.01 * 1000 evaluates to 2009.9999999999998 in
        # binary floating point, which int() would truncate to 2009.
        return round(number * _FOLLOWER_SCALES[scale.upper()])

    return int(number)

# Derive a 'mapped_followers' column by running parse_followers() on every
# value of the raw 'Followers' column.
df['mapped_followers'] = df['Followers'].apply(parse_followers)




# Mapping dictionary.
# Written category-first (each category lists the raw industry labels it
# absorbs) and then inverted, so that `industry_mapping` maps
# raw label -> category.
_industries_by_category = {
    'Advertising and Marketing': [
        'Advertising Services',
        'Business Content',
    ],
    'Aviation and Transportation': [
        'Airlines and Aviation',
        'Maritime',
        'Transportation, Logistics, Supply Chain and Storage',
        'Truck Transportation',
    ],
    'Consumer Goods and Retail': [
        'Appliances, Electrical, and Electronics Manufacturing',
        'Automotive',
        'Consumer Goods',
        'Retail',
        'Retail Apparel and Fashion',
        'Wholesale',
        'Wholesale Building Materials',
    ],
    'Education and Training': [
        'Education',
        'Wellness and Fitness Services',
    ],
    'Healthcare and Medical': [
        'Health, Wellness & Fitness',
        'Hospitals and Health Care',
        'Medical Equipment Manufacturing',
        'Medical Practices',
    ],
    'Hospitality and Travel': [
        'Food and Beverage Services',
        'Hospitality',
        'Travel Arrangements',
    ],
    'IT and Technology': [
        'Computer Hardware Manufacturing',
        'IT Services and IT Consulting',
        'Software Development',
        'Technology, Information and Internet',
    ],
    'Manufacturing': [
        'Engines and Power Transmission Equipment Manufacturing',
        'Industrial Machinery Manufacturing',
        'Machinery Manufacturing',
        'Manufacturing',
        'Textile Manufacturing',
    ],
    'Media and Publishing': [
        'Book and Periodical Publishing',
        'Newspaper Publishing',
        'Performing Arts',
    ],
    'Professional Services': [
        'Business Consulting and Services',
        'Law Practice',
        'Legal Services',
        'Staffing and Recruiting',
    ],
    'Real Estate': [
        'Real Estate',
        'Real Estate Agents and Brokers',
    ],
    'Non-Profit and Government': [
        'Government Administration',
        'Non-profit Organizations',
    ],
    'Entertainment': [
        'Entertainment Providers',
    ],
}

industry_mapping = {
    industry: category
    for category, industries in _industries_by_category.items()
    for industry in industries
}

# Look up each raw 'Industry' label in industry_mapping and store the result in
# a new 'Category' column. Series.map leaves labels that are not keys of the
# dict as NaN.
df['Category'] = df['Industry'].map(industry_mapping)