In [132]:
import pandas as pd

## Part 1: Percentages of Keywords by Industry

In [135]:
postings_df = pd.read_csv('/Users/sabrinaortiz/Desktop/dataProjects/Final_Project/project_data/postings.csv')
industries_df = pd.read_csv('/Users/sabrinaortiz/Desktop/dataProjects/Final_Project/project_data/companies/company_industries.csv')

In [137]:
# checking data types of job postings data
postings_df.dtypes

job_id                          int64
company_name                   object
title                          object
description                    object
max_salary                    float64
pay_period                     object
location                       object
company_id                    float64
views                         float64
med_salary                    float64
min_salary                    float64
formatted_work_type            object
applies                       float64
original_listed_time          float64
remote_allowed                float64
job_posting_url                object
application_url                object
application_type               object
expiry                        float64
closed_time                   float64
formatted_experience_level     object
skills_desc                    object
listed_time                   float64
posting_domain                 object
sponsored                       int64
work_type                      object
currency    

In [139]:
# dropping any NAs in the description column
postings_df = postings_df.dropna(subset=['description'])

In [141]:
# check for any duplicates in the industries df before merging it to the postings df
industries_df['company_id'].value_counts()

company_id
1687254     2
2471716     2
69192017    2
27292       2
2831596     2
           ..
37067372    1
33267093    1
9183828     1
165970      1
8060959     1
Name: count, Length: 24365, dtype: int64

In [143]:
# dropping duplicates since some company IDs have 2 industries
industries_df = industries_df.drop_duplicates(subset=['company_id'])

In [145]:
# merging the industries into the larger postings dataset
postings_df = pd.merge(postings_df, industries_df, how='left', on='company_id')

In [147]:
# creating a column to identify whether a job description contains keywords related to AI
## having an issue with filtering based on case sensitivity
postings_df['Mentions_AI'] = postings_df['description'].str.contains(
    r'\bAI\b|\bmachine learning\b|\bartificial intelligence\b|\bgenerative AI\b|\bdeep learning\b|\blarge language model\b',
    case=False,
    na=False
)

In [149]:
# checking to see how many posts were identified to include keywords
postings_df['Mentions_AI'].value_counts()

Mentions_AI
False    119465
True       4377
Name: count, dtype: int64

In [151]:
postings_df.head(5)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips,industry,Mentions_AI
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0,Real Estate,False
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,,False
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0,Restaurants,False
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0,Law Practice,False
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0,,False


In [153]:
# creating a new df that shows the percentage of job descriptions in each industry group that mention one of the AI keywords
industry_percentages_df = postings_df.groupby('industry')['Mentions_AI'].agg(
                        percent=lambda x: x.mean() * 100,
                        count='count'
                        )

In [155]:
# outputting the df to check the percentages
industry_percentages_df.to_csv('/Users/sabrinaortiz/Desktop/dataProjects/Final_Project/project_data/industry_percentages.csv', index='False')

## Part 2: Creating sample dataset equivalent to 1% 

In [163]:
true_mentions_ai = postings_df[postings_df['Mentions_AI'] == True]

In [165]:
sample_postings_df = true_mentions_ai.sample(n=216, random_state=1)

In [167]:
sample_postings_df.to_csv('/Users/sabrinaortiz/Desktop/dataProjects/Final_Project/project_data/sample_of_posting_data.csv', index='False')