# Assignment 1

## Part 1: Extract Data from Habr Career


### 1. Install scrapy

In [1]:
!pip install scrapy



### 2. Create project

In [None]:
!scrapy startproject ощиы

Error: scrapy.cfg already exists in /Users/polinakorobeinikova/IU/Data Wrangling and Visualisation/Assignment 1/1 Part/highest_grossing_films


### 3. Create Spider 
Create *parse_jobs.py* for parsing data in the directory *jobs/jobs/spiders* with the code below

In [None]:
import scrapy
import uuid
from urllib.parse import urljoin
from scrapy.http import Request

class HabrSpider(scrapy.Spider):
    """Crawl Habr Career vacancy listings and emit one item per vacancy.

    Listing pages are parsed for vacancy cards; the card supplies the company,
    title, work format and skill tags. Each vacancy page is then requested so
    the emitted item carries the final vacancy URL. Crawling stops scheduling
    new vacancies once ``max_jobs`` cards have been queued.
    """

    name = 'parse_jobs'
    allowed_domains = ['career.habr.com']
    start_urls = [
        'https://career.habr.com/vacancies?s[]=2&s[]=3&s[]=4&s[]=82&s[]=72&s[]=5&s[]=75&s[]=6&s[]=1&s[]=77&s[]=7&s[]=83&s[]=84&s[]=73&s[]=8&s[]=85&s[]=86&s[]=188&s[]=178&s[]=106&s[]=78&s[]=21&s[]=172&s[]=174&s[]=79&s[]=173&s[]=80&s[]=176&s[]=81&s[]=118&s[]=182&s[]=44&s[]=125&s[]=177&s[]=175&s[]=126&s[]=98&s[]=41&s[]=42&s[]=99&s[]=168&s[]=43&s[]=76&s[]=96&s[]=97&s[]=95&s[]=100&s[]=133&s[]=111&type=all'
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'FEEDS': {
            'habr_jobs.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4,
            }
        },
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        },
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [403, 429],
        'ROBOTSTXT_OBEY': False,  # Check Habr's terms
    }
    # Number of vacancy cards queued so far (incremented per spider instance).
    job_count = 0
    # Upper bound on the number of vacancies to queue.
    max_jobs = 1500
    # Site root used to resolve the relative links found on listing pages.
    base_url = 'https://career.habr.com'
    # Status codes treated as "blocked / throttled" by the listing callback.
    blocked_statuses = (403, 429)

    def _listing_request(self, url):
        """Build a request for a listing page handled by ``parse``.

        Scrapy's HttpError spider middleware discards non-2xx responses before
        they reach a callback unless the status is allow-listed, so the
        blocked statuses are allowed here; otherwise the status check in
        ``parse`` could never run.
        """
        return Request(
            url,
            callback=self.parse,
            meta={
                'dont_redirect': False,
                'handle_httpstatus_list': list(self.blocked_statuses),
            },
        )

    def start_requests(self):
        """Yield the initial listing-page requests, one per start URL."""
        for url in self.start_urls:
            yield self._listing_request(url)

    def parse(self, response):
        """Parse one listing page: queue vacancy pages, then the next page.

        Yields a request per vacancy card that has a title link (carrying the
        card data in ``meta['job_data']``) and, while the ``max_jobs`` cap has
        not been reached, a request for the next listing page.
        """
        if response.status in self.blocked_statuses:
            self.logger.error(f"Received {response.status} on {response.url}. Check bot detection.")
            return

        # Extract job cards
        job_cards = response.css('div.vacancy-card')
        if not job_cards:
            self.logger.warning(f"No job cards found on {response.url}. Check HTML structure.")
            self.logger.debug(f"Response body: {response.text[:1000]}")

        for job in job_cards:
            # Stop the whole callback (including pagination) once the cap is hit.
            if self.job_count >= self.max_jobs:
                return

            job_link = job.css('a.vacancy-card__title-link::attr(href)').get()
            if job_link:
                self.job_count += 1
                full_link = urljoin(self.base_url, job_link)
                job_data = self.extract_job_data(job)
                yield Request(full_link, callback=self.parse_job, meta={'job_data': job_data})

        # Follow pagination
        next_page = response.css('a.next_page::attr(href)').get()
        if next_page and self.job_count < self.max_jobs:
            yield self._listing_request(urljoin(self.base_url, next_page))

    def extract_job_data(self, job):
        """Extract data from job card, including skills.

        Args:
            job: selector for a single ``div.vacancy-card`` element.

        Returns:
            dict with ``company``, ``company_link``, ``title``, ``work_format``
            (each ``'N/A'`` when the element is missing) and ``skills`` (list
            of non-empty, stripped tag texts).
        """
        company = job.css('div.vacancy-card__company-title a::text').get(default='N/A').strip()
        company_link = job.css('div.vacancy-card__company-title a::attr(href)').get(default='N/A')
        # Only resolve real links; the 'N/A' placeholder must stay as-is.
        company_link = urljoin(self.base_url, company_link) if company_link != 'N/A' else 'N/A'
        title = job.css('a.vacancy-card__title-link::text').get(default='N/A').strip()
        work_format = job.css('div.vacancy-card__meta span::text').get(default='N/A').strip()
        skills = job.css('div.vacancy-card__skills a.link-comp::text').getall()
        skills = [skill.strip() for skill in skills if skill.strip()]

        return {
            'company': company,
            'company_link': company_link,
            'title': title,
            'work_format': work_format,
            'skills': skills
        }

    def parse_job(self, response):
        """Emit the final item for one vacancy page.

        All fields except ``url`` come from the listing card data passed in
        ``response.meta['job_data']``; ``id`` is a freshly generated random
        UUID, so it differs between runs.
        """
        job_data = response.meta['job_data']

        yield {
            'id': str(uuid.uuid4()),
            'title': job_data['title'],
            'company': job_data['company'],
            'url': response.url,
            'company_link': job_data['company_link'],
            'work_format': job_data['work_format'],
            'skills': job_data['skills']
        }

### 4. Running the Spider 
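Run the following command from the Scrapy project folder *jobs* (the one containing *scrapy.cfg*) in your terminal to execute the spider. Note that `-o` appends to an existing file, which produces invalid JSON on a second run; delete the old *jobs.json* first or use `-O` to overwrite it: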

    scrapy crawl parse_jobs -o jobs.json

## Part 2: Data Cleaning

### 1. Import libraries

In [2]:
import pandas as pd
import json
import re

### 2. Clean data
1. Extract job specialization and level
2. Divide all vacancies into *Software Development*, *Analytics*, *Information Security*, *Artificial Intelligence*

In [4]:
# Category membership lists: each list holds the specialization names (Russian
# and English spellings) that belong to one high-level vacancy category.
software_dev = [
    'Бэкенд разработчик', 'Backend Developer', 'Фронтенд разработчик', 'Frontend Developer',
    'Фулстек разработчик', 'Fullstack Developer', 'Веб-разработчик', 'Web Developer',
    'Разработчик приложений', 'Application Developer', 'Разработчик мобильных приложений', 'Mobile Application Developer',
    'Релиз менеджер', 'Release Manager', 'Разработчик игр', 'Game Developer',
    'Десктоп разработчик', 'Software Developer', 'Разработчик баз данных', 'Database Developer',
    'Инженер встраиваемых систем', 'Embedded Software Engineer', 'HTML-верстальщик', 'HTML Coding',
    'Программист 1С', '1C Developer', 'Архитектор программного обеспечения', 'Software Architect',
    'Системный инженер', 'System Software Engineer', 'ERP-программист', 'ERP Developer',
    'Архитектор баз данных', 'Database Architect', 'Инженер электронных устройств', 'Hardware Engineer',
    'Архитектор 1С', '1C Architect'
]

analytics = [
    'Аналитик мобильных приложений', 'Mobile Analyst', 'Системный аналитик', 'Systems Analyst',
    'Бизнес-аналитик', 'Business Analyst', 'Гейм-аналитик', 'Game Analyst',
    'UX-аналитик', 'UX Analyst', 'Аналитик по данным', 'Data Analyst',
    'Инженер по данным', 'Data Engineer', 'Программный аналитик', 'Software Analyst',
    'Продуктовый аналитик', 'Product Analyst', 'BI-разработчик', 'BI Developer',
    # '1C Analyst' is listed in both the Cyrillic-'С' and Latin-'C' spellings
    # so that either form of the English name matches.
    'Веб-аналитик', 'Web Analyst', 'Аналитик 1С', '1С Analyst', '1C Analyst'
]

info_security = [
    'Пентестер', 'Pentester', 'Администратор защиты', 'Security Administrator',
    'Аналитик SOC', 'SOC Analyst', 'Специалист по информационной безопасности', 'Information Security Specialist',
    'Специалист по реверс-инжинирингу', 'Reverse Engineer', 'AppSec-инженер', 'AppSec-Engineer',
    'Инженер по безопасности', 'Security Engineer',
    'Антифрод аналитик', 'Antifraud Analyst', 'Архитектор информационной безопасности', 'Information Security Architect'
]

# NLP engineering is a machine-learning specialization, so it belongs with the
# AI roles rather than with information security.
ai = [
    'Ученый по данным', 'Data Scientist', 'ML разработчик', 'ML Engineer',
    'Инженер по компьютерному зрению', 'Computer Vision Engineer',
    'NLP-инженер', 'NLP-Engineer'
]

In [8]:
def categorize_job(specialization):
    """Map a specialization name to its high-level vacancy category.

    Args:
        specialization: specialization name; surrounding whitespace is ignored.
            Anything that is not a string (``None``, a pandas ``NaN`` float,
            ...) is treated as missing.

    Returns:
        One of ``'Software Development'``, ``'Analytics'``,
        ``'Information Security'``, ``'Artificial Intelligence'``, or
        ``'Other'`` when the name is missing or not in any category list.
    """
    # NaN is a truthy float, so a plain truthiness test would let it reach
    # .strip() and raise AttributeError; check the type instead.
    if not isinstance(specialization, str):
        return 'Other'

    spec = specialization.strip()
    # Checked in order; the first list containing the name wins.
    categories = (
        ('Software Development', software_dev),
        ('Analytics', analytics),
        ('Information Security', info_security),
        ('Artificial Intelligence', ai),
    )
    for label, members in categories:
        if spec in members:
            return label
    return 'Other'

In [12]:
file_path = "jobs.json"
df = pd.read_json(file_path)

# Seniority tags that may appear as the second entry of a vacancy's tag list.
LEVELS = (
    "Средний (Middle)", "Старший (Senior)", "Ведущий (Lead)",
    "Младший (Junior)", 'Стажёр (Intern)',
)


def _has_level(tags):
    """Return True when the second tag exists and is a seniority level.

    The length is checked before indexing so short tag lists cannot raise
    IndexError.
    """
    return len(tags) > 1 and tags[1] in LEVELS


# Work from a snapshot of the raw tag lists (missing values become []), so the
# three derived columns are all computed from the same, unmodified input.
raw_tags = df['skills'].apply(lambda tags: tags if isinstance(tags, list) else [])

# Extract job specialization and level
df['specialization'] = raw_tags.apply(lambda tags: tags[0] if tags else 'Other')
df['level'] = raw_tags.apply(lambda tags: tags[1] if _has_level(tags) else 'Not specified')
# Drop the specialization and, when present, the level; what remains are the
# actual skills. A list of exactly [specialization, level] yields [].
df['skills'] = raw_tags.apply(lambda tags: tags[2:] if _has_level(tags) else tags[1:])

# Categorize vacancies
df['category'] = df['specialization'].apply(categorize_job)

# Save the cleaned dataset as a new JSON file
cleaned_file_path = "cleaned_jobs.json"
df.to_json(cleaned_file_path, orient="records", indent=4, force_ascii=False)

Now that data cleaning is done, we can proceed to the database.