In [1]:
import os
import re

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.window import Window

In [2]:
def save_grouped_data_to_csv(df, group_column, columns_not_to_keep, position_level):
    """Write one CSV per group of ``group_column`` under ``../created_csv/result/<position_level>/``.

    Args:
        df: Object exposing ``toPandas()`` (a Spark DataFrame in this notebook).
        group_column: Column whose distinct values define the groups.
        columns_not_to_keep: Columns dropped from each group before it is written.
        position_level: Sub-directory name of the output folder (e.g. ``'junior'``).

    Each file is named ``<sanitized group name>_<first remaining column, lower-cased>.csv``.
    The output directory is created when it does not exist yet.
    """
    output_dir = f'../created_csv/result/{position_level}'
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    grouped_df = df.toPandas().groupby(group_column)

    for name, group in grouped_df:
        print("name : ", name)
        filtered_df = group.drop(columns=columns_not_to_keep)
        first_column_name = filtered_df.columns[0].lower()

        # Build a file-system friendly slug from the group label:
        # spaces and slashes become hyphens, commas are removed.
        sanitized_name = name.replace(" ", "-").replace("/", "-").replace(",", "")
        sanitized_name = re.sub(r'-+', '-', sanitized_name)  # Collapse runs of hyphens
        sanitized_name = sanitized_name.strip('-')  # Remove leading and trailing hyphens

        file_path = f'{output_dir}/{sanitized_name}_{first_column_name}.csv'

        filtered_df.to_csv(file_path, index=False)
        
def update_counts(filtered_df, merge_df, filter_col, merge_col, devtype_col, count_col='Count'):
    """Return ``filtered_df`` with ``count_col`` increased by the matching counts of ``merge_df``.

    A row of ``filtered_df`` matches a row of ``merge_df`` when its ``filter_col``
    equals the merge row's ``merge_col`` AND its ``DevType`` equals ``devtype_col``.
    Matched rows get ``count_col + merge_df[count_col]``; unmatched rows keep
    their original ``count_col``.

    Args:
        filtered_df: Spark DataFrame containing ``filter_col``, ``DevType`` and ``count_col``.
        merge_df: Spark DataFrame containing ``merge_col`` and ``count_col``.
        filter_col: Join key column name in ``filtered_df``.
        merge_col: Join key column name in ``merge_df``.
        devtype_col: The DevType *value* (not a column name) the update applies to.
        count_col: Name of the count column present in both frames.

    Returns:
        A new DataFrame with exactly the columns of ``filtered_df`` (same order).
        The inputs are not modified, so the caller must use the return value.
    """
    merge_key = '__merge_key'
    merge_count = '__merge_count'

    # Project the merge side down to the two columns that are needed and give
    # them names that cannot collide with columns of filtered_df. Joining the
    # raw frames leaves two columns called `count_col`, which makes the
    # following withColumn/select ambiguous.
    merge_side = merge_df.select(
        F.col(merge_col).alias(merge_key),
        F.col(count_col).alias(merge_count),
    )

    joined = filtered_df.join(
        merge_side,
        (F.col(filter_col) == F.col(merge_key)) & (F.col('DevType') == devtype_col),
        'left',
    )

    # A null merge key means the left join found no partner for this row.
    new_count = F.when(F.col(merge_key).isNull(), F.col(count_col)).otherwise(
        F.col(count_col) + F.col(merge_count)
    )

    # Replace the count in place, then drop the helper columns by selecting
    # the original column list explicitly.
    updated_df = joined.withColumn(count_col, new_count).select(*filtered_df.columns)

    return updated_df

def transform_and_save_data(df, position_level, file_name):
    """Rank the rows of ``df`` and save them as a CSV named after the developer type.

    Args:
        df: Spark DataFrame; columns whose name starts with ``'Unnamed'`` are dropped.
        position_level: Sub-directory of ``../created_csv/result/`` to write into.
        file_name: Source file name; the text between its first two underscores
            is taken as the developer name.

    Raises:
        ValueError: If ``file_name`` does not contain a ``_<name>_`` segment.

    The output file is ``<sanitized developer name>_<first column, lower-cased>.csv``.
    """
    # Parse the file name first so a malformed name fails before any Spark work.
    pattern = r'_(.*?)_'
    match = re.search(pattern, file_name)
    if match is None:
        raise ValueError(
            f"Cannot extract a developer name from file name {file_name!r}: "
            "expected a '_<name>_' segment"
        )
    dev_name = match.group(1)

    # Sanitize developer name for file naming (same rules as save_grouped_data_to_csv).
    sanitized_name = dev_name.replace(" ", "-").replace("/", "-").replace(",", "")
    sanitized_name = re.sub(r'-+', '-', sanitized_name).strip('-')

    # Drop columns with unnamed header
    df = df.select([name for name in df.columns if not name.startswith('Unnamed')])

    # Dense rank by '_c1' in descending order.
    # NOTE(review): '_c1' is assumed to be the count column; confirm against the
    # input schema. If it was read as a string, the ordering is lexicographic.
    transformed_df = df.withColumn('Rank', F.dense_rank().over(Window.orderBy(F.desc('_c1'))))

    # Get the first column name (assuming it's the primary identifier)
    first_column_name = df.columns[0].lower()

    # Use the sanitized name so characters such as '/' or ' ' cannot end up in the path.
    file_path = f'../created_csv/result/{position_level}/{sanitized_name}_{first_column_name}.csv'

    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Save the transformed DataFrame to CSV
    transformed_df.toPandas().to_csv(file_path, index=False)

### 사용할 Job List

In [3]:
# DevType values the analysis below is restricted to.
filtered_job_list = [
    "BlockChain",
    "Developer, full-stack",
    "Developer, back-end",
    "Developer, front-end",
    "Developer, game or graphics",
    "Developer, mobile",
    "UX / UI Designer",
    "Engineer, data",
    "Database administrator",
    "Data scientist or machine learning specialist",
    "Data or business analyst",
    "Product manager",
    "Developer Advocate",
    "Developer, QA or test",
    "DevOps specialist",
]

### Language

In [4]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("stackoverflow_processing")
    .getOrCreate()
)

# DevType/Language tables, one CSV per seniority level (first row is the header).
junior_path = "../stak_overflow_processing/junior_devtype_language.csv"
junior_devtype_language_df = spark.read.csv(junior_path, header=True)

middle_path = "../stak_overflow_processing/middle_devtype_language.csv"
middle_devtype_language_df = spark.read.csv(middle_path, header=True)

senior_path = "../stak_overflow_processing/senior_devtype_language.csv"
senior_devtype_language_df = spark.read.csv(senior_path, header=True)

# Keep only the rows whose DevType is one of the job titles in filtered_job_list.
junior_filtered_devtype_language_df = junior_devtype_language_df.where(F.col('DevType').isin(filtered_job_list))
middle_filtered_devtype_language_df = middle_devtype_language_df.where(F.col('DevType').isin(filtered_job_list))
senior_filtered_devtype_language_df = senior_devtype_language_df.where(F.col('DevType').isin(filtered_job_list))

# Distinct Language values per seniority level, collected to the driver as plain lists.
junior_unique_languages = [
    row[0] for row in junior_filtered_devtype_language_df.select('Language').distinct().collect()
]
middle_unique_languages = [
    row[0] for row in middle_filtered_devtype_language_df.select('Language').distinct().collect()
]
senior_unique_languages = [
    row[0] for row in senior_filtered_devtype_language_df.select('Language').distinct().collect()
]

# Per-role primary-language percentage tables. Each role's path is defined
# right next to the read that consumes it; every CSV has a header row.

# --- Junior ---
junior_data_business_analyst_path = "../created_csv/junior/junior_Data-or-business-analyst_primary_proglang_percentages_df.csv"
junior_data_business_analyst_df = spark.read.csv(junior_data_business_analyst_path, header=True)

junior_database_administrator_path = "../created_csv/junior/junior_Database-administrator_primary_proglang_percentages_df.csv"
junior_database_administrator_df = spark.read.csv(junior_database_administrator_path, header=True)

junior_developer_advocate_path = "../created_csv/junior/junior_Developer-Advocate_primary_proglang_percentages_df.csv"
junior_developer_advocate_df = spark.read.csv(junior_developer_advocate_path, header=True)

junior_Developer_back_end_path = "../created_csv/junior/junior_Developer-back-end_primary_proglang_percentages_df.csv"
junior_Developer_back_end_df = spark.read.csv(junior_Developer_back_end_path, header=True)

junior_Developer_front_end_path = "../created_csv/junior/junior_Developer-front-end_primary_proglang_percentages_df.csv"
junior_Developer_front_end_df = spark.read.csv(junior_Developer_front_end_path, header=True)

junior_Developer_full_stack_path = "../created_csv/junior/junior_Developer-full-stack_primary_proglang_percentages_df.csv"
junior_Developer_full_stack_df = spark.read.csv(junior_Developer_full_stack_path, header=True)

junior_Developer_mobile_path = "../created_csv/junior/junior_Developer-mobile_primary_proglang_percentages_df.csv"
junior_Developer_mobile_df = spark.read.csv(junior_Developer_mobile_path, header=True)

junior_DevOps_specialist_path = "../created_csv/junior/junior_DevOps-specialist_primary_proglang_percentages_df.csv"
junior_DevOps_specialist_df = spark.read.csv(junior_DevOps_specialist_path, header=True)

junior_UX_UI_Designer_path = "../created_csv/junior/junior_UX-UI-Designer_primary_proglang_percentages_df.csv"
junior_UX_UI_Designer_df = spark.read.csv(junior_UX_UI_Designer_path, header=True)

# --- Middle ---
middle_data_business_analyst_path = "../created_csv/middle/middle_Data-or-business-analyst_primary_proglang_percentages_df.csv"
middle_data_business_analyst_df = spark.read.csv(middle_data_business_analyst_path, header=True)

middle_database_administrator_path = "../created_csv/middle/middle_Database-administrator_primary_proglang_percentages_df.csv"
middle_database_administrator_df = spark.read.csv(middle_database_administrator_path, header=True)

middle_developer_advocate_path = "../created_csv/middle/middle_Developer-Advocate_primary_proglang_percentages_df.csv"
middle_developer_advocate_df = spark.read.csv(middle_developer_advocate_path, header=True)

middle_Developer_back_end_path = "../created_csv/middle/middle_Developer-back-end_primary_proglang_percentages_df.csv"
middle_Developer_back_end_df = spark.read.csv(middle_Developer_back_end_path, header=True)

middle_Developer_front_end_path = "../created_csv/middle/middle_Developer-front-end_primary_proglang_percentages_df.csv"
middle_Developer_front_end_df = spark.read.csv(middle_Developer_front_end_path, header=True)

middle_Developer_full_stack_path = "../created_csv/middle/middle_Developer-full-stack_primary_proglang_percentages_df.csv"
middle_Developer_full_stack_df = spark.read.csv(middle_Developer_full_stack_path, header=True)

middle_Developer_mobile_path = "../created_csv/middle/middle_Developer-mobile_primary_proglang_percentages_df.csv"
middle_Developer_mobile_df = spark.read.csv(middle_Developer_mobile_path, header=True)

middle_DevOps_specialist_path = "../created_csv/middle/middle_DevOps-specialist_primary_proglang_percentages_df.csv"
middle_DevOps_specialist_df = spark.read.csv(middle_DevOps_specialist_path, header=True)

middle_UX_UI_Designer_path = "../created_csv/middle/middle_UX-UI-Designer_primary_proglang_percentages_df.csv"
middle_UX_UI_Designer_df = spark.read.csv(middle_UX_UI_Designer_path, header=True)

# --- Senior ---
senior_data_business_analyst_path = "../created_csv/senior/senior_Data-or-business-analyst_primary_proglang_percentages_df.csv"
senior_data_business_analyst_df = spark.read.csv(senior_data_business_analyst_path, header=True)

senior_database_administrator_path = "../created_csv/senior/senior_Database-administrator_primary_proglang_percentages_df.csv"
senior_database_administrator_df = spark.read.csv(senior_database_administrator_path, header=True)

senior_developer_advocate_path = "../created_csv/senior/senior_Developer-Advocate_primary_proglang_percentages_df.csv"
senior_developer_advocate_df = spark.read.csv(senior_developer_advocate_path, header=True)

senior_Developer_back_end_path = "../created_csv/senior/senior_Developer-back-end_primary_proglang_percentages_df.csv"
senior_Developer_back_end_df = spark.read.csv(senior_Developer_back_end_path, header=True)

senior_Developer_front_end_path = "../created_csv/senior/senior_Developer-front-end_primary_proglang_percentages_df.csv"
senior_Developer_front_end_df = spark.read.csv(senior_Developer_front_end_path, header=True)

senior_Developer_full_stack_path = "../created_csv/senior/senior_Developer-full-stack_primary_proglang_percentages_df.csv"
senior_Developer_full_stack_df = spark.read.csv(senior_Developer_full_stack_path, header=True)

senior_Developer_mobile_path = "../created_csv/senior/senior_Developer-mobile_primary_proglang_percentages_df.csv"
senior_Developer_mobile_df = spark.read.csv(senior_Developer_mobile_path, header=True)

senior_DevOps_specialist_path = "../created_csv/senior/senior_DevOps-specialist_primary_proglang_percentages_df.csv"
senior_DevOps_specialist_df = spark.read.csv(senior_DevOps_specialist_path, header=True)

senior_UX_UI_Designer_path = "../created_csv/senior/senior_UX-UI-Designer_primary_proglang_percentages_df.csv"
senior_UX_UI_Designer_df = spark.read.csv(senior_UX_UI_Designer_path, header=True)

# For every seniority level: the filtered Language table plus the
# (percentages table, DevType value) pairs to merge into it, in call order.
_language_merge_plan = [
    (junior_filtered_devtype_language_df, [
        (junior_data_business_analyst_df, 'Data or business analyst'),
        (junior_database_administrator_df, 'Database administrator'),
        (junior_developer_advocate_df, 'Developer Advocate'),
        (junior_Developer_back_end_df, 'Developer, back-end'),
        (junior_Developer_front_end_df, 'Developer, front-end'),
        (junior_Developer_full_stack_df, 'Developer, full-stack'),
        (junior_Developer_mobile_df, 'Developer, mobile'),
        (junior_DevOps_specialist_df, 'DevOps specialist'),
        (junior_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
    (middle_filtered_devtype_language_df, [
        (middle_data_business_analyst_df, 'Data or business analyst'),
        (middle_database_administrator_df, 'Database administrator'),
        (middle_developer_advocate_df, 'Developer Advocate'),
        (middle_Developer_back_end_df, 'Developer, back-end'),
        (middle_Developer_front_end_df, 'Developer, front-end'),
        (middle_Developer_full_stack_df, 'Developer, full-stack'),
        (middle_Developer_mobile_df, 'Developer, mobile'),
        (middle_DevOps_specialist_df, 'DevOps specialist'),
        (middle_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
    (senior_filtered_devtype_language_df, [
        (senior_data_business_analyst_df, 'Data or business analyst'),
        (senior_database_administrator_df, 'Database administrator'),
        (senior_developer_advocate_df, 'Developer Advocate'),
        (senior_Developer_back_end_df, 'Developer, back-end'),
        (senior_Developer_front_end_df, 'Developer, front-end'),
        (senior_Developer_full_stack_df, 'Developer, full-stack'),
        (senior_Developer_mobile_df, 'Developer, mobile'),
        (senior_DevOps_specialist_df, 'DevOps specialist'),
        (senior_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
]

# NOTE(review): update_counts returns a new DataFrame and the result is not
# assigned here, so the frames saved below are the un-updated filtered tables.
# Confirm whether the merged counts were meant to be written out.
for _level_df, _merge_pairs in _language_merge_plan:
    for _merge_df, _devtype in _merge_pairs:
        update_counts(_level_df, _merge_df, 'Language', 'primary_proglang', _devtype, 'Count')

# One CSV per DevType and seniority level.
for _level_df, _level_name in (
    (junior_filtered_devtype_language_df, 'junior'),
    (middle_filtered_devtype_language_df, 'middle'),
    (senior_filtered_devtype_language_df, 'senior'),
):
    save_grouped_data_to_csv(_level_df, 'DevType', ['DevType', 'YearsGroup'], _level_name)

24/06/16 13:02:42 WARN Utils: Your hostname, airlab1tb-MS-7C94 resolves to a loopback address: 127.0.1.1; using 163.180.143.49 instead (on interface enp42s0)
24/06/16 13:02:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 13:02:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/16 13:02:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/06/16 13:02:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/06/16 13:02:42 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/06/16 13:02:42 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/06/16 13:02:42 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempti

name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name 

### Database

In [5]:
# DevType/Database tables, one CSV per seniority level (first row is the header).
junior_path = "../stak_overflow_processing/junior_devtype_database.csv"
junior_devtype_database_df = spark.read.csv(junior_path, header=True)

middle_path = "../stak_overflow_processing/middle_devtype_database.csv"
middle_devtype_database_df = spark.read.csv(middle_path, header=True)

senior_path = "../stak_overflow_processing/senior_devtype_database.csv"
senior_devtype_database_df = spark.read.csv(senior_path, header=True)

# Keep only the rows whose DevType is one of the job titles in filtered_job_list.
junior_filtered_devtype_database_df = junior_devtype_database_df.where(F.col('DevType').isin(filtered_job_list))
middle_filtered_devtype_database_df = middle_devtype_database_df.where(F.col('DevType').isin(filtered_job_list))
senior_filtered_devtype_database_df = senior_devtype_database_df.where(F.col('DevType').isin(filtered_job_list))


# Per-role database percentage tables (every CSV has a header row).
# These rebind the *_df names that the Language section used for its own tables.

# --- Junior ---
junior_data_business_analyst_df = spark.read.csv('../created_csv/junior/junior_Data-or-business-analyst_db_percentages_df.csv', header=True)
junior_database_administrator_df = spark.read.csv('../created_csv/junior/junior_Database-administrator_db_percentages_df.csv', header=True)
junior_developer_advocate_df = spark.read.csv('../created_csv/junior/junior_Developer-Advocate_db_percentages_df.csv', header=True)
junior_Developer_back_end_df = spark.read.csv('../created_csv/junior/junior_Developer-back-end_db_percentages_df.csv', header=True)
junior_Developer_front_end_df = spark.read.csv('../created_csv/junior/junior_Developer-front-end_db_percentages_df.csv', header=True)
junior_Developer_full_stack_df = spark.read.csv('../created_csv/junior/junior_Developer-full-stack_db_percentages_df.csv', header=True)
junior_Developer_mobile_df = spark.read.csv('../created_csv/junior/junior_Developer-mobile_db_percentages_df.csv', header=True)
junior_DevOps_specialist_df = spark.read.csv('../created_csv/junior/junior_DevOps-specialist_db_percentages_df.csv', header=True)
junior_UX_UI_Designer_df = spark.read.csv('../created_csv/junior/junior_UX-UI-Designer_db_percentages_df.csv', header=True)

# --- Middle ---
middle_data_business_analyst_df = spark.read.csv('../created_csv/middle/middle_Data-or-business-analyst_db_percentages_df.csv', header=True)
middle_database_administrator_df = spark.read.csv('../created_csv/middle/middle_Database-administrator_db_percentages_df.csv', header=True)
middle_developer_advocate_df = spark.read.csv('../created_csv/middle/middle_Developer-Advocate_db_percentages_df.csv', header=True)
middle_Developer_back_end_df = spark.read.csv('../created_csv/middle/middle_Developer-back-end_db_percentages_df.csv', header=True)
middle_Developer_front_end_df = spark.read.csv('../created_csv/middle/middle_Developer-front-end_db_percentages_df.csv', header=True)
middle_Developer_full_stack_df = spark.read.csv('../created_csv/middle/middle_Developer-full-stack_db_percentages_df.csv', header=True)
middle_Developer_mobile_df = spark.read.csv('../created_csv/middle/middle_Developer-mobile_db_percentages_df.csv', header=True)
# This table was previously bound only to `middle_DevOps_specialist_spark`, so
# the update_counts call for the middle DevOps role kept using the stale
# primary-language table from the Language section. Bind it to the `_df` name
# like every other role.
middle_DevOps_specialist_df = spark.read.csv('../created_csv/middle/middle_DevOps-specialist_db_percentages_df.csv', header=True)
middle_DevOps_specialist_spark = middle_DevOps_specialist_df  # old name kept as an alias
middle_UX_UI_Designer_df = spark.read.csv('../created_csv/middle/middle_UX-UI-Designer_db_percentages_df.csv', header=True)

# --- Senior ---
senior_data_business_analyst_df = spark.read.csv('../created_csv/senior/senior_Data-or-business-analyst_db_percentages_df.csv', header=True)
senior_database_administrator_df = spark.read.csv('../created_csv/senior/senior_Database-administrator_db_percentages_df.csv', header=True)
senior_developer_advocate_df = spark.read.csv('../created_csv/senior/senior_Developer-Advocate_db_percentages_df.csv', header=True)
senior_Developer_back_end_df = spark.read.csv('../created_csv/senior/senior_Developer-back-end_db_percentages_df.csv', header=True)
senior_Developer_front_end_df = spark.read.csv('../created_csv/senior/senior_Developer-front-end_db_percentages_df.csv', header=True)
senior_Developer_full_stack_df = spark.read.csv('../created_csv/senior/senior_Developer-full-stack_db_percentages_df.csv', header=True)
senior_Developer_mobile_df = spark.read.csv('../created_csv/senior/senior_Developer-mobile_db_percentages_df.csv', header=True)
senior_DevOps_specialist_df = spark.read.csv('../created_csv/senior/senior_DevOps-specialist_db_percentages_df.csv', header=True)
senior_UX_UI_Designer_df = spark.read.csv('../created_csv/senior/senior_UX-UI-Designer_db_percentages_df.csv', header=True)

# For every seniority level: the filtered Database table plus the
# (percentages table, DevType value) pairs to merge into it, in call order.
_database_merge_plan = [
    (junior_filtered_devtype_database_df, [
        (junior_data_business_analyst_df, 'Data or business analyst'),
        (junior_database_administrator_df, 'Database administrator'),
        (junior_developer_advocate_df, 'Developer Advocate'),
        (junior_Developer_back_end_df, 'Developer, back-end'),
        (junior_Developer_front_end_df, 'Developer, front-end'),
        (junior_Developer_full_stack_df, 'Developer, full-stack'),
        (junior_Developer_mobile_df, 'Developer, mobile'),
        (junior_DevOps_specialist_df, 'DevOps specialist'),
        (junior_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
    (middle_filtered_devtype_database_df, [
        (middle_data_business_analyst_df, 'Data or business analyst'),
        (middle_database_administrator_df, 'Database administrator'),
        (middle_developer_advocate_df, 'Developer Advocate'),
        (middle_Developer_back_end_df, 'Developer, back-end'),
        (middle_Developer_front_end_df, 'Developer, front-end'),
        (middle_Developer_full_stack_df, 'Developer, full-stack'),
        (middle_Developer_mobile_df, 'Developer, mobile'),
        (middle_DevOps_specialist_df, 'DevOps specialist'),
        (middle_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
    (senior_filtered_devtype_database_df, [
        (senior_data_business_analyst_df, 'Data or business analyst'),
        (senior_database_administrator_df, 'Database administrator'),
        (senior_developer_advocate_df, 'Developer Advocate'),
        (senior_Developer_back_end_df, 'Developer, back-end'),
        (senior_Developer_front_end_df, 'Developer, front-end'),
        (senior_Developer_full_stack_df, 'Developer, full-stack'),
        (senior_Developer_mobile_df, 'Developer, mobile'),
        (senior_DevOps_specialist_df, 'DevOps specialist'),
        (senior_UX_UI_Designer_df, 'UX / UI Designer'),
    ]),
]

# NOTE(review): update_counts returns a new DataFrame and the result is not
# assigned here, so the frames saved below are the un-updated filtered tables.
# Confirm whether the merged counts were meant to be written out.
for _level_df, _merge_pairs in _database_merge_plan:
    for _merge_df, _devtype in _merge_pairs:
        update_counts(_level_df, _merge_df, 'Database', 'db', _devtype, 'Count')

# One CSV per DevType and seniority level.
for _level_df, _level_name in (
    (junior_filtered_devtype_database_df, 'junior'),
    (middle_filtered_devtype_database_df, 'middle'),
    (senior_filtered_devtype_database_df, 'senior'),
):
    save_grouped_data_to_csv(_level_df, 'DevType', ['DevType', 'YearsGroup'], _level_name)

name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name 

### Framework

In [6]:
# DevType/framework tables, one CSV per seniority level (first row is the header).
junior_devtype_framework_df = spark.read.csv(
    "../stak_overflow_processing/junior_devtype_framework.csv", header=True
)
middle_devtype_framework_df = spark.read.csv(
    "../stak_overflow_processing/middle_devtype_framework.csv", header=True
)
senior_devtype_framework_df = spark.read.csv(
    "../stak_overflow_processing/senior_devtype_framework.csv", header=True
)

# Keep only the rows whose DevType is one of the job titles in filtered_job_list.
junior_filtered_devtype_framework_df = junior_devtype_framework_df.where(F.col('DevType').isin(filtered_job_list))
middle_filtered_devtype_framework_df = middle_devtype_framework_df.where(F.col('DevType').isin(filtered_job_list))
senior_filtered_devtype_framework_df = senior_devtype_framework_df.where(F.col('DevType').isin(filtered_job_list))

# Per-role `ide_main` percentage tables (every CSV has a header row).
# These rebind the *_df names used for the per-role tables in earlier sections.

# --- Junior ---
junior_data_business_analyst_df = spark.read.csv('../created_csv/junior/junior_Data-or-business-analyst_ide_main_percentages_df.csv', header=True)
junior_database_administrator_df = spark.read.csv('../created_csv/junior/junior_Database-administrator_ide_main_percentages_df.csv', header=True)
junior_developer_advocate_df = spark.read.csv('../created_csv/junior/junior_Developer-Advocate_ide_main_percentages_df.csv', header=True)
junior_Developer_back_end_df = spark.read.csv('../created_csv/junior/junior_Developer-back-end_ide_main_percentages_df.csv', header=True)
junior_Developer_front_end_df = spark.read.csv('../created_csv/junior/junior_Developer-front-end_ide_main_percentages_df.csv', header=True)
junior_Developer_full_stack_df = spark.read.csv('../created_csv/junior/junior_Developer-full-stack_ide_main_percentages_df.csv', header=True)
junior_Developer_mobile_df = spark.read.csv('../created_csv/junior/junior_Developer-mobile_ide_main_percentages_df.csv', header=True)
junior_DevOps_specialist_df = spark.read.csv('../created_csv/junior/junior_DevOps-specialist_ide_main_percentages_df.csv', header=True)
junior_UX_UI_Designer_df = spark.read.csv('../created_csv/junior/junior_UX-UI-Designer_ide_main_percentages_df.csv', header=True)

# --- Middle ---
middle_data_business_analyst_df = spark.read.csv('../created_csv/middle/middle_Data-or-business-analyst_ide_main_percentages_df.csv', header=True)
middle_database_administrator_df = spark.read.csv('../created_csv/middle/middle_Database-administrator_ide_main_percentages_df.csv', header=True)
middle_developer_advocate_df = spark.read.csv('../created_csv/middle/middle_Developer-Advocate_ide_main_percentages_df.csv', header=True)
middle_Developer_back_end_df = spark.read.csv('../created_csv/middle/middle_Developer-back-end_ide_main_percentages_df.csv', header=True)
middle_Developer_front_end_df = spark.read.csv('../created_csv/middle/middle_Developer-front-end_ide_main_percentages_df.csv', header=True)
middle_Developer_full_stack_df = spark.read.csv('../created_csv/middle/middle_Developer-full-stack_ide_main_percentages_df.csv', header=True)
middle_Developer_mobile_df = spark.read.csv('../created_csv/middle/middle_Developer-mobile_ide_main_percentages_df.csv', header=True)
middle_DevOps_specialist_df = spark.read.csv('../created_csv/middle/middle_DevOps-specialist_ide_main_percentages_df.csv', header=True)
middle_UX_UI_Designer_df = spark.read.csv('../created_csv/middle/middle_UX-UI-Designer_ide_main_percentages_df.csv', header=True)

# --- Senior ---
senior_data_business_analyst_df = spark.read.csv('../created_csv/senior/senior_Data-or-business-analyst_ide_main_percentages_df.csv', header=True)
senior_database_administrator_df = spark.read.csv('../created_csv/senior/senior_Database-administrator_ide_main_percentages_df.csv', header=True)
senior_developer_advocate_df = spark.read.csv('../created_csv/senior/senior_Developer-Advocate_ide_main_percentages_df.csv', header=True)
senior_Developer_back_end_df = spark.read.csv('../created_csv/senior/senior_Developer-back-end_ide_main_percentages_df.csv', header=True)
senior_Developer_front_end_df = spark.read.csv('../created_csv/senior/senior_Developer-front-end_ide_main_percentages_df.csv', header=True)
senior_Developer_full_stack_df = spark.read.csv('../created_csv/senior/senior_Developer-full-stack_ide_main_percentages_df.csv', header=True)
senior_Developer_mobile_df = spark.read.csv('../created_csv/senior/senior_Developer-mobile_ide_main_percentages_df.csv', header=True)
senior_DevOps_specialist_df = spark.read.csv('../created_csv/senior/senior_DevOps-specialist_ide_main_percentages_df.csv', header=True)
senior_UX_UI_Designer_df = spark.read.csv('../created_csv/senior/senior_UX-UI-Designer_ide_main_percentages_df.csv', header=True)

# Merge each role's ide_main counts into the per-level framework frame, then
# write one CSV per DevType.
#
# The previous version of this step called update_counts(...) 27 times and
# dropped every return value. Spark DataFrames are immutable, so the frames
# passed to save_grouped_data_to_csv never contained the merged counts.
#
# The merge is applied inline and chained so the nine roles accumulate on one
# frame per level. It uses the same join condition and Count rule as
# update_counts: left join on Webframe == ide_main restricted to one DevType,
# Count := Count + ide Count when a match exists.
# NOTE(review): update_counts ends with select('f.*') right after
# withColumn(count_col, ...); that looks like it may not carry the recomputed
# column through. Confirm before routing this back through the helper.

# DevType label handled by each ide_main frame, in the order the frames are listed below.
framework_ide_devtypes = [
    'Data or business analyst',
    'Database administrator',
    'Developer Advocate',
    'Developer, back-end',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, mobile',
    'DevOps specialist',
    'UX / UI Designer',
]

# (position level, framework frame to update, ide_main frames aligned with framework_ide_devtypes)
framework_merge_plan = [
    ('junior', junior_filtered_devtype_framework_df, [
        junior_data_business_analyst_df,
        junior_database_administrator_df,
        junior_developer_advocate_df,
        junior_Developer_back_end_df,
        junior_Developer_front_end_df,
        junior_Developer_full_stack_df,
        junior_Developer_mobile_df,
        junior_DevOps_specialist_df,
        junior_UX_UI_Designer_df,
    ]),
    ('middle', middle_filtered_devtype_framework_df, [
        middle_data_business_analyst_df,
        middle_database_administrator_df,
        middle_developer_advocate_df,
        middle_Developer_back_end_df,
        middle_Developer_front_end_df,
        middle_Developer_full_stack_df,
        middle_Developer_mobile_df,
        middle_DevOps_specialist_df,
        middle_UX_UI_Designer_df,
    ]),
    ('senior', senior_filtered_devtype_framework_df, [
        senior_data_business_analyst_df,
        senior_database_administrator_df,
        senior_developer_advocate_df,
        senior_Developer_back_end_df,
        senior_Developer_front_end_df,
        senior_Developer_full_stack_df,
        senior_Developer_mobile_df,
        senior_DevOps_specialist_df,
        senior_UX_UI_Designer_df,
    ]),
]

for level, framework_df, ide_frames in framework_merge_plan:
    merged_df = framework_df
    for devtype, ide_df in zip(framework_ide_devtypes, ide_frames):
        # Rename the two ide columns so nothing collides with the framework
        # frame's own 'Count' and the helper columns can be dropped by name.
        ide_counts = ide_df.select(
            F.col('ide_main').alias('_ide_name'),
            F.col('Count').alias('_ide_count'),
        )
        is_match = (F.col('Webframe') == F.col('_ide_name')) & (F.col('DevType') == devtype)
        merged_count = (
            F.when(F.col('_ide_name').isNull(), F.col('Count'))
            .otherwise(F.col('Count') + F.col('_ide_count'))
        )
        merged_df = (
            merged_df
            .join(ide_counts, is_match, 'left')
            .withColumn('Count', merged_count)  # replaces Count in place, keeping column order
            .drop('_ide_name', '_ide_count')
        )
    save_grouped_data_to_csv(merged_df, 'DevType', ['DevType', 'YearsGroup'], level)

name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name 

### Idle

In [7]:
# Get the Spark session for this notebook (getOrCreate reuses one if it is already running).
session_builder = SparkSession.builder.appName("stackoverflow_processing")
spark = session_builder.getOrCreate()


# Per-level tool usage tables, read with the first CSV row as the header.
junior_devtype_tool_df = spark.read.csv(
    "../stak_overflow_processing/junior_devtype_tools.csv",
    header=True,
)
middle_devtype_tool_df = spark.read.csv(
    "../stak_overflow_processing/middle_devtype_tools.csv",
    header=True,
)
senior_devtype_tool_df = spark.read.csv(
    "../stak_overflow_processing/senior_devtype_tools.csv",
    header=True,
)


# Keep only the rows whose DevType belongs to the predefined job list.
is_selected_job = F.col('DevType').isin(filtered_job_list)
junior_filtered_devtype_tool_df = junior_devtype_tool_df.where(is_selected_job)
middle_filtered_devtype_tool_df = middle_devtype_tool_df.where(is_selected_job)
senior_filtered_devtype_tool_df = senior_devtype_tool_df.where(is_selected_job)


# File-name slug of each developer role, in the order the frames are unpacked below.
ide_main_role_slugs = [
    'Data-or-business-analyst',
    'Database-administrator',
    'Developer-Advocate',
    'Developer-back-end',
    'Developer-front-end',
    'Developer-full-stack',
    'Developer-mobile',
    'DevOps-specialist',
    'UX-UI-Designer',
]


def _load_ide_main_frames(level):
    """Read the ide_main percentage CSV of every role for one position level.

    Args:
        level: 'junior', 'middle' or 'senior'; used both as the directory name
            and as the file-name prefix.

    Returns:
        A list of Spark DataFrames (read with header=True), one per entry of
        ide_main_role_slugs and in the same order.
    """
    return [
        spark.read.csv(
            f'../created_csv/{level}/{level}_{slug}_ide_main_percentages_df.csv',
            header=True,
        )
        for slug in ide_main_role_slugs
    ]


# The original variable names are kept; paths are built from level + slug
# instead of being typed out 27 times.
(
    junior_data_business_analyst_df,
    junior_database_administrator_df,
    junior_developer_advocate_df,
    junior_Developer_back_end_df,
    junior_Developer_front_end_df,
    junior_Developer_full_stack_df,
    junior_Developer_mobile_df,
    junior_DevOps_specialist_df,
    junior_UX_UI_Designer_df,
) = _load_ide_main_frames('junior')

(
    middle_data_business_analyst_df,
    middle_database_administrator_df,
    middle_developer_advocate_df,
    middle_Developer_back_end_df,
    middle_Developer_front_end_df,
    middle_Developer_full_stack_df,
    middle_Developer_mobile_df,
    middle_DevOps_specialist_df,
    middle_UX_UI_Designer_df,
) = _load_ide_main_frames('middle')

(
    senior_data_business_analyst_df,
    senior_database_administrator_df,
    senior_developer_advocate_df,
    senior_Developer_back_end_df,
    senior_Developer_front_end_df,
    senior_Developer_full_stack_df,
    senior_Developer_mobile_df,
    senior_DevOps_specialist_df,
    senior_UX_UI_Designer_df,
) = _load_ide_main_frames('senior')

# Merge each role's ide_main counts into the per-level tools frame, then write
# one CSV per DevType.
#
# The previous version of this step called update_counts(...) 27 times and
# dropped every return value. Spark DataFrames are immutable, so the frames
# passed to save_grouped_data_to_csv never contained the merged counts.
#
# The merge is applied inline and chained so the nine roles accumulate on one
# frame per level. It uses the same join condition and Count rule as
# update_counts: left join on Tools == ide_main restricted to one DevType,
# Count := Count + ide Count when a match exists.
# NOTE(review): update_counts ends with select('f.*') right after
# withColumn(count_col, ...); that looks like it may not carry the recomputed
# column through. Confirm before routing this back through the helper.

# DevType label handled by each ide_main frame, in the order the frames are listed below.
tool_ide_devtypes = [
    'Data or business analyst',
    'Database administrator',
    'Developer Advocate',
    'Developer, back-end',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, mobile',
    'DevOps specialist',
    'UX / UI Designer',
]

# (position level, tools frame to update, ide_main frames aligned with tool_ide_devtypes)
tool_merge_plan = [
    ('junior', junior_filtered_devtype_tool_df, [
        junior_data_business_analyst_df,
        junior_database_administrator_df,
        junior_developer_advocate_df,
        junior_Developer_back_end_df,
        junior_Developer_front_end_df,
        junior_Developer_full_stack_df,
        junior_Developer_mobile_df,
        junior_DevOps_specialist_df,
        junior_UX_UI_Designer_df,
    ]),
    ('middle', middle_filtered_devtype_tool_df, [
        middle_data_business_analyst_df,
        middle_database_administrator_df,
        middle_developer_advocate_df,
        middle_Developer_back_end_df,
        middle_Developer_front_end_df,
        middle_Developer_full_stack_df,
        middle_Developer_mobile_df,
        middle_DevOps_specialist_df,
        middle_UX_UI_Designer_df,
    ]),
    ('senior', senior_filtered_devtype_tool_df, [
        senior_data_business_analyst_df,
        senior_database_administrator_df,
        senior_developer_advocate_df,
        senior_Developer_back_end_df,
        senior_Developer_front_end_df,
        senior_Developer_full_stack_df,
        senior_Developer_mobile_df,
        senior_DevOps_specialist_df,
        senior_UX_UI_Designer_df,
    ]),
]

for level, tool_df, ide_frames in tool_merge_plan:
    merged_df = tool_df
    for devtype, ide_df in zip(tool_ide_devtypes, ide_frames):
        # Rename the two ide columns so nothing collides with the tools
        # frame's own 'Count' and the helper columns can be dropped by name.
        ide_counts = ide_df.select(
            F.col('ide_main').alias('_ide_name'),
            F.col('Count').alias('_ide_count'),
        )
        is_match = (F.col('Tools') == F.col('_ide_name')) & (F.col('DevType') == devtype)
        merged_count = (
            F.when(F.col('_ide_name').isNull(), F.col('Count'))
            .otherwise(F.col('Count') + F.col('_ide_count'))
        )
        merged_df = (
            merged_df
            .join(ide_counts, is_match, 'left')
            .withColumn('Count', merged_count)  # replaces Count in place, keeping column order
            .drop('_ide_name', '_ide_count')
        )
    save_grouped_data_to_csv(merged_df, 'DevType', ['DevType', 'YearsGroup'], level)

name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name :  Developer, back-end
name :  Developer, front-end
name :  Developer, full-stack
name :  Developer, game or graphics
name :  Developer, mobile
name :  Engineer, data
name :  Product manager
name :  Data or business analyst
name :  Data scientist or machine learning specialist
name :  Database administrator
name :  DevOps specialist
name :  Developer Advocate
name :  Developer, QA or test
name 

## Job_code

In [8]:
# File-name slug of each developer role, in the order the frames are unpacked below.
job_code_role_slugs = [
    'Data-or-business-analyst',
    'Database-administrator',
    'Developer-Advocate',
    'Developer-back-end',
    'Developer-front-end',
    'Developer-full-stack',
    'Developer-mobile',
    'DevOps-specialist',
    'UX-UI-Designer',
]


def _load_job_code_frames(level):
    """Read the job_code percentage CSV of every role for one position level.

    No `header` option is passed, exactly as before, so the reader's default
    applies (transform_and_save_data refers to the column as '_c1').

    Args:
        level: 'junior', 'middle' or 'senior'; used both as the directory name
            and as the file-name prefix.

    Returns:
        A list of Spark DataFrames, one per entry of job_code_role_slugs and in
        the same order.
    """
    return [
        spark.read.csv(f'../created_csv/{level}/{level}_{slug}_job_code_percentages_df.csv')
        for slug in job_code_role_slugs
    ]


# The original variable names are kept; paths are built from level + slug
# instead of being typed out 27 times.
(
    junior_data_business_analyst_df,
    junior_database_administrator_df,
    junior_developer_advocate_df,
    junior_Developer_back_end_df,
    junior_Developer_front_end_df,
    junior_Developer_full_stack_df,
    junior_Developer_mobile_df,
    junior_DevOps_specialist_df,
    junior_UX_UI_Designer_df,
) = _load_job_code_frames('junior')

(
    middle_data_business_analyst_df,
    middle_database_administrator_df,
    middle_developer_advocate_df,
    middle_Developer_back_end_df,
    middle_Developer_front_end_df,
    middle_Developer_full_stack_df,
    middle_Developer_mobile_df,
    middle_DevOps_specialist_df,
    middle_UX_UI_Designer_df,
) = _load_job_code_frames('middle')

(
    senior_data_business_analyst_df,
    senior_database_administrator_df,
    senior_developer_advocate_df,
    senior_Developer_back_end_df,
    senior_Developer_front_end_df,
    senior_Developer_full_stack_df,
    senior_Developer_mobile_df,
    senior_DevOps_specialist_df,
    senior_UX_UI_Designer_df,
) = _load_job_code_frames('senior')

# Rank and save every job_code frame.
#
# The file_name argument is now derived from level + role slug. The hand-written
# call list passed 'middler_UX-UI-Designer_job_code_percentages_df' for the
# middle UX/UI frame, which did not match the 'middle_...' name of the file the
# frame was read from.

# File-name slug of each developer role, aligned with the frame lists below.
job_code_transform_slugs = [
    'Data-or-business-analyst',
    'Database-administrator',
    'Developer-Advocate',
    'Developer-back-end',
    'Developer-front-end',
    'Developer-full-stack',
    'Developer-mobile',
    'DevOps-specialist',
    'UX-UI-Designer',
]

# (position level, frames in slug order); processed junior -> middle -> senior as before.
job_code_frames_by_level = [
    ('junior', [
        junior_data_business_analyst_df,
        junior_database_administrator_df,
        junior_developer_advocate_df,
        junior_Developer_back_end_df,
        junior_Developer_front_end_df,
        junior_Developer_full_stack_df,
        junior_Developer_mobile_df,
        junior_DevOps_specialist_df,
        junior_UX_UI_Designer_df,
    ]),
    ('middle', [
        middle_data_business_analyst_df,
        middle_database_administrator_df,
        middle_developer_advocate_df,
        middle_Developer_back_end_df,
        middle_Developer_front_end_df,
        middle_Developer_full_stack_df,
        middle_Developer_mobile_df,
        middle_DevOps_specialist_df,
        middle_UX_UI_Designer_df,
    ]),
    ('senior', [
        senior_data_business_analyst_df,
        senior_database_administrator_df,
        senior_developer_advocate_df,
        senior_Developer_back_end_df,
        senior_Developer_front_end_df,
        senior_Developer_full_stack_df,
        senior_Developer_mobile_df,
        senior_DevOps_specialist_df,
        senior_UX_UI_Designer_df,
    ]),
]

for level, frames in job_code_frames_by_level:
    for slug, frame in zip(job_code_transform_slugs, frames):
        transform_and_save_data(frame, level, f'{level}_{slug}_job_code_percentages_df')

24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

## lifestyle_sleep

In [10]:
# File-name slug of each developer role, in the order the frames are unpacked below.
lifestyle_sleep_role_slugs = [
    'Data-or-business-analyst',
    'Database-administrator',
    'Developer-Advocate',
    'Developer-back-end',
    'Developer-front-end',
    'Developer-full-stack',
    'Developer-mobile',
    'DevOps-specialist',
    'UX-UI-Designer',
]


def _load_lifestyle_sleep_frames(level):
    """Read the lifestyle_sleep percentage CSV of every role for one position level.

    No `header` option is passed, exactly as before, so the reader's default
    applies (transform_and_save_data refers to the column as '_c1').

    Args:
        level: 'junior', 'middle' or 'senior'; used both as the directory name
            and as the file-name prefix.

    Returns:
        A list of Spark DataFrames, one per entry of lifestyle_sleep_role_slugs
        and in the same order.
    """
    return [
        spark.read.csv(f'../created_csv/{level}/{level}_{slug}_lifestyle_sleep_percentages_df.csv')
        for slug in lifestyle_sleep_role_slugs
    ]


# The original variable names are kept; paths are built from level + slug
# instead of being typed out 27 times.
(
    junior_data_business_analyst_df,
    junior_database_administrator_df,
    junior_developer_advocate_df,
    junior_Developer_back_end_df,
    junior_Developer_front_end_df,
    junior_Developer_full_stack_df,
    junior_Developer_mobile_df,
    junior_DevOps_specialist_df,
    junior_UX_UI_Designer_df,
) = _load_lifestyle_sleep_frames('junior')

(
    middle_data_business_analyst_df,
    middle_database_administrator_df,
    middle_developer_advocate_df,
    middle_Developer_back_end_df,
    middle_Developer_front_end_df,
    middle_Developer_full_stack_df,
    middle_Developer_mobile_df,
    middle_DevOps_specialist_df,
    middle_UX_UI_Designer_df,
) = _load_lifestyle_sleep_frames('middle')

(
    senior_data_business_analyst_df,
    senior_database_administrator_df,
    senior_developer_advocate_df,
    senior_Developer_back_end_df,
    senior_Developer_front_end_df,
    senior_Developer_full_stack_df,
    senior_Developer_mobile_df,
    senior_DevOps_specialist_df,
    senior_UX_UI_Designer_df,
) = _load_lifestyle_sleep_frames('senior')

# Rank and save every lifestyle_sleep frame.
#
# The 27 hand-copied calls are replaced by one loop; the file_name argument is
# derived from level + role slug, so it cannot drift from the level the frame
# belongs to. Calls, order and arguments are the same as before.

# File-name slug of each developer role, aligned with the frame lists below.
lifestyle_sleep_transform_slugs = [
    'Data-or-business-analyst',
    'Database-administrator',
    'Developer-Advocate',
    'Developer-back-end',
    'Developer-front-end',
    'Developer-full-stack',
    'Developer-mobile',
    'DevOps-specialist',
    'UX-UI-Designer',
]

# (position level, frames in slug order); processed junior -> middle -> senior as before.
lifestyle_sleep_frames_by_level = [
    ('junior', [
        junior_data_business_analyst_df,
        junior_database_administrator_df,
        junior_developer_advocate_df,
        junior_Developer_back_end_df,
        junior_Developer_front_end_df,
        junior_Developer_full_stack_df,
        junior_Developer_mobile_df,
        junior_DevOps_specialist_df,
        junior_UX_UI_Designer_df,
    ]),
    ('middle', [
        middle_data_business_analyst_df,
        middle_database_administrator_df,
        middle_developer_advocate_df,
        middle_Developer_back_end_df,
        middle_Developer_front_end_df,
        middle_Developer_full_stack_df,
        middle_Developer_mobile_df,
        middle_DevOps_specialist_df,
        middle_UX_UI_Designer_df,
    ]),
    ('senior', [
        senior_data_business_analyst_df,
        senior_database_administrator_df,
        senior_developer_advocate_df,
        senior_Developer_back_end_df,
        senior_Developer_front_end_df,
        senior_Developer_full_stack_df,
        senior_Developer_mobile_df,
        senior_DevOps_specialist_df,
        senior_UX_UI_Designer_df,
    ]),
]

for level, frames in lifestyle_sleep_frames_by_level:
    for slug, frame in zip(lifestyle_sleep_transform_slugs, frames):
        transform_and_save_data(frame, level, f'{level}_{slug}_lifestyle_sleep_percentages_df')

24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

## productive_to_job

In [11]:
# Productive-to-job percentages.
# Step 1: open the per-role CSV for every seniority level. The files are read
# with spark.read.csv and no extra options, and each DataFrame is bound to its
# own name.
# Step 2: once all inputs are open, pass each DataFrame to
# transform_and_save_data together with its level and the stem of its file.

# --- junior inputs ---
junior_data_business_analyst_df = spark.read.csv('../created_csv/junior/junior_Data-or-business-analyst_productive_to_job_percentages_df.csv')
junior_database_administrator_df = spark.read.csv('../created_csv/junior/junior_Database-administrator_productive_to_job_percentages_df.csv')
junior_developer_advocate_df = spark.read.csv('../created_csv/junior/junior_Developer-Advocate_productive_to_job_percentages_df.csv')
junior_Developer_back_end_df = spark.read.csv('../created_csv/junior/junior_Developer-back-end_productive_to_job_percentages_df.csv')
junior_Developer_front_end_df = spark.read.csv('../created_csv/junior/junior_Developer-front-end_productive_to_job_percentages_df.csv')
junior_Developer_full_stack_df = spark.read.csv('../created_csv/junior/junior_Developer-full-stack_productive_to_job_percentages_df.csv')
junior_Developer_mobile_df = spark.read.csv('../created_csv/junior/junior_Developer-mobile_productive_to_job_percentages_df.csv')
junior_DevOps_specialist_df = spark.read.csv('../created_csv/junior/junior_DevOps-specialist_productive_to_job_percentages_df.csv')
junior_UX_UI_Designer_df = spark.read.csv('../created_csv/junior/junior_UX-UI-Designer_productive_to_job_percentages_df.csv')

# --- middle inputs ---
middle_data_business_analyst_df = spark.read.csv('../created_csv/middle/middle_Data-or-business-analyst_productive_to_job_percentages_df.csv')
middle_database_administrator_df = spark.read.csv('../created_csv/middle/middle_Database-administrator_productive_to_job_percentages_df.csv')
middle_developer_advocate_df = spark.read.csv('../created_csv/middle/middle_Developer-Advocate_productive_to_job_percentages_df.csv')
middle_Developer_back_end_df = spark.read.csv('../created_csv/middle/middle_Developer-back-end_productive_to_job_percentages_df.csv')
middle_Developer_front_end_df = spark.read.csv('../created_csv/middle/middle_Developer-front-end_productive_to_job_percentages_df.csv')
middle_Developer_full_stack_df = spark.read.csv('../created_csv/middle/middle_Developer-full-stack_productive_to_job_percentages_df.csv')
middle_Developer_mobile_df = spark.read.csv('../created_csv/middle/middle_Developer-mobile_productive_to_job_percentages_df.csv')
middle_DevOps_specialist_df = spark.read.csv('../created_csv/middle/middle_DevOps-specialist_productive_to_job_percentages_df.csv')
middle_UX_UI_Designer_df = spark.read.csv('../created_csv/middle/middle_UX-UI-Designer_productive_to_job_percentages_df.csv')

# --- senior inputs ---
senior_data_business_analyst_df = spark.read.csv('../created_csv/senior/senior_Data-or-business-analyst_productive_to_job_percentages_df.csv')
senior_database_administrator_df = spark.read.csv('../created_csv/senior/senior_Database-administrator_productive_to_job_percentages_df.csv')
senior_developer_advocate_df = spark.read.csv('../created_csv/senior/senior_Developer-Advocate_productive_to_job_percentages_df.csv')
senior_Developer_back_end_df = spark.read.csv('../created_csv/senior/senior_Developer-back-end_productive_to_job_percentages_df.csv')
senior_Developer_front_end_df = spark.read.csv('../created_csv/senior/senior_Developer-front-end_productive_to_job_percentages_df.csv')
senior_Developer_full_stack_df = spark.read.csv('../created_csv/senior/senior_Developer-full-stack_productive_to_job_percentages_df.csv')
senior_Developer_mobile_df = spark.read.csv('../created_csv/senior/senior_Developer-mobile_productive_to_job_percentages_df.csv')
senior_DevOps_specialist_df = spark.read.csv('../created_csv/senior/senior_DevOps-specialist_productive_to_job_percentages_df.csv')
senior_UX_UI_Designer_df = spark.read.csv('../created_csv/senior/senior_UX-UI-Designer_productive_to_job_percentages_df.csv')

# (DataFrame, position level, file stem) triples, processed in the order
# junior -> middle -> senior.
productive_to_job_runs = [
    (junior_data_business_analyst_df, 'junior', 'junior_Data-or-business-analyst_productive_to_job_percentages_df'),
    (junior_database_administrator_df, 'junior', 'junior_Database-administrator_productive_to_job_percentages_df'),
    (junior_developer_advocate_df, 'junior', 'junior_Developer-Advocate_productive_to_job_percentages_df'),
    (junior_Developer_back_end_df, 'junior', 'junior_Developer-back-end_productive_to_job_percentages_df'),
    (junior_Developer_front_end_df, 'junior', 'junior_Developer-front-end_productive_to_job_percentages_df'),
    (junior_Developer_full_stack_df, 'junior', 'junior_Developer-full-stack_productive_to_job_percentages_df'),
    (junior_Developer_mobile_df, 'junior', 'junior_Developer-mobile_productive_to_job_percentages_df'),
    (junior_DevOps_specialist_df, 'junior', 'junior_DevOps-specialist_productive_to_job_percentages_df'),
    (junior_UX_UI_Designer_df, 'junior', 'junior_UX-UI-Designer_productive_to_job_percentages_df'),

    (middle_data_business_analyst_df, 'middle', 'middle_Data-or-business-analyst_productive_to_job_percentages_df'),
    (middle_database_administrator_df, 'middle', 'middle_Database-administrator_productive_to_job_percentages_df'),
    (middle_developer_advocate_df, 'middle', 'middle_Developer-Advocate_productive_to_job_percentages_df'),
    (middle_Developer_back_end_df, 'middle', 'middle_Developer-back-end_productive_to_job_percentages_df'),
    (middle_Developer_front_end_df, 'middle', 'middle_Developer-front-end_productive_to_job_percentages_df'),
    (middle_Developer_full_stack_df, 'middle', 'middle_Developer-full-stack_productive_to_job_percentages_df'),
    (middle_Developer_mobile_df, 'middle', 'middle_Developer-mobile_productive_to_job_percentages_df'),
    (middle_DevOps_specialist_df, 'middle', 'middle_DevOps-specialist_productive_to_job_percentages_df'),
    (middle_UX_UI_Designer_df, 'middle', 'middle_UX-UI-Designer_productive_to_job_percentages_df'),

    (senior_data_business_analyst_df, 'senior', 'senior_Data-or-business-analyst_productive_to_job_percentages_df'),
    (senior_database_administrator_df, 'senior', 'senior_Database-administrator_productive_to_job_percentages_df'),
    (senior_developer_advocate_df, 'senior', 'senior_Developer-Advocate_productive_to_job_percentages_df'),
    (senior_Developer_back_end_df, 'senior', 'senior_Developer-back-end_productive_to_job_percentages_df'),
    (senior_Developer_front_end_df, 'senior', 'senior_Developer-front-end_productive_to_job_percentages_df'),
    (senior_Developer_full_stack_df, 'senior', 'senior_Developer-full-stack_productive_to_job_percentages_df'),
    (senior_Developer_mobile_df, 'senior', 'senior_Developer-mobile_productive_to_job_percentages_df'),
    (senior_DevOps_specialist_df, 'senior', 'senior_DevOps-specialist_productive_to_job_percentages_df'),
    (senior_UX_UI_Designer_df, 'senior', 'senior_UX-UI-Designer_productive_to_job_percentages_df'),
]

for _ptj_frame, _ptj_level, _ptj_stem in productive_to_job_runs:
    transform_and_save_data(_ptj_frame, _ptj_level, _ptj_stem)

24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

### learn_time

In [12]:
# Learn-time percentages.
# Step 1: open the per-role learn_time CSV for every seniority level, binding
# each DataFrame to its own name.
# Step 2: pass each DataFrame to transform_and_save_data together with its
# level and the stem of the file it was read from.
#
# The stem is assembled from the level and the role with the
# 'learn_time_percentages_df' suffix written once, so it cannot drift away from
# the input file name. Passing a stem that names a different dataset (for
# example a 'productive_to_job' stem for learn_time data) or a misspelled
# level/role would mislabel the result, because transform_and_save_data
# derives the developer name from this argument.

# --- junior inputs ---
junior_data_business_analyst_df = spark.read.csv('../created_csv/junior/junior_Data-or-business-analyst_learn_time_percentages_df.csv')
junior_database_administrator_df = spark.read.csv('../created_csv/junior/junior_Database-administrator_learn_time_percentages_df.csv')
junior_developer_advocate_df = spark.read.csv('../created_csv/junior/junior_Developer-Advocate_learn_time_percentages_df.csv')
junior_Developer_back_end_df = spark.read.csv('../created_csv/junior/junior_Developer-back-end_learn_time_percentages_df.csv')
junior_Developer_front_end_df = spark.read.csv('../created_csv/junior/junior_Developer-front-end_learn_time_percentages_df.csv')
junior_Developer_full_stack_df = spark.read.csv('../created_csv/junior/junior_Developer-full-stack_learn_time_percentages_df.csv')
junior_Developer_mobile_df = spark.read.csv('../created_csv/junior/junior_Developer-mobile_learn_time_percentages_df.csv')
junior_DevOps_specialist_df = spark.read.csv('../created_csv/junior/junior_DevOps-specialist_learn_time_percentages_df.csv')
junior_UX_UI_Designer_df = spark.read.csv('../created_csv/junior/junior_UX-UI-Designer_learn_time_percentages_df.csv')

# --- senior inputs ---
senior_data_business_analyst_df = spark.read.csv('../created_csv/senior/senior_Data-or-business-analyst_learn_time_percentages_df.csv')
senior_database_administrator_df = spark.read.csv('../created_csv/senior/senior_Database-administrator_learn_time_percentages_df.csv')
senior_developer_advocate_df = spark.read.csv('../created_csv/senior/senior_Developer-Advocate_learn_time_percentages_df.csv')
senior_Developer_back_end_df = spark.read.csv('../created_csv/senior/senior_Developer-back-end_learn_time_percentages_df.csv')
senior_Developer_front_end_df = spark.read.csv('../created_csv/senior/senior_Developer-front-end_learn_time_percentages_df.csv')
senior_Developer_full_stack_df = spark.read.csv('../created_csv/senior/senior_Developer-full-stack_learn_time_percentages_df.csv')
senior_Developer_mobile_df = spark.read.csv('../created_csv/senior/senior_Developer-mobile_learn_time_percentages_df.csv')
senior_DevOps_specialist_df = spark.read.csv('../created_csv/senior/senior_DevOps-specialist_learn_time_percentages_df.csv')
senior_UX_UI_Designer_df = spark.read.csv('../created_csv/senior/senior_UX-UI-Designer_learn_time_percentages_df.csv')

# --- middle inputs ---
middle_data_business_analyst_df = spark.read.csv('../created_csv/middle/middle_Data-or-business-analyst_learn_time_percentages_df.csv')
middle_database_administrator_df = spark.read.csv('../created_csv/middle/middle_Database-administrator_learn_time_percentages_df.csv')
middle_developer_advocate_df = spark.read.csv('../created_csv/middle/middle_Developer-Advocate_learn_time_percentages_df.csv')
middle_Developer_back_end_df = spark.read.csv('../created_csv/middle/middle_Developer-back-end_learn_time_percentages_df.csv')
middle_Developer_front_end_df = spark.read.csv('../created_csv/middle/middle_Developer-front-end_learn_time_percentages_df.csv')
middle_Developer_full_stack_df = spark.read.csv('../created_csv/middle/middle_Developer-full-stack_learn_time_percentages_df.csv')
middle_Developer_mobile_df = spark.read.csv('../created_csv/middle/middle_Developer-mobile_learn_time_percentages_df.csv')
middle_DevOps_specialist_df = spark.read.csv('../created_csv/middle/middle_DevOps-specialist_learn_time_percentages_df.csv')
middle_UX_UI_Designer_df = spark.read.csv('../created_csv/middle/middle_UX-UI-Designer_learn_time_percentages_df.csv')

# (DataFrame, position level, role slug) triples, processed in the order
# junior -> middle -> senior.
learn_time_runs = [
    (junior_data_business_analyst_df, 'junior', 'Data-or-business-analyst'),
    (junior_database_administrator_df, 'junior', 'Database-administrator'),
    (junior_developer_advocate_df, 'junior', 'Developer-Advocate'),
    (junior_Developer_back_end_df, 'junior', 'Developer-back-end'),
    (junior_Developer_front_end_df, 'junior', 'Developer-front-end'),
    (junior_Developer_full_stack_df, 'junior', 'Developer-full-stack'),
    (junior_Developer_mobile_df, 'junior', 'Developer-mobile'),
    (junior_DevOps_specialist_df, 'junior', 'DevOps-specialist'),
    (junior_UX_UI_Designer_df, 'junior', 'UX-UI-Designer'),

    (middle_data_business_analyst_df, 'middle', 'Data-or-business-analyst'),
    (middle_database_administrator_df, 'middle', 'Database-administrator'),
    (middle_developer_advocate_df, 'middle', 'Developer-Advocate'),
    (middle_Developer_back_end_df, 'middle', 'Developer-back-end'),
    (middle_Developer_front_end_df, 'middle', 'Developer-front-end'),
    (middle_Developer_full_stack_df, 'middle', 'Developer-full-stack'),
    (middle_Developer_mobile_df, 'middle', 'Developer-mobile'),
    (middle_DevOps_specialist_df, 'middle', 'DevOps-specialist'),
    (middle_UX_UI_Designer_df, 'middle', 'UX-UI-Designer'),

    (senior_data_business_analyst_df, 'senior', 'Data-or-business-analyst'),
    (senior_database_administrator_df, 'senior', 'Database-administrator'),
    (senior_developer_advocate_df, 'senior', 'Developer-Advocate'),
    (senior_Developer_back_end_df, 'senior', 'Developer-back-end'),
    (senior_Developer_front_end_df, 'senior', 'Developer-front-end'),
    (senior_Developer_full_stack_df, 'senior', 'Developer-full-stack'),
    (senior_Developer_mobile_df, 'senior', 'Developer-mobile'),
    (senior_DevOps_specialist_df, 'senior', 'DevOps-specialist'),
    (senior_UX_UI_Designer_df, 'senior', 'UX-UI-Designer'),
]

for _lt_frame, _lt_level, _lt_role in learn_time_runs:
    transform_and_save_data(
        _lt_frame,
        _lt_level,
        f'{_lt_level}_{_lt_role}_learn_time_percentages_df',
    )

24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1

24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 13:03:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/16 1