# Chapter 21

## Imports

In [2]:
# %load_ext lab_black

In [3]:
import pandas as pd
import numpy as np
import collections
import catboost as cb

## Functions

In [4]:
def get_unique_cols(jb):
    counter = collections.defaultdict(list)
    for col in sorted(jb.columns):
        period_count = col.count(".")
        if period_count >= 2:
            part_end = 2
        else:
            part_end = 1
        parts = col.split(".")[:part_end]
        counter[".".join(parts)].append(col)
    uniq_cols = []
    for cols in counter.values():
        if len(cols) == 1:
            uniq_cols.extend(cols)
    return uniq_cols

In [5]:
def prep_for_ml(df):
    # remove pandas types
    return df.assign(
        **{col: df[col].astype(float) for col in df.select_dtypes("number")},
        **{
            col: df[col].astype(str).fillna("")
            for col in df.select_dtypes(["object", "category"])
        }
    )

In [6]:
def predict_col(df, col):
    df = prep_for_ml(df)
    missing = df.query(f"~{col}.isna()")
    cat_idx = [
        i for i, typ in enumerate(df.drop(columns=[col]).dtypes) if str(typ) == "object"
    ]
    X = missing.drop(columns=[col]).values
    y = missing[col]
    model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
    model.fit(X, y, cat_features=cat_idx)
    pred = model.predict(df.drop(columns=[col]))
    return df[col].where(~df[col].isna(), pred)

In [7]:
def tweak_jb(jb):
    # for n-dash instead of hypen, need to use Numlock > Alt + 0 1 5 0
    uniq_cols = get_unique_cols(jb)
    return (
        jb[uniq_cols]
        .rename(columns=lambda c: c.replace(".", "_"))
        .assign(
            age=lambda df_: df_.age.str[0:2].astype("Int64"),
            are_you_datascientist=lambda df_: df_.are_you_datascientist.replace(
                {"Yes": True, "No": False, np.nan: False}
            ),
            company_size=lambda df_: df_.company_size.replace(
                {
                    "Just me": 1,
                    "Not sure": np.nan,
                    "More than 5,000": 5000,
                    "2–10": 2,
                    "11–50": 11,
                    "51–500": 51,
                    "501–1,000": 501,
                    "1,001–5,000": 1001,
                }
            ).astype("Int64"),
            country_live=lambda df_: df_.country_live.astype("category"),
            employment_status=lambda df_: df_.employment_status.fillna("Other").astype(
                "category"
            ),
            is_python_main=lambda df_: df_.is_python_main.astype("category"),
            team_size=lambda df_: df_.team_size.str.split(r"-", n=1, expand=True)
            .iloc[:, 0]
            .replace("More than 40 people", 41)
            .where(df_.company_size != 1, 1)
            .astype(float),
            years_of_coding=lambda df_: df_.years_of_coding.replace(
                "Less than 1 year", 0.5
            )
            .str.extract(r"(\d+)")
            .astype(float),
            python_years=lambda df_: df_.python_years.replace("Less than 1 year", 0.5)
            .str.extract(r"(\d+)")
            .astype(float),
            python3_ver=lambda df_: df_.python3_version_most.str.replace("_", ".")
            .str.extract(r"(\d\.\d)")
            .astype(float),
            use_python_most=lambda df_: df_.use_python_most.fillna("Unknown"),
        )
        .assign(team_size=lambda df_: predict_col(df_, "team_size").astype(int))
        .drop(columns=["python2_version_most"])
        .dropna()
    )

## Read Raw Data

In [8]:
url = "https://github.com/mattharrison/datasets/raw/master/data/2020-jetbrains-python-survey.csv"
jb = pd.read_csv(url)

  jb = pd.read_csv(url)


## Tweak Data

In [9]:
jb2 = tweak_jb(jb)

Learning rate set to 0.5
0:	learn: 2.9695218	total: 175ms	remaining: 3.32s
1:	learn: 2.8766539	total: 207ms	remaining: 1.86s
2:	learn: 2.8387189	total: 241ms	remaining: 1.37s
3:	learn: 2.8028751	total: 274ms	remaining: 1.09s
4:	learn: 2.7899957	total: 305ms	remaining: 916ms
5:	learn: 2.7749439	total: 336ms	remaining: 784ms
6:	learn: 2.7719128	total: 366ms	remaining: 680ms
7:	learn: 2.7649792	total: 395ms	remaining: 593ms
8:	learn: 2.7649588	total: 422ms	remaining: 516ms
9:	learn: 2.7630617	total: 452ms	remaining: 452ms
10:	learn: 2.7625779	total: 484ms	remaining: 396ms
11:	learn: 2.7515902	total: 514ms	remaining: 343ms
12:	learn: 2.7513459	total: 543ms	remaining: 292ms
13:	learn: 2.7445634	total: 572ms	remaining: 245ms
14:	learn: 2.7443257	total: 600ms	remaining: 200ms
15:	learn: 2.7423142	total: 629ms	remaining: 157ms
16:	learn: 2.7419143	total: 657ms	remaining: 116ms
17:	learn: 2.7399387	total: 690ms	remaining: 76.6ms
18:	learn: 2.7384296	total: 719ms	remaining: 37.9ms
19:	learn: 2.7

In [10]:
jb2

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
1,21,True,5000,India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6
2,30,False,5000,United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",4,DevOps / System administration / Writing autom...,3.0,3.6
10,21,False,51,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8
11,21,True,51,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9
13,30,True,5000,Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,1001,Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",5,Machine learning,6.0,3.6
54457,21,False,2,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",2,Data analysis,1.0,3.6
54459,21,False,1,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",1,Web development,6.0,3.7
54460,30,True,51,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",4,Data analysis,3.0,3.7


In [13]:
jb2.index

Int64Index([    1,     2,    10,    11,    13,    14,    15,    17,    22,
               25,
            ...
            54433, 54442, 54445, 54447, 54450, 54456, 54457, 54459, 54460,
            54461],
           dtype='int64', length=13711)

In [11]:
jb2.are_you_datascientist.unique()

array([True, False, 'Other'], dtype=object)

In [12]:
# jb2.to_csv('data/jet_brains_python_2020_survey_data_cleaned.csv')