In [1]:
import pandas as pd
import numpy as np
import collections
import catboost as cb

## This is a notebook I created working along to Matt Harrison's Effect Pandas chapter 29

In [2]:
def get_uniq_cols(jb):
    counter = collections.defaultdict(list)
    for col in sorted(jb.columns):
        period_count = col.count('.')
        if period_count >= 2:
            part_end = 2
        else:
            part_end = 1
        parts = col.split('.')[:part_end]
        counter['.'.join(parts)].append(col)
    uniq_cols = []
    for cols in counter.values():
        if len(cols) == 1:
            uniq_cols.extend(cols)
    return uniq_cols

In [3]:
def prep_for_ml(df):
    # remove pandas types
    return (df
     .assign(**{col:df[col].astype(float) 
               for col in df.select_dtypes('number')},
             **{col:df[col].astype(str).fillna('') 
               for col in df.select_dtypes(['object', 'category'])})
    )

In [4]:
def predict_col(df, col):
    df = prep_for_ml(df)
    missing = df.query(f'~{col}.isna()')
    cat_idx = []
    for i,typ in enumerate(df.drop(columns=[col]).dtypes):
        if str(typ) == 'object':
            cat_idx.append(i)
    X = (missing
         .drop(columns=[col])
         .values
        )
    y = missing[col]
    model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
    model.fit(X, y, cat_features=cat_idx)
    pred = model.predict(df.drop(columns=[col]))
    return df[col].where(~df[col].isna(), pred)

In [5]:
def tweak_jb(jb):
     uniq_cols = get_uniq_cols(jb)
     return (jb
         [uniq_cols]
         .rename(columns=lambda c: c.replace('.', '_'))
         .assign(age=lambda df_:df_.age.str.slice(0,2).astype(float)
                    .astype('Int64'),
                are_you_datascientist=lambda df_:df_
                    .are_you_datascientist
                    .replace({'Yes': True, 'No': False, np.nan: False}),
                company_size=lambda df_:df_.company_size.replace({
                    'Just me': 1, 'Not sure': np.nan,
                    'More than 5,000': 5000, '2–10': 2, '11–50':11,
                    '51–500': 51, '501–1,000':501,
                    '1,001–5,000':1001}).astype('Int64'),
                country_live=lambda df_:df_.country_live
                    .astype('category'),
                employment_status=lambda df_:df_.employment_status
                     .fillna('Other').astype('category'),
                is_python_main=lambda df_:df_.is_python_main
                     .astype('category'),
                team_size=lambda df_:df_.team_size
                    .str.split(r'-', n=1, expand=True)
                    .iloc[:,0].replace('More than 40 people', 41)
                    .where(df_.company_size!=1, 1).astype(float),
                years_of_coding=lambda df_:df_.years_of_coding
                    .replace('Less than 1 year', .5)
                    .str.extract(r'(\d+)').astype(float),
                python_years=lambda df_:df_.python_years
                    .replace('Less than 1 year', .5)
                    .str.extract(r'(\d+)').astype(float),
                python3_ver=lambda df_:df_.python3_version_most
                     .str.replace('_', '.').str.extract(r'(\d\.\d)')
                     .astype(float),
                use_python_most=lambda df_:df_.use_python_most
                     .fillna('Unknown')
               )
        .assign(team_size=lambda df_:predict_col(df_, 'team_size')
             .astype(int))
        .drop(columns=['python2_version_most'])
        .dropna()
    )    
url = 'https://github.com/mattharrison/datasets/raw/master/data/'\
   '2020-jetbrains-python-survey.csv'
jb = pd.read_csv(url)
jb2 = tweak_jb(jb)

  jb = pd.read_csv(url)


Learning rate set to 0.5
0:	learn: 2.9695218	total: 92.7ms	remaining: 1.76s
1:	learn: 2.8766539	total: 130ms	remaining: 1.17s
2:	learn: 2.8387189	total: 168ms	remaining: 953ms
3:	learn: 2.8028751	total: 202ms	remaining: 808ms
4:	learn: 2.7899957	total: 234ms	remaining: 703ms
5:	learn: 2.7749439	total: 266ms	remaining: 621ms
6:	learn: 2.7719128	total: 294ms	remaining: 547ms
7:	learn: 2.7649792	total: 325ms	remaining: 488ms
8:	learn: 2.7649588	total: 350ms	remaining: 427ms
9:	learn: 2.7630617	total: 379ms	remaining: 379ms
10:	learn: 2.7625779	total: 410ms	remaining: 335ms
11:	learn: 2.7515902	total: 440ms	remaining: 293ms
12:	learn: 2.7513459	total: 470ms	remaining: 253ms
13:	learn: 2.7445634	total: 499ms	remaining: 214ms
14:	learn: 2.7443257	total: 527ms	remaining: 176ms
15:	learn: 2.7423142	total: 556ms	remaining: 139ms
16:	learn: 2.7419144	total: 585ms	remaining: 103ms
17:	learn: 2.7399388	total: 616ms	remaining: 68.5ms
18:	learn: 2.7384297	total: 647ms	remaining: 34ms
19:	learn: 2.73

In [6]:
pd.crosstab(index=jb2.country_live, columns=jb2.age)

age,18,21,30,40,50,60
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Algeria,2,7,5,3,0,1
Argentina,1,38,44,20,5,1
Armenia,1,13,3,0,0,0
Australia,4,58,110,63,30,9
Austria,1,31,62,22,12,0
...,...,...,...,...,...,...
United States,40,753,1042,478,264,120
Uruguay,0,6,13,1,0,0
Uzbekistan,0,4,0,0,0,0
Venezuela,1,10,4,5,2,0


In [7]:
pd.crosstab(index=jb2.country_live, columns=jb2.age, margins=True)

age,18,21,30,40,50,60,All
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Algeria,2,7,5,3,0,1,18
Argentina,1,38,44,20,5,1,109
Armenia,1,13,3,0,0,0,17
Australia,4,58,110,63,30,9,274
Austria,1,31,62,22,12,0,128
...,...,...,...,...,...,...,...
Uruguay,0,6,13,1,0,0,20
Uzbekistan,0,4,0,0,0,0,4
Venezuela,1,10,4,5,2,0,22
Viet Nam,1,26,4,1,0,1,33


In [8]:
pd.crosstab(index=jb2.country_live, columns=jb2.age, normalize=True)

age,18,21,30,40,50,60
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Algeria,0.000146,0.000511,0.000365,0.000219,0.000000,0.000073
Argentina,0.000073,0.002771,0.003209,0.001459,0.000365,0.000073
Armenia,0.000073,0.000948,0.000219,0.000000,0.000000,0.000000
Australia,0.000292,0.004230,0.008023,0.004595,0.002188,0.000656
Austria,0.000073,0.002261,0.004522,0.001605,0.000875,0.000000
...,...,...,...,...,...,...
United States,0.002917,0.054919,0.075997,0.034863,0.019255,0.008752
Uruguay,0.000000,0.000438,0.000948,0.000073,0.000000,0.000000
Uzbekistan,0.000000,0.000292,0.000000,0.000000,0.000000,0.000000
Venezuela,0.000073,0.000729,0.000292,0.000365,0.000146,0.000000


In [9]:
pd.crosstab(index=jb2.country_live, columns=jb2.age, normalize='columns')

age,18,21,30,40,50,60
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Algeria,0.006349,0.001328,0.000989,0.001479,0.000000,0.004505
Argentina,0.003175,0.007211,0.008706,0.009862,0.006083,0.004505
Armenia,0.003175,0.002467,0.000594,0.000000,0.000000,0.000000
Australia,0.012698,0.011006,0.021765,0.031065,0.036496,0.040541
Austria,0.003175,0.005882,0.012268,0.010848,0.014599,0.000000
...,...,...,...,...,...,...
United States,0.126984,0.142884,0.206173,0.235700,0.321168,0.540541
Uruguay,0.000000,0.001139,0.002572,0.000493,0.000000,0.000000
Uzbekistan,0.000000,0.000759,0.000000,0.000000,0.000000,0.000000
Venezuela,0.003175,0.001898,0.000791,0.002465,0.002433,0.000000


In [10]:
pd.crosstab(index=jb2.country_live, columns=jb2.age, normalize='index')

age,18,21,30,40,50,60
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Algeria,0.111111,0.388889,0.277778,0.166667,0.000000,0.055556
Argentina,0.009174,0.348624,0.403670,0.183486,0.045872,0.009174
Armenia,0.058824,0.764706,0.176471,0.000000,0.000000,0.000000
Australia,0.014599,0.211679,0.401460,0.229927,0.109489,0.032847
Austria,0.007812,0.242188,0.484375,0.171875,0.093750,0.000000
...,...,...,...,...,...,...
United States,0.014831,0.279199,0.386355,0.177234,0.097887,0.044494
Uruguay,0.000000,0.300000,0.650000,0.050000,0.000000,0.000000
Uzbekistan,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Venezuela,0.045455,0.454545,0.181818,0.227273,0.090909,0.000000


In [11]:
(pd.crosstab(index=[jb2.country_live, jb2.age],
            columns=[jb2.use_python_most, jb2.python3_version_most])
.loc[['United States']]
)

Unnamed: 0_level_0,use_python_most,Computer graphics,Computer graphics,Computer graphics,Computer graphics,Computer graphics,Data analysis,Data analysis,Data analysis,Data analysis,Data analysis,...,Unknown,Unknown,Unknown,Unknown,Unknown,Web development,Web development,Web development,Web development,Web development
Unnamed: 0_level_1,python3_version_most,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9,...,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9
country_live,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
United States,18,0,0,0,0,0,0,0,1,5,0,...,0,1,0,1,0,0,0,1,4,0
United States,21,0,0,1,1,1,1,18,48,64,11,...,0,10,13,11,4,3,28,54,81,4
United States,30,0,0,0,1,0,3,29,66,90,12,...,0,7,19,14,4,3,60,77,129,14
United States,40,0,0,0,3,0,0,14,30,45,3,...,2,5,5,2,4,1,14,26,66,8
United States,50,0,0,1,2,0,2,6,26,36,1,...,1,2,3,6,0,0,10,12,14,2
United States,60,0,0,1,0,0,0,1,11,11,1,...,0,4,2,3,0,0,3,5,5,1


In [12]:
(pd.crosstab(index=[jb2.country_live, jb2.age],
            columns=[jb2.use_python_most, jb2.python3_version_most])
.loc[['United States'], ['Data analysis', 'Web development']]
)

Unnamed: 0_level_0,use_python_most,Data analysis,Data analysis,Data analysis,Data analysis,Data analysis,Web development,Web development,Web development,Web development,Web development
Unnamed: 0_level_1,python3_version_most,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9
country_live,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
United States,18,0,0,1,5,0,0,0,1,4,0
United States,21,1,18,48,64,11,3,28,54,81,4
United States,30,3,29,66,90,12,3,60,77,129,14
United States,40,0,14,30,45,3,1,14,26,66,8
United States,50,2,6,26,36,1,0,10,12,14,2
United States,60,0,1,11,11,1,0,3,5,5,1


In [13]:
(pd.crosstab(index=[jb2.country_live, jb2.age],
            columns=[jb2.use_python_most, jb2.python3_version_most])
.loc[['United States'], ['Data analysis', 'Web development']]
.style.background_gradient(cmap='viridis', axis=None)
)

Unnamed: 0_level_0,use_python_most,Data analysis,Data analysis,Data analysis,Data analysis,Data analysis,Web development,Web development,Web development,Web development,Web development
Unnamed: 0_level_1,python3_version_most,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9,Python 3_5 or lower,Python 3_6,Python 3_7,Python 3_8,Python 3_9
country_live,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
United States,18,0,0,1,5,0,0,0,1,4,0
United States,21,1,18,48,64,11,3,28,54,81,4
United States,30,3,29,66,90,12,3,60,77,129,14
United States,40,0,14,30,45,3,1,14,26,66,8
United States,50,2,6,26,36,1,0,10,12,14,2
United States,60,0,1,11,11,1,0,3,5,5,1
