- pandas does not scale well to big data. It was designed for small data sets that a single machine can handle.
- Koalas provides the pandas API on Apache Spark with the same syntax, so it can easily scale to multiple machines.
- https://koalas.readthedocs.io/en/latest/

In [1]:
import databricks.koalas as ks

In [2]:
# Load the sample user records from a Parquet file into a Koalas DataFrame.
koalas_df = ks.read_parquet('userdata1.parquet')

In [3]:
# Summary statistics; in the output below only id and salary are summarised.
koalas_df.describe()

Unnamed: 0,id,salary
count,1000.0,932.0
mean,500.5,149005.356652
std,288.819436,79785.176736
min,1.0,12380.49
25%,250.0,81503.73
50%,500.0,146980.49
75%,750.0,220086.62
max,1000.0,286592.99


In [4]:
# Preview the first rows to inspect the available columns and sample values.
koalas_df.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 13:25:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 22:34:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 06:39:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 06:06:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,
4,2016-02-03 10:35:31,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,


In [5]:
# Frequency of each gender value. The unlabeled row in the output appears to be
# a blank (empty-string) gender entry rather than a null.
koalas_df['gender'].value_counts()

Female    482
Male      451
           67
Name: gender, dtype: int64

In [6]:
# Number of users per country, most frequent first.
# value_counts() already returns the counts in descending order, so no extra
# sort_values() pass (an additional Spark sort) is needed.
koalas_df['country'].value_counts()

China                               189
Indonesia                            97
Russia                               62
Philippines                          45
Brazil                               38
Portugal                             38
France                               37
Poland                               35
Sweden                               25
Japan                                20
Canada                               19
United States                        17
Ukraine                              16
Peru                                 14
Colombia                             14
Argentina                            13
Thailand                             12
Czech Republic                       12
Finland                              11
Nigeria                              11
Mexico                               11
South Africa                          9
Malaysia                              8
Bosnia and Herzegovina                8
Kazakhstan                            7


In [7]:
# Selecting a single column yields a Koalas Series (not a pandas Series).
type(koalas_df['gender'])

databricks.koalas.series.Series

In [8]:
# Distinct job titles; the result is displayed with an integer index.
koalas_df['title'].unique()

0                  Systems Administrator II
1                         Media Manager III
2                        Recruiting Manager
3                              Geologist II
4                             Geologist III
5                 Database Administrator IV
6                         Financial Analyst
7                        Analyst Programmer
8                             Accountant IV
9                      Software Engineer II
10              Computer Systems Analyst IV
11                         Product Engineer
12                Software Test Engineer II
13                         Junior Executive
14                Systems Administrator III
15                             VP Marketing
16            Human Resources Assistant III
17                       Environmental Tech
18              Mechanical Systems Engineer
19                   Nuclear Power Engineer
20                      Assistant Professor
21              Information Systems Manager
22                      Executiv

In [9]:
# Count missing values per column.
# NOTE(review): gender reports 0 here although value_counts() shows blank
# entries, so blanks appear to be stored as empty strings, not nulls.
koalas_df.isna().sum()

registration_dttm     0
id                    0
first_name            0
last_name             0
email                 0
gender                0
ip_address            0
cc                    0
country               0
birthdate             0
salary               68
title                 0
comments              6
dtype: int64

In [10]:
# Show a few of the rows whose salary is missing.
missing_salary = koalas_df['salary'].isnull()
koalas_df[missing_salary].head(5)

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
4,2016-02-03 10:35:31,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850,South Africa,,,,
70,2016-02-04 03:46:45,71,Stephanie,Watkins,swatkins1y@rakuten.co.jp,,124.183.29.113,30552863095190,Burkina Faso,8/29/1971,,Physical Therapy Assistant,
104,2016-02-03 22:35:17,105,John,Stone,jstone2w@bbb.org,,116.149.171.213,3564400883560314,Bulgaria,4/28/1972,,Quality Engineer,
108,2016-02-04 00:47:47,109,Gloria,Hamilton,ghamilton30@webmd.com,,73.114.61.187,30159872455108,Canada,3/9/1988,,Systems Administrator IV,
119,2016-02-03 19:48:24,120,Kelly,Fuller,kfuller3b@webeden.co.uk,,104.13.230.181,6709741313285577939,China,2/27/1990,,Biostatistician I,


In [11]:
# Lower-case the first names with the built-in string accessor.
# Unlike apply(lambda x: x.lower()), .str.lower() passes missing values through
# instead of raising AttributeError, and avoids an untyped Python UDF.
koalas_df['first_name'] = koalas_df['first_name'].str.lower()

In [21]:
# Column dtypes; in the output below cc, birthdate and comments are plain objects (strings).
koalas_df.dtypes

registration_dttm    datetime64[ns]
id                            int32
first_name                   object
last_name                    object
email                        object
gender                       object
ip_address                   object
cc                           object
country                      object
birthdate                    object
salary                      float64
title                        object
comments                     object
dtype: object

In [36]:
# One-hot encode the gender column. drop_first=True discards the first level;
# judging by the output further down, the remaining indicator columns are
# Gender_Female and Gender_Male.
gender_onehot = ks.get_dummies(
    koalas_df['gender'],
    prefix='Gender',
    drop_first=True,
)

In [39]:
# (rows, columns) of the frame.
koalas_df.shape

(1000, 13)

In [56]:
# Peek at the generated indicator columns.
gender_onehot.head(2)

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,0,1


In [50]:
# Inspect the row index of the frame.
koalas_df.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
           dtype='int64', length=1000)

In [55]:
# ks.concat([gender_onehot, koalas_df], axis=1) is not supported as of now

In [58]:
# By default Koalas refuses operations that combine two different DataFrames
# (or Series), because they are implemented as a join, which can be expensive.
# Opt in explicitly, then copy each indicator column onto the main frame.
ks.set_option('compute.ops_on_diff_frames', True)
for dummy_col in ('Gender_Male', 'Gender_Female'):
    koalas_df[dummy_col] = gender_onehot[dummy_col]

In [59]:
# Confirm the lower-cased first names and the two new Gender_* columns.
koalas_df.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments,Gender_Male,Gender_Female
0,2016-02-03 13:25:29,1,amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0,0,1
1,2016-02-03 22:34:03,2,albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,,1,0
2,2016-02-03 06:39:31,3,evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,,0,1
3,2016-02-03 06:06:21,4,denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,,0,1
4,2016-02-03 10:35:31,5,carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,,0,0


In [60]:
# Convert the Koalas DataFrame to a Spark DataFrame.
spark_df = koalas_df.to_spark()

In [61]:
# Print the Spark schema to see how each column was typed after conversion.
spark_df.printSchema()

root
 |-- registration_dttm: timestamp (nullable = true)
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- cc: string (nullable = true)
 |-- country: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- title: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- Gender_Male: byte (nullable = true)
 |-- Gender_Female: byte (nullable = true)

