In [None]:
!pip install pyoptimus

In [None]:
!pip install py-dateinfer url_parser
!pip install -U pandas
!pip install python-libmagic

In [None]:
!mkdir /content/x
!git clone https://github.com/PacktPublishing/Data-Processing-with-Optimus.git /content/x
%cd /content

In [3]:
!mv /content/x/Chapter01/foo.txt .
!mv /content/x/Chapter01/path/to/file.csv .

In [16]:
import pandas as pd

# Read the sample CSV with plain pandas and display the resulting frame.
pd.read_csv('file.csv')

Unnamed: 0,name,function
0,optimus,Leader
1,bumblebee,Espionage
2,eject,Electronic surveillance
3,megatron,NEMESIS


In [7]:
from optimus import Optimus

In [11]:
# Create the Optimus instance used by every later cell.
# 'pandas' presumably selects the backend engine — a later cell shows that
# loaded data is held as a pandas DataFrame.
op = Optimus('pandas')

### Basics

In [15]:
# Load the sample CSV through Optimus and display it.
# NOTE(review): the line below is an alternative loader call the author left
# disabled — remove it if it is no longer needed.
# df = op.load.file('file.csv')
df = op.load.csv('file.csv')
df

name  1 (object),function  2 (object)
optimus,Leader
bumblebee,Espionage
eject,Electronic⋅surveillance
megatron,NEMESIS


In [17]:
# Rename the "function" column to "job", rebind df to the result and display it.
df = df.cols.rename("function", "job")
df

name  1 (object),job  2 (object)
optimus,Leader
bumblebee,Espionage
eject,Electronic⋅surveillance
megatron,NEMESIS


In [18]:
# Normalise casing: names go to upper case, jobs to lower case.
# Both column operations are chained and the result rebinds df.
df = (
    df.cols.upper("name")
    .cols.lower("job")
)
df

name  1 (object),job  2 (object)
OPTIMUS,leader
BUMBLEBEE,espionage
EJECT,electronic⋅surveillance
MEGATRON,nemesis


In [19]:
# Show df without the "name" column. The result is not assigned, so df itself
# keeps both columns (the later df.display() output confirms this).
df.cols.drop("name") 

job  1 (object)
leader
espionage
electronic⋅surveillance
nemesis


In [21]:
# Show df without the rows whose name equals "MEGATRON". As with the column
# drop, the result is only displayed — df is not rebound.
df.rows.drop(df["name"]=="MEGATRON") 

name  1 (object),job  2 (object)
OPTIMUS,leader
BUMBLEBEE,espionage
EJECT,electronic⋅surveillance


In [22]:
# Display df explicitly; all four rows and both columns are still present
# because the two drop calls above were never assigned back.
df.display()

name  1 (object),job  2 (object)
OPTIMUS,leader
BUMBLEBEE,espionage
EJECT,electronic⋅surveillance
MEGATRON,nemesis


In [25]:
# Capitalize "name" into a new column "cap_name" (via output_cols), leaving
# the original column in place. The result is displayed, not assigned.
df.cols.capitalize("name", output_cols="cap_name") 

name  1 (object),cap_name  2 (object),job  3 (object)
OPTIMUS,Optimus,leader
BUMBLEBEE,Bumblebee,espionage
EJECT,Eject,electronic⋅surveillance
MEGATRON,Megatron,nemesis


In [26]:
# Profile the dataframe: the returned dict holds per-column stats
# (match/missing/mismatch counts, inferred type, value frequencies) and a summary.
# NOTE(review): bins=10 presumably caps the number of frequency/histogram
# buckets — confirm against the Optimus docs.
df.profile(bins=10) 

{'columns': {'name': {'stats': {'match': 4,
    'missing': 0,
    'mismatch': 0,
    'inferred_type': {'data_type': 'str', 'categorical': True},
    'frequency': [{'value': 'BUMBLEBEE', 'count': 1},
     {'value': 'EJECT', 'count': 1},
     {'value': 'MEGATRON', 'count': 1},
     {'value': 'OPTIMUS', 'count': 1}],
    'count_uniques': 4},
   'data_type': 'object'},
  'job': {'stats': {'match': 4,
    'missing': 0,
    'mismatch': 0,
    'inferred_type': {'data_type': 'str', 'categorical': True},
    'frequency': [{'value': 'electronic surveillance', 'count': 1},
     {'value': 'espionage', 'count': 1},
     {'value': 'leader', 'count': 1},
     {'value': 'nemesis', 'count': 1}],
    'count_uniques': 4},
   'data_type': 'object'}},
 'name': 'file.csv',
 'file_name': ['file.csv'],
 'summary': {'cols_count': 2,
  'rows_count': 4,
  'data_types_list': ['object'],
  'total_count_data_types': 1,
  'missing_count': 0,
  'p_missing': 0.0}}

In [23]:
# Single-column frame mixing numeric strings, an int, a symbol and a null.
dfn = op.create.dataframe(
    {
        "A": ["1", 2, "4", "!", None],
    }
)
dfn

A  1 (object)
1
2
4
!
""


In [24]:
# Min and max of the mixed column. The displayed result is (1.0, 4.0): the
# digit strings are treated as numbers, while "!" and None do not contribute.
dfn.cols.min("A"), dfn.cols.max("A")

(1.0, 4.0)

In [27]:
# Two-column frame with mixed and missing values, built to see how the
# profiler reports them. This rebinds df, replacing the frame used so far.
df = op.create.dataframe(
    {
        "A": ["1", 2, "4", "!", None],
        "B": ["Optimus", "Bumblebee", "Eject", None, None],
    }
)

df.profile(bins=10)

{'columns': {'A': {'stats': {'match': 0,
    'missing': 1,
    'mismatch': 4,
    'inferred_type': {'data_type': 'int', 'categorical': True},
    'frequency': [{'value': 2, 'count': 1},
     {'value': '!', 'count': 1},
     {'value': '1', 'count': 1},
     {'value': '4', 'count': 1}],
    'count_uniques': 4},
   'data_type': 'object'},
  'B': {'stats': {'match': 3,
    'missing': 2,
    'mismatch': 0,
    'inferred_type': {'data_type': 'str', 'categorical': True},
    'frequency': [{'value': 'Bumblebee', 'count': 1},
     {'value': 'Eject', 'count': 1},
     {'value': 'Optimus', 'count': 1}],
    'count_uniques': 3},
   'data_type': 'object'}},
 'name': None,
 'file_name': None,
 'summary': {'cols_count': 2,
  'rows_count': 5,
  'data_types_list': ['object'],
  'total_count_data_types': 1,
  'missing_count': 0,
  'p_missing': 0.0}}

In [28]:
# Sample of the dataframe as a plain dict: column titles plus row values.
# "*" selects every column (both A and B appear in the output).
df.columns_sample("*") 

{'columns': [{'title': 'A'}, {'title': 'B'}],
 'value': [['1', 'Optimus'],
  [2, 'Bumblebee'],
  ['4', 'Eject'],
  ['!', None],
  [None, None]]}

In [29]:
# NOTE(review): execute() presumably forces evaluation of pending operations
# (relevant for lazy engines) — confirm against the Optimus docs. Here the
# output simply shows the dataframe.
df.execute()

A  1 (object),B  2 (object)
1,Optimus
2,Bumblebee
4,Eject
!,
,


In [30]:
# Load foo.txt as comma-separated data, then inspect the object Optimus wraps:
# df.data is a plain pandas DataFrame.
df = op.load.csv("foo.txt", sep=",") 
type(df.data)

pandas.core.frame.DataFrame

In [31]:
# Display the frame loaded from foo.txt.
df

name  1 (object),function  2 (object)
Optimus,⋅leader
Bumblebee,⋅espionage
eject,⋅ELECTRONIC⋅SURVEILLANCE


In [32]:
# Upper-case every column ("*"), then look at the transformation log Optimus
# keeps in the dataframe metadata. The output lists 'upper' and 'set' actions
# for both columns.
df = df.cols.upper("*") 
df.meta["transformations"] 

{'actions': [{'name': 'upper', 'columns': 'name'},
  {'name': 'upper', 'columns': 'function'},
  {'name': 'set', 'columns': 'name'},
  {'name': 'set', 'columns': 'function'}]}

### Read some rows from parquet

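pandas' `read_parquet` cannot read just the first few rows of a file; it always loads the whole file. Optimus exposes an `n_rows` argument on `load.parquet` for this case (note that with the pandas engine it still loads the whole dataset and then truncates it).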

In [35]:
import pandas as pd

# Build a snappy-compressed parquet file from the sample CSV using plain pandas.
# Note: df is rebound to a pandas DataFrame here, not an Optimus dataframe.
df = pd.read_csv('file.csv')
df.to_parquet('file.parquet.snappy', compression='snappy')
df

Unnamed: 0,name,function
0,optimus,Leader
1,bumblebee,Espionage
2,eject,Electronic surveillance
3,megatron,NEMESIS


In [None]:
# Demonstrates that pandas cannot read only the first rows of a parquet file:
# read_parquet has no `nrows` parameter and forwards unknown keyword arguments
# to the parquet engine, which rejects them. The error is caught and printed so
# the notebook still runs top to bottom.
# NOTE(review): TypeError is what the parquet engines are expected to raise for
# an unexpected keyword — confirm against the installed versions.
try:
    pd.read_parquet('file.parquet.snappy', nrows=2)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

In [39]:
# Load only the first two rows of the parquet file through Optimus.
# As the message emitted by this cell states, on the pandas engine the whole
# dataset is loaded first and then truncated.
df = op.load.parquet('file.parquet.snappy', n_rows=2)
df

'load.parquet' on Pandas loads the whole dataset and then truncates it


name  1 (object),function  2 (object)
optimus,Leader
bumblebee,Espionage


### Optimize memory

In [40]:
# 50,000-row frame of small integers, used as the baseline for the memory
# comparison; size() reports its footprint (presumably in bytes).
df = op.create.dataframe(
    {
        "a": [1000, 2000, 3000, 4000, 5000] * 10_000,
        "b": [1, 2, 3, 4, 5] * 10_000,
    }
)
df.size()

800128

In [None]:
# Rebind df to its optimized version and report the size again for comparison
# with the previous cell.
# NOTE(review): optimize() presumably shrinks column dtypes to cut memory —
# confirm against the Optimus docs.
df = df.optimize()
df.size() 