In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# A DataFrame can be created from a dictionary or from a list of lists.

In [3]:
# Building a DataFrame from a dict: every key becomes a column name and the
# list stored under that key supplies the column's values, one per row.
# Note that the ages are given as strings here, not as numbers.
d = dict(
    name=['John', 'Marry'],
    age=['28', '30'],
)

pd.DataFrame(data=d)

Unnamed: 0,name,age
0,John,28
1,Marry,30


In [4]:
# Building a DataFrame from a list of lists: each inner list becomes one row
# and the columns get default integer labels. The second row is one item
# shorter than the first, so its last cell is left empty (NaN).
a = [
    [1, 2, 3, 4],
    list('abc'),
]
pd.DataFrame(data=a)

Unnamed: 0,0,1,2,3
0,1,2,3,4.0
1,a,b,c,


In [5]:
# Two separate frames built from the same dict, plus one frame built from the
# list of lists.
d1, d2 = pd.DataFrame(data=d), pd.DataFrame(data=d)
l1 = pd.DataFrame(data=a)

In [6]:
# Element-wise comparison of two DataFrames. Both frames must be identically
# labeled (same index and same columns); the result is a frame of booleans.
d1 == d2

Unnamed: 0,name,age
0,True,True
1,True,True


In [7]:
# When a single True/False answer is enough, use equals() instead of the
# element-wise comparison.
d1.equals(d2)

True

In [8]:
# Load the dataset from data.txt; the file is decoded as Windows-1251.
df = pd.read_csv(filepath_or_buffer='data.txt', encoding='windows-1251')
# sample() returns randomly chosen rows -- three of them here. The draw differs
# on every run; pass random_state=<int> if a repeatable sample is needed.
df.sample(n=3)


Unnamed: 0,DR_Dat,DR_Tim,DR_NChk,DR_NDoc,DR_Apt,DR_Kkm,DR_TDoc,DR_TPay,DR_CDrugs,DR_NDrugs,...,DR_Prod,DR_Kol,DR_CZak,DR_CRoz,DR_SDisc,DR_CDisc,DR_BCDisc,DR_TabEmpl,DR_VZak,DR_Pos
2792,2022-08-12,10:12:23,5208,6003642,6,22568,Розничная реализация,18,260081,ТРЕКРЕЗАН 200МГ. №20 ТАБ. /ГРОТЕКС/ФАРМПРОЕКТ/...,...,ФАРМПРОЕКТ ЗАО,0.097473,616.78,833.0,8.09,35.0,200000000000.0,37,1,6.0
2168,2022-08-11,16:17:59,4753,11007004,11,23925,Розничная реализация,15,347235,"ХЛОРГЕКСИДИН 0,05% 100МЛ. №1 Р-Р Д/МЕСТ. И НАР...",...,ТУЛЬСКАЯ ФАРМ. ФАБРИКА,1.0,11.55,14.0,0.0,,,30,1,1.0
2150,2022-08-11,09:07:15,4642,11007004,11,23925,Розничная реализация,18,13414,"ИНТЕКС БИНТ ТРУБЧ. ЛАТЕКСНО-ПОЛИЭФ. 15Х3,0СМ. ...",...,ИНТЕРТЕКСТИЛЬ,1.0,11.9,20.0,0.0,,,30,1,2.0


In [9]:
# Example input for joining DataFrames, which is useful e.g. for combining the
# results of an ABC analysis with those of an XYZ analysis.
# The key column is called 'name' in one frame and 'name_xyz' in the other,
# and the two frames only partly share the same items.
df_abc = pd.DataFrame(data=dict(
    name=['orange', 'banana', 'apple'],
    abc=['A', 'A', 'B'],
))

df_xyz = pd.DataFrame(data=dict(
    name_xyz=['lemon', 'banana', 'apple'],
    xyz=['X', 'X', 'Z'],
))

In [None]:
# "Left" and "right" refer to the order of the frames in the merge call:
# left_on names the key column of the first frame, right_on the key column of
# the second one.
# By default merge performs an inner join (only keys present in both frames);
# the `how` argument changes that. 'outer' keeps every key from either side,
# and 'left' / 'right' are accepted as well.
abc_xyz = pd.merge(
    left=df_abc,
    right=df_xyz,
    how='outer',
    left_on='name',
    right_on='name_xyz',
)
abc_xyz

Unnamed: 0,name,abc,name_xyz,xyz
0,apple,B,apple,Z
1,banana,A,banana,X
2,,,lemon,X
3,orange,A,,


In [11]:
# Tidy up the merged frame.
# Step 1: wherever 'name' is missing, take the value from 'name_xyz' instead.
name_missing = abc_xyz['name'].isna()
abc_xyz['name'] = np.where(name_missing, abc_xyz['name_xyz'], abc_xyz['name'])
# Step 2: 'name_xyz' is now redundant, so drop it and show the remaining gaps as '-'.
# NOTE: abc_xyz is rebound without 'name_xyz', so this cell cannot be re-run on
# its own -- re-run the merge cell first.
abc_xyz = abc_xyz.drop(columns='name_xyz').fillna('-')
abc_xyz

Unnamed: 0,name,abc,xyz
0,apple,B,Z
1,banana,A,X
2,lemon,-,X
3,orange,A,-


In [None]:
# DataFrame.filter selects labels by name; the axis argument chooses whether
# column labels or row labels are matched.
# Only the last expression of a cell is displayed, so the regex result below is
# computed but not shown.
df.filter(regex=r'ad\.', axis='columns')  # keep columns whose name matches the regex
df.filter(like='ug', axis='columns')  # keep columns whose name contains 'ug'

Unnamed: 0,DR_CDrugs,DR_NDrugs
0,45399,ЦИПРОЛЕТ 3МГ/МЛ. 5МЛ. №1 ГЛ.КАПЛИ ФЛ./КАП. /Д-...
1,261519,ПЕРЕКИСЬ ВОДОРОДА 3% 100МЛ. №40 Р-Р ФЛ.
2,460864,СОФЬЯ ГЕЛЬ Д/НОГ ВЕНОТОНИЗ. ТРОКСЕРУТИН ФОРТЕ ...
3,172823,СОФЬЯ ГХК КРЕМ Д/ТЕЛА ХОНДРОИТИН+ГЛЮКОЗАМИН 12...
4,79056,ГАЛВУС 50МГ. №28 ТАБ. /НОВАРТИС/
...,...,...
4457,463100,КЕТОРОЛ ЭКСПРЕСС 10МГ. №20 ТАБ. ДИСПЕРГ. /Д-Р ...
4458,112158,АНТИПОЛИЦАЙ ВАЙТ №6 ТАБ.
4459,260990,СИЛДЕНАФИЛ-СЗ 50МГ. №10 ТАБ. П/П/О /СЕВЕРНАЯ З...
4460,41512,ТОБРАЗОН 5МЛ. ГЛ.КАПЛИ ФЛ.


In [18]:
# First step of splitting one DataFrame into several: collect the distinct
# values of the 'DR_Apt' column (the drug store identifiers used for the split).
# unique() yields an array; tolist() turns it into a plain Python list.
drg_stores = pd.unique(df['DR_Apt']).tolist()

In [None]:
# Split the big frame by drug store and export each part to its own workbook.
# to_excel does not create missing directories, so make sure the target
# folder exists before the first write (no-op when it is already there).
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)

for store in drg_stores:
    # All rows that belong to the current store.
    small_df = df[df['DR_Apt'] == store]
    # index=False keeps the row index out of the Excel file.
    small_df.to_excel(output_dir / f'{store}.xlsx', sheet_name=f'{store}', index=False)

In [None]:
# The opposite direction: joining many small frames back into a big one.
# A single workbook can be read like this; listing every file by hand does not
# scale, which is what glob is used for further down.
df_1 = pd.read_excel(io='output/2.xlsx')

In [24]:
import glob

In [None]:
# List every path that matches the wildcard pattern (all .xlsx files in output/).
glob.glob(pathname='output/*.xlsx')

['output/15.xlsx',
 'output/18.xlsx',
 'output/13.xlsx',
 'output/2.xlsx',
 'output/11.xlsx',
 'output/7.xlsx',
 'output/17.xlsx',
 'output/6.xlsx']

In [26]:
# Collect every exported workbook. glob gives no ordering guarantee, so the
# paths are sorted to make the row order of the combined frame reproducible.
files = sorted(glob.glob('output/*.xlsx'))

# Read each workbook into its own DataFrame, in the same order as `files`.
dfs_list = [pd.read_excel(path) for path in files]

In [None]:
# Sanity check: there should be exactly one DataFrame per file that was found.
len(dfs_list)

8

In [30]:
# concat stacks the frames on top of each other. Every frame read from a file
# carries its own row labels, so ignore_index=True discards them and numbers
# the rows of the combined frame afresh.
big_df = pd.concat(objs=dfs_list, axis='index', ignore_index=True)

In [33]:
# Sanity check that the split / re-join round trip kept the frame's dimensions.
# shape compares the row count and the column count separately, whereas size
# (rows * columns) could match even for frames with different dimensions.
# This checks dimensions only, not the cell values.
big_df.shape == df.shape

True