# Pandas

There are two core objects in pandas: the **DataFrame** and the **Series**. A DataFrame is a table. A Series is a sequence of data values. A Series is, in essence, a single column of a DataFrame.


In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series is a one-dimensional labelled array; without an explicit index the
# labels are the positions 0..n-1.
pd.Series(range(1, 6))

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# Build a small frame in memory (a real dataset would typically be loaded with
# pd.read_csv("file.csv") instead).
df = pd.DataFrame({'Yes': [50, 21, 15], 'No': [131, 2, 16]})

display(df.head())                # first five rows (the frame only has three)
print(df.shape)                   # (number of rows, number of columns)

display(df['Yes'])                # one column as a Series; df.Yes is the attribute shorthand

# Single element: address row label and column label in one step rather than
# the chained df['Yes'][0], which performs two separate lookups.
print(df.at[0, 'Yes'], "\n")

display(df.iloc[0])               # one row by position; iloc also takes [r, c] or [[r1, r2], c]


Unnamed: 0,Yes,No
0,50,131
1,21,2
2,15,16


(3, 2)


0    50
1    21
2    15
Name: Yes, dtype: int64

50 



Yes     50
No     131
Name: 0, dtype: int64

In [4]:
# Boolean-mask row selection: keep only the rows whose 'Yes' value exceeds 20.
df.loc[df['Yes'].gt(20)]

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [5]:
# 1) Empty frame: no rows, no columns.  How the empty index is printed depends
#    on the pandas version; the run recorded below shows
#    Index([], dtype='object') Index([], dtype='object').
df = pd.DataFrame()
print(df.index, df.columns)

# 2) A list of values plus explicit column names -> default integer row labels.
#    Recorded output: RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')
df = pd.DataFrame(data=[1, 3, 2], columns=['A'])
print(df.index, df.columns)

# 3) Assigning a column to an empty frame creates the row labels as well.
#    Recorded output: RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')
df = pd.DataFrame()
df['A'] = [1, 2, 3]
print(df.index, df.columns)

# 4) Dict of columns with custom row labels; the labels may mix types ('r0' and 1).
df = pd.DataFrame(
    {"Name": ["Jon", "Mia"], "Age": [43, 39]},
    index=['r0', 1],
)
df

Index([], dtype='object') Index([], dtype='object')
RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')
RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')


Unnamed: 0,Name,Age
r0,Jon,43
1,Mia,39


In [102]:
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
display(df)

# Shift column 'a' by -50.  Arithmetic on a column is vectorised, so no
# per-element lambda through .map() is needed (.map(func) remains the tool for
# transformations that have no vectorised form).  Bracket assignment is used
# instead of `df.a = ...` because it is the unambiguous way to write a column.
df['a'] = df['a'] - 50
display(df)

Unnamed: 0,a,b
0,1,3
1,2,4


Unnamed: 0,a,b
0,-49,3
1,-48,4


In [8]:
df = pd.DataFrame({'a': [1, 2], 'b': [-1, -2]})
display(df)

# Swap the two column labels in a single call: the mapping is applied to all
# labels at once, so 'a' -> 'b' and 'b' -> 'a' do not interfere.
df = df.rename({'a': 'b', 'b': 'a'}, axis='columns')
display(df)

# Scalar lookup by (row label, column label): the value now stored under 'a' in row 0.
df.at[0, 'a']

Unnamed: 0,a,b
0,1,-1
1,2,-2


Unnamed: 0,b,a
0,1,-1
1,2,-2


-1

In [50]:
df = pd.DataFrame({'a': [1, 2], 'b': [-1, -2]})

# Append one row: len(df.index) is the next unused integer label (2 here).
df.loc[len(df.index)] = [3, -3]


def strange(x):
    """Return the square root of the sum of the values in ``x``.

    The values are added with the built-in ``sum`` and the total is raised to
    the power 0.5.  For a negative total the recorded output of this cell
    shows a complex result (column 'b').
    """
    return sum(x)**(0.5)


# agg() labels each result row with the function's __name__, so overriding it
# sets the row label shown in the output ('strange fun').
strange.__name__ = 'strange fun'

# Built-in reductions are requested by name.  Passing the Python builtin `max`
# is deprecated in recent pandas releases in favour of the string 'max'; the
# row label and the values are the same.
df.agg(['sum', 'mean', 'std', 'max', strange])


Unnamed: 0,a,b
sum,6.0,-6.00000+0.00000j
mean,2.0,-2.00000+0.00000j
std,1.0,1.00000+0.00000j
max,3.0,-1.00000+0.00000j
strange fun,2.44949,0.00000+2.44949j


## Группы

In [48]:
df = pd.DataFrame({'s': [0, 0, 0, 1, 1, 2],
                   'i': [1, 2, 1, 1, 3, 2],
                   't': [0, 0, 1, 0, 1, 0]})

# Named aggregation: result_column=(source_column, function).
# 'min' is given by name; passing the Python builtin `min` is deprecated in
# recent pandas releases in favour of the string.
display(df.groupby('s').agg(s1=('i', set), mi=('i', 'min')))

# Distinct 'i' values per group, computed separately for t == 0 and t == 1.
gr0 = df[df['t'] == 0].groupby('s').agg(s1=('i', set))
display(gr0)
gr1 = df[df['t'] == 1].groupby('s').agg(s2=('i', set))
display(gr1)

# Put both results side by side; rows are aligned on the group label 's'.
display(pd.concat([gr0, gr1], axis=1))


def fun(x):
    """Return the values of ``x`` that are greater than 1, as a list."""
    return [i for i in x if i > 1]


# A user-defined function can be used in a named aggregation like any other.
display(df.groupby('s').agg(s1=('i', set), mi=('i', 'min'), f=('i', fun)))

Unnamed: 0_level_0,s1,mi
s,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"{1, 2}",1
1,"{1, 3}",1
2,{2},2


Unnamed: 0_level_0,s1
s,Unnamed: 1_level_1
0,"{1, 2}"
1,{1}
2,{2}


Unnamed: 0_level_0,s2
s,Unnamed: 1_level_1
0,{1}
1,{3}


Unnamed: 0_level_0,s1,s2
s,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"{1, 2}",{1}
1,{1},{3}
2,{2},


Unnamed: 0_level_0,s1,mi,f
s,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"{1, 2}",1,[2]
1,"{1, 3}",1,[3]
2,{2},2,[2]


In [51]:
# Walk the per-group sets.  Only column 's1' is read, so iterate over that
# column's (label, value) pairs.
for index, s1_values in gr0['s1'].items():
    print(index, s1_values)


0 {1, 2}
1 {1}
2 {2}


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Example frame: one categorical column ('team') and one numeric column ('points').
df = pd.DataFrame({'team':   ['A', 'A', 'B', 'B', 'B', 'B', 'C', 'C'],
                   'points': [25,  12,   15,  14,  19,  23,  25,  29]})
print(df)

# Fit the encoder on the 'team' column and densify the sparse result.
onehot = OneHotEncoder(handle_unknown='ignore')
encoded = onehot.fit_transform(df[['team']]).toarray()

# Take the column labels from the fitted encoder instead of hard-coding
# ['A', 'B', 'C'], so they always match the categories actually found, and
# reuse df's row index so the join below lines the rows up.
onehot_df = pd.DataFrame(encoded, columns=onehot.categories_[0], index=df.index)

final_df = df.join(onehot_df)  # add the one-hot columns next to the original ones
print(final_df)

In [121]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Iris as plain arrays: X holds the feature rows, y the class label of each row.
X, y = load_iris(return_X_y=True)
print(X[0])
print(y)

# Create the classifier with a fixed random_state and an explicit iteration
# cap, then fit it on the full data set.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X, y)

# Score on the same data that was used for fitting (not a held-out estimate).
clf.score(X, y)

[5.1 3.5 1.4 0.2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


0.9733333333333334

In [18]:
df = pd.DataFrame({
    'N': ['a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'c'],
    'V': [ 1,   2,   3,   4,   5,   6,   7,   8,   9 ],
})

group = df.groupby('N')

# Several aggregations of 'V' per group.  The reductions are requested by
# name: passing np.sum / np.mean / np.std is deprecated in recent pandas
# releases, and the string 'std' pins the sample standard deviation that the
# recorded output shows (0.707107 for the values 1 and 2).
# `len` and `set` are ordinary callables applied to each group's values.
display(group['V'].agg([len, 'sum', 'mean', 'std', set]))

# Further things to try:
#sets = group['V'].agg([set])
#for s in sets.set:
#    print(s)

#for n,t in group:
#    display(t)

Unnamed: 0_level_0,len,sum,mean,std,set
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,2,3,1.5,0.707107,"{1, 2}"
b,3,12,4.0,1.0,"{3, 4, 5}"
c,4,30,7.5,1.290994,"{8, 9, 6, 7}"


In [20]:
# Column labels only, zero rows.
df = pd.DataFrame(columns=['id1', 'id2', 'probs', 'tot'])

# Assigning to a row label that does not exist yet appends a row; the current
# row count is the next free integer label (0 for the empty frame).
df.loc[df.shape[0]] = [1, 2, 0.1, 10]
df

Unnamed: 0,id1,id2,probs,tot
0,1.0,2.0,0.1,10.0


In [26]:
# Timestamps stored as integer seconds since the Unix epoch.
df = pd.DataFrame({"date": [1659304800, 1659304904]})

# Replace the integer column with the corresponding datetime values.
df['date'] = pd.to_datetime(df['date'], unit='s')
df

Unnamed: 0,date
0,2022-07-31 22:00:00
1,2022-07-31 22:01:44


In [75]:
# Define the two frames here so the cell does not depend on variables created
# elsewhere in the notebook (it must survive Restart & Run All).
df_ref = pd.DataFrame({'v': [10, 20, 30, 40], 's': ['a', 'b', 'c', 'd']},
                      index=pd.Index([0, 1, 3, 4], name='i'))
df = pd.DataFrame({'i': [0, 0, 1, 3, 2]}).set_index('i', drop=False)

ref_labels = set(df_ref.index)   # row labels of the reference table
own_labels = set(df.index)       # row labels of the working frame (duplicates collapse)
print(ref_labels)
print(own_labels)

# Labels present in the reference table but missing from df.
ref_labels - own_labels

{0, 1, 3, 4}
{0, 1, 2, 3}


{4}

In [83]:
# Look up reference values for each row of df by aligning both frames on
# their row index and concatenating column-wise.

# Reference table: columns 'v' and 's', keyed by 'i'.  'i' is copied into the
# row index and then dropped as a column.
df_ref = pd.DataFrame({'i':[0,  1, 3, 4], 
                       'v':[10,20,30,40],
                       's':['a','b','c','d']})
df_ref.index = df_ref.i
df_ref = df_ref.drop(columns=["i"])

# Working frame: column 'i' (label 0 appears twice) is also used as the row
# index; the column itself is kept.
df = pd.DataFrame({"i":[0,0,1,3,2] })
df.index = df.i

display(df_ref)
display(df)

# Step 1: append an empty row for every label that df_ref has but df lacks
# (label 4 for this data), so df's index contains every reference label.
df = pd.concat( [df, pd.DataFrame(index=list(set(df_ref.index) - set(df.index)) ) ])
# Step 2: column-wise concat aligns rows on the index.  In the recorded output
# label 2 has NaN for 'v'/'s' (no reference row) and label 4 has NaN for 'i'.
# NOTE(review): this depends on how pandas aligns an index that contains
# duplicate labels -- confirm on the pandas version in use; a left merge on
# 'i' is an alternative for the pure lookup.
df = pd.concat([df, df_ref], axis=1)

# Optional clean-up, left disabled: discard the index labels / the index name.
#df = df.reset_index(drop=True) 
#df.index.name = None

df

Unnamed: 0_level_0,v,s
i,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,a
1,20,b
3,30,c
4,40,d


Unnamed: 0_level_0,i
i,Unnamed: 1_level_1
0,0
0,0
1,1
3,3
2,2


Unnamed: 0,i,v,s
0,0.0,10.0,a
0,0.0,10.0,a
1,1.0,20.0,b
3,3.0,30.0,c
2,2.0,,
4,,40.0,d


In [64]:
# Lookup table: one row per key 'i'.
df_ref = pd.DataFrame({
    'i': [0, 1, 3, 4],
    'v': [10, 20, 30, 40],
    's': ['a', 'b', 'c', 'd'],
})

# Keys to look up: 0 appears twice and 2 has no entry in df_ref.
df = pd.DataFrame({"i": [0, 0, 1, 3, 2]})

# Left join on the shared key column: every row of df is kept and keys without
# a match get NaN in the looked-up columns.  If the key columns had different
# names, left_on=/right_on= would be used instead of on=.
pd.merge(df, df_ref, how="left", on='i')

Unnamed: 0,i,v,s
0,0,10.0,a
1,0,10.0,a
2,1,20.0,b
3,3,30.0,c
4,2,,


In [120]:
from collections import Counter
# Two multisets, given as key -> count.
c1 = Counter({1: 1, 2: 2})
c2 = Counter({1: 3, 4: 1})

# Adding Counters sums the counts key by key; keys found in only one operand
# keep their count.
c1 = c1 + c2
c1

Counter({1: 4, 2: 2, 4: 1})

In [42]:
# Column values may be a scalar (repeated for every row), a list, or a
# Categorical; np.nan marks the missing age.
df = pd.DataFrame(
    {
        "Num": 1,
        "Name": ["Jon", "Sam", "Mia"],
        "Age": [43, np.nan, 56],
        "Cat": pd.Categorical(["b", "b", "a"]),
    },
    index=['m', 'f', 'u'],
)

# Order the rows by the categorical column.
df = df.sort_values(by=["Cat"])

# Label-based selection: rows 'm' and 'u', columns 'Name' and 'Cat', returned
# in the order requested.
df.loc[['m', 'u'], ['Name', 'Cat']]

Unnamed: 0,Name,Cat
m,Jon,b
u,Mia,a


## Группы

In [61]:
# Two key columns ('A', 'B') and two numeric columns ('X', 'Y').
df = pd.DataFrame({
    'A': ['a', 'b', 'a', 'b', 'b'],
    'B': ['0', '1', '0', '1', '0'],
    'X': [1, 2, 3, 4, 5],
    'Y': [0, 1, 1, 0, 0],
})

# Total of 'X' within each 'A' group.
df.groupby("A")["X"].sum()

A
a     4
b    11
Name: X, dtype: int64