In [1]:
import pandas as pd

## Pandas

Pandas – это библиотека для обработки и анализа данных. В pandas есть
две основные структуры данных: Series и DataFrame. Series – это одномерный
массив с индексами (метками); по сути, это ассоциативный массив, похожий на
словарь. DataFrame – это таблица, каждый столбец которой является объектом
Series.

### Series

In [2]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
S = pd.Series(["Hello", "World"])
print(S)
print(15 * "-")

# Show the index object itself, then its labels as an ordinary list.
print(S.index)
print(list(S.index))

# Show the stored values without their labels.
print(S.values)

0    Hello
1    World
dtype: object
---------------
RangeIndex(start=0, stop=2, step=1)
[0, 1]
['Hello' 'World']


#### Индексы можно задавать

In [3]:
# Reuse the values of S, this time under explicit string labels.
new_s = pd.Series(data=S.values, index=["s1", "s2"])
# Assigning to a label that does not exist yet appends a new element.
new_s.loc["3"] = 3
print(new_s)

s1    Hello
s2    World
3         3
dtype: object


In [4]:
# Bulk assignment: several labels receive their values in one statement.

new_s.loc[['s1', 's2']] = [1, 2]
print(new_s)

s1    1
s2    2
3     3
dtype: object


#### Вывод элементов по условию

In [5]:
# A boolean mask keeps only the elements for which the condition holds.
print(new_s.loc[new_s == 1])
print()
print(new_s.loc[new_s > 1])


s1    1
dtype: object

s2    2
3     3
dtype: object


### DataFrame

### Чтение и создание

In [6]:
pd.read_csv(
    'datasets/test.txt',
    sep=",",      # field separator, spelled out explicitly
    dtype=float,  # parse every column as float
)

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0
1,3.0,6.0,8.0,16.0
2,2.0,5.0,3.0,9.0


In [7]:
# Keys of the dict become column names; both columns hold 0..4 cast to float.
pd.DataFrame({name: range(5) for name in ('A', 'B')}, dtype=float)

Unnamed: 0,A,B
0,0.0,0.0
1,1.0,1.0
2,2.0,2.0
3,3.0,3.0
4,4.0,4.0


In [8]:
# Load the world-population dataset from a CSV file into a DataFrame.
df = pd.read_csv('datasets/World_Population.csv')
# A bare last expression makes the notebook render the frame.
df

Unnamed: 0,Country,Population 2024,Population 2023,Area (km2),Density (/km2),Growth Rate,World %,World Rank
0,India,1441719852,1428627663,3M,485.00,0.0092,0.1801,1
1,China,1425178782,1425671352,9.4M,151.00,-0.0003,0.1780,2
2,United States,341814420,339996563,9.1M,37.00,0.0053,0.0427,3
3,Indonesia,279798049,277534122,1.9M,149.00,0.0082,0.0350,4
4,Pakistan,245209815,240485658,770.9K,318.00,0.0196,0.0306,5
...,...,...,...,...,...,...,...,...
229,Montserrat,4372,4386,102,43.00,-0.0032,,230
230,Falkland Islands,3803,3791,12.2K,0.31,0.0032,,231
231,Niue,1935,1935,261,7.00,0.0000,,232
232,Tokelau,1915,1893,10,192.00,0.0116,,233


### Описание

In [9]:
# Column labels of the frame.
print(df.columns)
# Row labels (the index) of the frame.
print(df.index)
# Cell values of the frame as an array, without labels.
print(df.values)

Index(['Country', 'Population 2024', 'Population 2023', 'Area (km2)',
       'Density (/km2)', 'Growth Rate', 'World %', 'World Rank'],
      dtype='object')
RangeIndex(start=0, stop=234, step=1)
[['India' 1441719852 1428627663 ... 0.0092 0.1801 1]
 ['China' 1425178782 1425671352 ... -0.0003 0.178 2]
 ['United States' 341814420 339996563 ... 0.0053 0.0427 3]
 ...
 ['Niue' 1935 1935 ... 0.0 nan 232]
 ['Tokelau' 1915 1893 ... 0.0116 nan 233]
 ['Vatican City' 526 518 ... 0.0154 nan 234]]


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
print(df.describe())

       Population 2024  Population 2023  Density (/km2)  Growth Rate  \
count     2.340000e+02     2.340000e+02      234.000000   234.000000   
mean      3.468862e+07     3.437442e+07      453.788248     0.009200   
std       1.380750e+08     1.373864e+08     1990.163274     0.011371   
min       5.260000e+02     5.180000e+02        0.140000    -0.030900   
25%       4.264565e+05     4.225982e+05       39.500000     0.001925   
50%       5.626359e+06     5.643895e+06       98.500000     0.007950   
75%       2.392272e+07     2.324537e+07      248.250000     0.015675   
max       1.441720e+09     1.428628e+09    21674.000000     0.048300   

          World %  World Rank  
count  228.000000  234.000000  
mean     0.004446  117.500000  
std      0.017459   67.694165  
min      0.000000    1.000000  
25%      0.000100   59.250000  
50%      0.000750  117.500000  
75%      0.003000  175.750000  
max      0.180100  234.000000  


### Ввод и вывод

In [11]:
# Four columns named '1'..'4', each holding the integers 0..5.
new_df = pd.DataFrame({str(col): list(range(6)) for col in range(1, 5)})
new_df

Unnamed: 0,1,2,3,4
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4
5,5,5,5,5


In [12]:
# Assigning to a new column name adds the column: element-wise sum of '1' and '2'.
new_df['out'] = new_df['1'].add(new_df['2'])
new_df

Unnamed: 0,1,2,3,4,out
0,0,0,0,0,0
1,1,1,1,1,2
2,2,2,2,2,4
3,3,3,3,3,6
4,4,4,4,4,8
5,5,5,5,5,10


In [13]:
# Keep the rows where column '3' is above 2 AND column '1' is below 5.
new_df[new_df['3'].gt(2) & new_df['1'].lt(5)]

Unnamed: 0,1,2,3,4,out
3,3,3,3,3,6
4,4,4,4,4,8


#### Индексация по позиции

In [14]:
# Positional selection: rows at positions 0-1, columns at positions 2-4.
new_df.iloc[:2, 2:5]

Unnamed: 0,3,4,out
0,0,0,0
1,1,1,2


#### Индексация по имени

In [15]:
# Label selection: label slices include both ends, so this picks
# rows 0..1 and columns '3' through 'out'.
new_df.loc[:1, '3':'out']

Unnamed: 0,3,4,out
0,0,0,0
1,1,1,2


#### Фильтрация

In [16]:
# Keep the rows where 'out' exceeds 8 OR column '2' is below 2.
new_df.loc[new_df['out'].gt(8) | new_df['2'].lt(2)]

Unnamed: 0,1,2,3,4,out
0,0,0,0,0,0
1,1,1,1,1,2
5,5,5,5,5,10


### Apply

Метод apply() позволяет применить заданную функцию к объекту Series
(поэлементно) или к DataFrame (к каждой строке либо к каждому столбцу).

In [17]:
# Four columns named '1'..'4', each holding the integers 1..3.
df = pd.DataFrame({str(col): list(range(1, 4)) for col in range(1, 5)})
df

Unnamed: 0,1,2,3,4
0,1,1,1,1
1,2,2,2,2
2,3,3,3,3


In [18]:
def function(x):
    """Return a copy of row ``x`` extended with a ``'new'`` entry.

    ``'new'`` is the square of ``x['1']`` when that value is greater than 2;
    otherwise it equals ``x['1']`` unchanged.

    Args:
        x: A mapping-like row that contains the label ``'1'``, e.g. the
            Series handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        A new object holding every entry of ``x`` plus ``'new'``; ``x``
        itself is left untouched.
    """
    # Work on a copy: pandas does not support functions passed to apply()
    # that mutate the object they receive.
    row = x.copy()
    value = row['1']
    row['new'] = value ** 2 if value > 2 else value
    return row

axis=1 – функция применяется к каждой строке, axis=0 (значение по умолчанию) – к каждому столбцу.

In [19]:
# axis=1: `function` receives one row at a time; the result gains a 'new' column.
df.apply(function, axis=1)

Unnamed: 0,1,2,3,4,new
0,1,1,1,1,1
1,2,2,2,2,2
2,3,3,3,3,9


In [20]:
# Rebuild the 3x4 demo frame: columns '1'..'4', each holding 1, 2, 3.
df = pd.DataFrame({str(label): list(range(1, 4)) for label in range(1, 5)})
df

Unnamed: 0,1,2,3,4
0,1,1,1,1
1,2,2,2,2
2,3,3,3,3


In [21]:
# axis=0: the lambda receives one column at a time and yields 1 when the
# value at row label 1 equals 2, else 0; the results overwrite row 1.
df.loc[1] = df.apply(lambda column: int(column.loc[1] == 2), axis=0)
df

Unnamed: 0,1,2,3,4
0,1,1,1,1
1,1,1,1,1
2,3,3,3,3
