In [34]:
import numpy as np
import pandas as pd

In [35]:
# Only the last expression of a cell is displayed, so the version string on the
# first line is evaluated but not shown; the cell output is the module name.
pd.__version__
pd.__name__

'pandas'

In [36]:
# Small demo frame: three labelled rows and two numeric columns.
df = pd.DataFrame(
    [[1, 2], [4, 5], [7, 8]],
    index=['cobra', 'viper', 'sidewinder'],
    columns=['max_speed', 'shield'],
)
# Take row 'cobra' by label, then its second value explicitly by position.
# The earlier `df.loc['cobra'][1]` relied on an integer key being treated as a
# position on a label-indexed Series, which pandas has deprecated.
df.loc['cobra'].iloc[1]

2

In [37]:
# Build a DataFrame from a dict of columns (keys become column names) and
# attach explicit row labels through `index`.
pd.DataFrame({
              'countries': ['Russia', 'United Kingdom', 'USA'],
              'population': [145, 67, 340]              
              }, 
             index=['RU', 'UK', 'USA'])


Unnamed: 0,countries,population
RU,Russia,145
UK,United Kingdom,67
USA,USA,340


In [38]:
# Country table built from a list of rows, labelled by short country codes.
countries = pd.DataFrame(
    data=[
        ['Russia', 145],
        ['United Kingdom', 67],
        ['USA', 340],
    ],
    columns=['Country', 'population'],
    index=['Ru', 'UK', 'USA'],
)

# Row-wise mean. The frame mixes a string column with a numeric one, so the
# reduction must be limited to numeric columns explicitly: without
# `numeric_only=True` older pandas emitted a warning and pandas >= 2.0 raises
# a TypeError.
countries.mean(axis=1, numeric_only=True)

  countries.mean(axis=1)


Ru     145.0
UK      67.0
USA    340.0
dtype: float64

In [39]:
# Column access via attribute-style (dynamic member) lookup.
countries.population

Ru     145
UK      67
USA    340
Name: population, dtype: int64

In [40]:
# Load the Melbourne housing dataset (comma is read_csv's default separator).
melb_data = pd.read_csv('melb_data.csv')
# Ratio of the land sizes of the rows labelled 3521 and 1690.
melb_data.at[3521, 'Landsize'] / melb_data.at[1690, 'Landsize']

2.7857142857142856

In [41]:
# Print a structural summary of the frame: row count, per-column non-null
# counts and dtypes.
melb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          13580 non-null  int64  
 1   Suburb         13580 non-null  object 
 2   Address        13580 non-null  object 
 3   Rooms          13580 non-null  int64  
 4   Type           13580 non-null  object 
 5   Price          13580 non-null  float64
 6   Method         13580 non-null  object 
 7   SellerG        13580 non-null  object 
 8   Date           13580 non-null  object 
 9   Distance       13580 non-null  float64
 10  Postcode       13580 non-null  int64  
 11  Bedroom        13580 non-null  float64
 12  Bathroom       13580 non-null  float64
 13  Car            13580 non-null  float64
 14  Landsize       13580 non-null  float64
 15  BuildingArea   13580 non-null  float64
 16  YearBuilt      13580 non-null  float64
 17  CouncilArea    12211 non-null  object 
 18  Lattit

In [42]:
# Direct subscript returns the column as a pd.Series.
# NOTE(review): the astype() result below is neither assigned nor the cell's
# last expression, so it is discarded and melb_data['Car'] keeps its dtype.
melb_data['Car'].astype(np.int16)
# Summary statistics restricted to the Price and Car columns.
melb_data.describe().loc[:, ['Price', 'Car']]

Unnamed: 0,Price,Car
count,13580.0,13580.0
mean,1075684.0,1.611856
std,639310.7,0.960793
min,85000.0,0.0
25%,650000.0,1.0
50%,903000.0,2.0
75%,1330000.0,2.0
max,9000000.0,10.0


In [43]:
# Relative frequency of each region (normalize=True gives fractions, not counts).
melb_data['Regionname'].value_counts(normalize=True)

Southern Metropolitan         0.345729
Northern Metropolitan         0.286451
Western Metropolitan          0.217084
Eastern Metropolitan          0.108321
South-Eastern Metropolitan    0.033137
Eastern Victoria              0.003903
Northern Victoria             0.003019
Western Victoria              0.002356
Name: Regionname, dtype: float64

In [44]:
# Shape of the CouncilArea entries that are missing; its single element is the
# number of rows without a council area.
melb_data.loc[melb_data['CouncilArea'].isna(), 'CouncilArea'].shape

(1369,)

In [45]:
# Absolute number of rows per region, most frequent first.
melb_data.Regionname.value_counts()

Southern Metropolitan         4695
Northern Metropolitan         3890
Western Metropolitan          2948
Eastern Metropolitan          1471
South-Eastern Metropolitan     450
Eastern Victoria                53
Northern Victoria               41
Western Victoria                32
Name: Regionname, dtype: int64

In [46]:
# Some distinct addresses in the table share the same geographic coordinates:
# compare the number of unique addresses with the number of unique coordinates.
melb_data['Address'].nunique() - melb_data['Coordinates'].nunique()

281

In [47]:
# Selecting with a list of column names keeps a one-column DataFrame, so this
# counts rows per suburb through DataFrame.value_counts.
melb_data.loc[:, ['Suburb']].value_counts()

Suburb            
Reservoir             359
Richmond              260
Bentleigh East        249
Preston               239
Brunswick             222
                     ... 
Attwood                 1
Beaconsfield Upper      1
Bacchus Marsh           1
Sandhurst               1
Officer                 1
Length: 314, dtype: int64

In [48]:
# What percentage of all properties are townhouses (property type "t")?
# normalize=True returns fractions; multiply by 100 to read them as percent.
melb_data.Type.value_counts(normalize=True)

h    0.695803
u    0.222165
t    0.082032
Name: Type, dtype: float64

In [49]:
### Filtering

In [50]:
# Boolean mask marking properties sold for more than 2 million.
mask = melb_data['Price'].gt(2_000_000)
# Addresses of the rows selected by the mask.
melb_data.loc[mask, 'Address']

80       112 Beaconsfield Pde
85          104 Richardson St
88             29 Faussett St
92                2 Dundas Pl
93               23 Finlay St
                 ...         
13521            44 Garton St
13523          69 Greville St
13553            20 Albert Cr
13555                3 Oak St
13578            96 Verdon St
Name: Address, Length: 1007, dtype: object

In [51]:
# Compound filter: a row is kept only when both conditions hold, so the two
# boolean masks are combined element-wise with `&`.
melb_data[
    melb_data['Price'].gt(2_000_000)
    & melb_data['BuildingArea'].gt(100)
]

Unnamed: 0,index,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Coordinates
80,80,Albert Park,112 Beaconsfield Pde,3,h,2850000.0,PI,Buxton,4/03/2017,3.3,...,0.0,211.0,198.0,1890.0,Port Phillip,-37.84810,144.94990,Southern Metropolitan,3280.0,"-37.8481, 144.9499"
85,85,Albert Park,104 Richardson St,4,h,2300000.0,S,Marshall,7/05/2016,3.3,...,1.0,153.0,180.0,1880.0,Port Phillip,-37.84470,144.95230,Southern Metropolitan,3280.0,"-37.8447, 144.9523"
88,88,Albert Park,29 Faussett St,2,h,2120000.0,S,Greg,10/09/2016,3.3,...,1.0,199.0,107.0,1900.0,Port Phillip,-37.84220,144.95540,Southern Metropolitan,3280.0,"-37.8422, 144.9554"
92,92,Albert Park,2 Dundas Pl,3,h,2615000.0,S,Cayzer,10/12/2016,3.3,...,1.0,177.0,181.0,1880.0,Port Phillip,-37.84150,144.95850,Southern Metropolitan,3280.0,"-37.8415, 144.9585"
93,93,Albert Park,23 Finlay St,5,h,2100000.0,S,Greg,10/12/2016,3.3,...,1.0,237.0,126.0,1970.0,Port Phillip,-37.84360,144.95570,Southern Metropolitan,3280.0,"-37.8436, 144.9557"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13518,13518,Parkdale,63 The Corso,4,h,2475000.0,PI,Buxton,26/08/2017,21.5,...,3.0,665.0,450.0,2016.0,,-37.99508,145.07544,South-Eastern Metropolitan,5087.0,"-37.99508, 145.07544"
13523,13523,Prahran,69 Greville St,4,h,2668000.0,S,Biggin,26/08/2017,4.6,...,2.0,383.0,126.0,1970.0,,-37.84879,144.98882,Southern Metropolitan,7717.0,"-37.84879, 144.98882"
13553,13553,Surrey Hills,20 Albert Cr,4,h,2720000.0,S,Kay,26/08/2017,10.2,...,2.0,1005.0,126.0,1920.0,,-37.82421,145.10352,Southern Metropolitan,5457.0,"-37.82421, 145.10352"
13555,13555,Surrey Hills,3 Oak St,4,h,3100000.0,VB,Marshall,26/08/2017,10.2,...,3.0,832.0,126.0,1970.0,,-37.83564,145.10919,Southern Metropolitan,5457.0,"-37.83564, 145.10919"


In [52]:
# Largest number of rooms among townhouses (Type == 't').
print(melb_data.loc[melb_data['Type'] == 't', 'Rooms'].max())

# Quartile check: take the median of the prices lying above the overall median
# and compare it (exact equality) with the 0.75 quantile of all prices.
median = melb_data['Price'].median()
q3 = melb_data.loc[melb_data['Price'] > median, 'Price'].median()

melb_data['Price'].quantile(q=0.75) == q3

5


True

In [53]:
# NOTE: only the last expression of this cell is displayed; the intermediate
# results below are evaluated but not shown.

# Number of properties whose Bathroom value is 0.
has_no_bathroom = melb_data['Bathroom'].eq(0).value_counts()[np.bool_(True)]

# How many properties in melb_data were sold by the real-estate agency Nelson
# for more than 3 million?
mask = melb_data['SellerG'].eq('Nelson') & melb_data['Price'].gt(3_000_000)
melb_data[mask].shape[0]

# What is the minimum price of a plot without a building (building area equal
# to 0) in the table?
melb_data.loc[melb_data['BuildingArea'] == 0, 'Price'].min()

# What is the mean price of properties in melb_data that cost less than one
# million and either have more than five rooms or were built after 2015?
less_than_mill = melb_data.loc[
    (melb_data['Price'] < 1_000_000)
    & ((melb_data['Rooms'] > 5) | (melb_data['YearBuilt'] > 2015)),
    'Price',
].mean()
less_than_mill

# In which Melbourne region are villas and cottages (building type "h") priced
# below three million sold most often?
type_mask = melb_data['Type'].eq('h') & melb_data['Price'].lt(3_000_000)
melb_data.loc[type_mask, 'Regionname'].value_counts()

Northern Metropolitan         2737
Southern Metropolitan         2520
Western Metropolitan          2286
Eastern Metropolitan          1167
South-Eastern Metropolitan     387
Eastern Victoria                50
Northern Victoria               41
Western Victoria                32
Name: Regionname, dtype: int64

### Final test

In [54]:
# Load the student exam results (comma-separated) and preview the first rows.
students = pd.read_csv('students_performance.csv')
students.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [55]:
# Total number of rows in the table.
students_count = len(students)
# Writing score of the row labelled 155.
students.at[155, 'writing score']

88

In [56]:
# How many missing values are there in the table in total?
# info() lists the non-null count of every column next to the row count.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [57]:
# Mean math score over all students.
students['math score'].mean()

66.089

In [58]:
# Distinct race/ethnicity groups, ordered from most to least frequent
# (the index of value_counts follows its descending count order).
students['race/ethnicity'].value_counts().index

Index(['group C', 'group D', 'group B', 'group E', 'group A'], dtype='object')

In [59]:
# What is the mean reading score of students who completed the test
# preparation course?
mask = students['test preparation course'].eq('completed')
students.loc[mask, 'reading score'].mean()

73.89385474860335

In [60]:
# Number of students whose math score is exactly 0. The value_counts index
# holds the integer scores themselves, so 0 is looked up as a label, not as a
# position; `.loc` makes that explicit.
students['math score'].value_counts().loc[0]

1

In [61]:
# Mean math score for students on free/reduced lunch, then for standard lunch.
print(
    students.loc[students['lunch'] == 'free/reduced', 'math score'].mean(),
    students.loc[students['lunch'] == 'standard', 'math score'].mean(),
)

58.92112676056338 70.03410852713178


In [62]:
# What share of students have parents whose education level is a bachelor's
# degree?
# The mean of a boolean mask is the fraction of True values. Unlike
# `mask.value_counts()[True]`, it yields 0.0 instead of raising KeyError when
# no row matches. The result is a fraction; multiply by 100 for percent.
mask = students['parental level of education'] == "bachelor's degree"
mask.mean()

0.118

In [63]:
# By how much does the median writing score of students in race group A differ
# from the mean writing score of students in race group C?
(
    students.loc[students['race/ethnicity'] == 'group A', 'writing score'].median()
    - students.loc[students['race/ethnicity'] == 'group C', 'writing score'].mean()
)

-5.827586206896555

In [None]:
# NOTE(review): unfinished draft cell. DataFrame.where requires a `cond`
# argument, so this call raises TypeError as written; supply a condition or
# remove the cell before running the notebook top to bottom.
students.where()