# Pandas Serisi Oluşturmak

In [2]:
import pandas as pd

In [3]:
pd.Series([10,88,3,4,5])

0    10
1    88
2     3
3     4
4     5
dtype: int64

In [4]:
pd.Series([99,22,332,94,5], index = [1,3,5,7,9])

1     99
3     22
5    332
7     94
9      5
dtype: int64

In [5]:
seri = pd.Series([99,22,332,94,5], index = ['a', 'b','c','d','e'])

In [6]:
seri['a':'c']

a     99
b     22
c    332
dtype: int64

In [7]:
#sozluk uzerinden liste olusturmak

In [8]:
sozluk = {'reg': 10,
          'log': 11,
          'cart': 12}

In [9]:
seri = pd.Series(sozluk)

In [10]:
seri

reg     10
log     11
cart    12
dtype: int64

In [11]:
#iki seriyi birlestirerek seri olusturma

In [12]:
pd.concat([seri,seri])

reg     10
log     11
cart    12
reg     10
log     11
cart    12
dtype: int64

In [13]:
pd.concat([seri,seri])

reg     10
log     11
cart    12
reg     10
log     11
cart    12
dtype: int64

# Eleman Islemleri

In [15]:
import numpy as np
a = np.array([1,2,33,444,75])
seri = pd.Series(a)
seri

0      1
1      2
2     33
3    444
4     75
dtype: int64

In [16]:
seri[0]

1

In [17]:
seri[0:3]

0     1
1     2
2    33
dtype: int64

In [18]:
seri = pd.Series([121,200,150,99],
                 index = ['reg', 'loj', 'cart', 'rf'])

In [19]:
seri

reg     121
loj     200
cart    150
rf       99
dtype: int64

In [20]:
list(seri.items())

[('reg', 121), ('loj', 200), ('cart', 150), ('rf', 99)]

In [21]:
seri.values

array([121, 200, 150,  99])

In [22]:
#eleman sorgulama

In [23]:
'reg' in seri

True

In [24]:
seri['reg']

121

In [25]:
#fancy eleman

In [26]:
seri[['rf', 'reg']]

rf      99
reg    121
dtype: int64

In [27]:
seri['reg'] = 130

In [28]:
seri['reg']

130

In [29]:
seri['reg':'loj']

reg    130
loj    200
dtype: int64

# Pandas DataFrame Olusturma

In [31]:
import pandas as pd

In [32]:
l = [ 1,2,39,67,90]

In [33]:
l

[1, 2, 39, 67, 90]

In [34]:
pd.DataFrame(l, columns = ['degisken_ismi'])

Unnamed: 0,degisken_ismi
0,1
1,2
2,39
3,67
4,90


In [35]:
import numpy as np
m = np.arange(1,10).reshape((3,3))

In [36]:
pd.DataFrame(m, columns = ['var1', 'var2', 'var3'])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [37]:
#df isimlendirme

In [38]:
df = pd.DataFrame(m, columns = ['var1', 'var2', 'var3'])

In [39]:
df

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [40]:
df.columns = ('deg1', 'deg2', 'deg3')

In [41]:
df

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [42]:
type(df)

pandas.core.frame.DataFrame

In [43]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['deg1', 'deg2', 'deg3'], dtype='object')]

In [44]:
df.shape

(3, 3)

In [45]:
df.ndim

2

In [46]:
df.size

9

In [47]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [48]:
type(df.values)

numpy.ndarray

In [49]:
df.head()

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [50]:
df.tail()

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [51]:
a = np.array([1,2,3,4,5])

In [52]:
pd.DataFrame(a, columns = ['deg1'])

Unnamed: 0,deg1
0,1
1,2
2,3
3,4
4,5


# Eleman İşlemleri

In [54]:
import numpy as np
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)

In [55]:
sozluk = {'var1': s1, 'var2': s2, 'var3': s3}

In [56]:
sozluk

{'var1': array([0, 8, 5, 0, 5]),
 'var2': array([5, 0, 5, 3, 7]),
 'var3': array([4, 7, 0, 7, 9])}

In [57]:
df = pd.DataFrame(sozluk)

In [58]:
df

Unnamed: 0,var1,var2,var3
0,0,5,4
1,8,0,7
2,5,5,0
3,0,3,7
4,5,7,9


In [59]:
df[0:1]

Unnamed: 0,var1,var2,var3
0,0,5,4


In [60]:
df.index = ['a', 'b', 'c', 'd', 'e']

In [61]:
df

Unnamed: 0,var1,var2,var3
a,0,5,4
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [62]:
df['c':'e']

Unnamed: 0,var1,var2,var3
c,5,5,0
d,0,3,7
e,5,7,9


In [63]:
#silme

In [64]:
df.drop('a', axis = 0)

Unnamed: 0,var1,var2,var3
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [65]:
df

Unnamed: 0,var1,var2,var3
a,0,5,4
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [66]:
df.drop('a', axis = 0, inplace = True)

In [67]:
df

Unnamed: 0,var1,var2,var3
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [68]:
#fancy

In [69]:
l = ['c', 'e']

In [70]:
df.drop(l, axis = 0)

Unnamed: 0,var1,var2,var3
b,8,0,7
d,0,3,7


In [71]:
#degiskenler icin

In [72]:
'var1' in df

True

In [73]:
l = ['var1', 'var4', 'var2']

In [74]:
for i in l:
    print(i in df)

True
False
True


In [75]:
df

Unnamed: 0,var1,var2,var3
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [76]:
df['var4'] = df['var1'] / df['var2']

In [77]:
df

Unnamed: 0,var1,var2,var3,var4
b,8,0,7,inf
c,5,5,0,1.0
d,0,3,7,0.0
e,5,7,9,0.714286


In [78]:
df.drop('var4', axis = 1, inplace = True)

In [79]:
df

Unnamed: 0,var1,var2,var3
b,8,0,7
c,5,5,0
d,0,3,7
e,5,7,9


In [80]:
l = ['var1', 'var2']

In [81]:
df.drop(l, axis = 1)

Unnamed: 0,var3
b,7
c,0
d,7
e,9


# Gozlem ve Degisken Secimi: loc ve iloc

In [83]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df = pd.DataFrame(m,columns = ['var1', 'var2', 'var3'])
df

Unnamed: 0,var1,var2,var3
0,23,12,11
1,9,6,13
2,3,5,3
3,28,23,19
4,25,1,6
5,13,12,19
6,19,7,4
7,25,28,5
8,28,5,5
9,13,8,16


In [84]:
#loc: tanimlandigi sekli ile secim yapmak icin kullanılır

In [85]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,23,12,11
1,9,6,13
2,3,5,3
3,28,23,19


In [86]:
#iloc: alisik oldugumuz indexleme mantigiyla secim yapar

In [87]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,23,12,11
1,9,6,13
2,3,5,3


# Kosullu Eleman Islemleri

In [89]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df = pd.DataFrame(m, columns = ['var1', 'var2', 'var3'])
df

Unnamed: 0,var1,var2,var3
0,25,28,26
1,24,2,18
2,9,15,19
3,17,3,23
4,2,28,8
5,19,4,8
6,23,23,16
7,7,14,1
8,10,5,12
9,5,6,1


In [90]:
df['var1']

0    25
1    24
2     9
3    17
4     2
5    19
6    23
7     7
8    10
9     5
Name: var1, dtype: int64

In [91]:
df['var1'][0:2]

0    25
1    24
Name: var1, dtype: int64

In [92]:
df[['var1','var2']][0:2]

Unnamed: 0,var1,var2
0,25,28
1,24,2


In [93]:
df[df.var1 > 15]

Unnamed: 0,var1,var2,var3
0,25,28,26
1,24,2,18
3,17,3,23
5,19,4,8
6,23,23,16


In [94]:
df[df.var1 > 15]['var2']

0    28
1     2
3     3
5     4
6    23
Name: var2, dtype: int64

In [95]:
#birden fazla koşul

In [96]:
df[(df.var1 > 15) & (df.var3 < 5)]

Unnamed: 0,var1,var2,var3


In [97]:
df.loc[(df.var1 > 15), ['var1', 'var2']] #loc olmadan çalıştırdığımızda hata alıyoruz

Unnamed: 0,var1,var2
0,25,28
1,24,2
3,17,3
5,19,4
6,23,23


In [98]:
df[(df.var1 > 15)][['var1', 'var3']] #loc olmadn bu şekilde çalıştırılabilir

Unnamed: 0,var1,var3
0,25,26
1,24,18
3,17,23
5,19,8
6,23,16


# Birlestirme (Join) Islemleri

In [100]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df1 = pd.DataFrame(m, columns = ['var1', 'var2', 'var3'])
df1

Unnamed: 0,var1,var2,var3
0,3,20,4
1,28,22,15
2,17,18,3
3,15,23,12
4,18,28,15
5,9,6,16
6,28,15,10
7,5,15,5
8,9,23,12
9,5,5,5


In [101]:
df2 = df1 + 99

In [102]:
pd.concat([df1, df2])

Unnamed: 0,var1,var2,var3
0,3,20,4
1,28,22,15
2,17,18,3
3,15,23,12
4,18,28,15
5,9,6,16
6,28,15,10
7,5,15,5
8,9,23,12
9,5,5,5


In [103]:
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,var1,var2,var3
0,3,20,4
1,28,22,15
2,17,18,3
3,15,23,12
4,18,28,15
5,9,6,16
6,28,15,10
7,5,15,5
8,9,23,12
9,5,5,5


In [104]:
df1.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [105]:
df2.columns = ['var1', 'var2', 'deg3']

In [106]:
pd.concat([df1,df2], ignore_index = True, join = 'inner')

Unnamed: 0,var1,var2
0,3,20
1,28,22
2,17,18
3,15,23
4,18,28
5,9,6
6,28,15
7,5,15
8,9,23
9,5,5


In [107]:
df1

Unnamed: 0,var1,var2,var3
0,3,20,4
1,28,22,15
2,17,18,3
3,15,23,12
4,18,28,15
5,9,6,16
6,28,15,10
7,5,15,5
8,9,23,12
9,5,5,5


# Ileri Birlestirme Islemleri

In [109]:
#birebir birlestirme

In [110]:
df1 = pd.DataFrame({'calisanlar': ['Ali', 'Veli', 'Ayse', 'Fatma'],
                    'grup' : ['Muhasebe', 'Muhendislik', 'Muhendislik', 'IK']})
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,IK


In [111]:
 df2 = pd.DataFrame({'calisanlar': ['Ayse', 'Ali', 'Veli', 'Fatma'],
                    'ilk_giris': [2010, 2009, 2014, 2019]})
df2

Unnamed: 0,calisanlar,ilk_giris
0,Ayse,2010
1,Ali,2009
2,Veli,2014
3,Fatma,2019


In [112]:
pd.merge(df1, df2)

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2009
1,Veli,Muhendislik,2014
2,Ayse,Muhendislik,2010
3,Fatma,IK,2019


In [113]:
pd.merge(df1, df2, on = 'calisanlar')

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2009
1,Veli,Muhendislik,2014
2,Ayse,Muhendislik,2010
3,Fatma,IK,2019


In [114]:
#coktan teke

In [115]:
df3 = pd.merge(df1, df2)

In [116]:
df3

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2009
1,Veli,Muhendislik,2014
2,Ayse,Muhendislik,2010
3,Fatma,IK,2019


In [117]:
df4 = pd.DataFrame({'grup': ['Muhasebe', 'Muhendislik', 'IK'],
                    'mudur': ['Caner', 'Mustafa', 'Berkcan']})
df4

Unnamed: 0,grup,mudur
0,Muhasebe,Caner
1,Muhendislik,Mustafa
2,IK,Berkcan


In [118]:
pd.merge(df3,df4)

Unnamed: 0,calisanlar,grup,ilk_giris,mudur
0,Ali,Muhasebe,2009,Caner
1,Veli,Muhendislik,2014,Mustafa
2,Ayse,Muhendislik,2010,Mustafa
3,Fatma,IK,2019,Berkcan


In [119]:
#coktan coka

In [120]:
df5 = pd.DataFrame({'grup': ['Muhasebe', 'Muhasebe', 'Muhendislik', 'Muhendislik', 'IK', 'IK'],
                    'yetenekler': ['matematik', 'excel', 'kodlama', 'linux', 'excel', 'yonetim']})
df5

Unnamed: 0,grup,yetenekler
0,Muhasebe,matematik
1,Muhasebe,excel
2,Muhendislik,kodlama
3,Muhendislik,linux
4,IK,excel
5,IK,yonetim


In [121]:
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,IK


In [122]:
pd.merge(df1, df5)

Unnamed: 0,calisanlar,grup,yetenekler
0,Ali,Muhasebe,matematik
1,Ali,Muhasebe,excel
2,Veli,Muhendislik,kodlama
3,Veli,Muhendislik,linux
4,Ayse,Muhendislik,kodlama
5,Ayse,Muhendislik,linux
6,Fatma,IK,excel
7,Fatma,IK,yonetim


# Toplulastirma ve Gruplama (Aggregation & Grouping)

Basit toplulaştırma fonksiyonları:

* count()
* first()
* last()
* mean()
* median()
* min()
* max()
* std()
* var()
* sum()

In [125]:
import seaborn as sns

In [126]:
?sns.load_dataset

[0;31mSignature:[0m [0msns[0m[0;34m.[0m[0mload_dataset[0m[0;34m([0m[0mname[0m[0;34m,[0m [0mcache[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mdata_home[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkws[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load an example dataset from the online repository (requires internet).

This function provides quick access to a small number of example datasets
that are useful for documenting seaborn or generating reproducible examples
for bug reports. It is not necessary for normal usage.

Note that some of the datasets have a small amount of preprocessing applied
to define a proper ordering for categorical variables.

Use :func:`get_dataset_names` to see a list of available datasets.

Parameters
----------
name : str
    Name of the dataset (``{name}.csv`` on
    https://github.com/mwaskom/seaborn-data).
cache : boolean, optional
    If True, try to load from the local cache first, and save to the cache
 

In [127]:
df = sns.load_dataset('planets')
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [128]:
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [129]:
df.shape

(1035, 6)

In [130]:
df.mean

<bound method DataFrame.mean of                method  number  orbital_period   mass  distance  year
0     Radial Velocity       1      269.300000   7.10     77.40  2006
1     Radial Velocity       1      874.774000   2.21     56.95  2008
2     Radial Velocity       1      763.000000   2.60     19.84  2011
3     Radial Velocity       1      326.030000  19.40    110.62  2007
4     Radial Velocity       1      516.220000  10.50    119.47  2009
...               ...     ...             ...    ...       ...   ...
1030          Transit       1        3.941507    NaN    172.00  2006
1031          Transit       1        2.615864    NaN    148.00  2007
1032          Transit       1        3.191524    NaN    174.00  2007
1033          Transit       1        4.125083    NaN    293.00  2008
1034          Transit       1        4.187757    NaN    260.00  2008

[1035 rows x 6 columns]>

In [131]:
df['mass'].mean()

2.6381605847953216

In [132]:
df.count()

method            1035
number            1035
orbital_period     992
mass               513
distance           808
year              1035
dtype: int64

In [133]:
kat_df = df.select_dtypes(include = ["object"])

In [134]:
df.describe() #betimsel istatistik

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [135]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


# Gruplama Islemleri

In [137]:
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                   'veri': [10,11,52,23,43,55]}, columns = ['gruplar', 'veri'])
df

Unnamed: 0,gruplar,veri
0,A,10
1,B,11
2,C,52
3,A,23
4,B,43
5,C,55


In [138]:
 df.groupby('gruplar')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x160b06b90>

In [139]:
 df.groupby('gruplar').mean()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,16.5
B,27.0
C,53.5


In [140]:
df = sns.load_dataset('planets')
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [141]:
df.groupby('method').mean()

Unnamed: 0_level_0,number,orbital_period,mass,distance,year
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Astrometry,1.0,631.18,,17.875,2011.5
Eclipse Timing Variations,1.666667,4751.644444,5.125,315.36,2010.0
Imaging,1.315789,118247.7375,,67.715937,2009.131579
Microlensing,1.173913,3153.571429,,4144.0,2009.782609
Orbital Brightness Modulation,1.666667,0.709307,,1180.0,2011.666667
Pulsar Timing,2.2,7343.021201,,1200.0,1998.4
Pulsation Timing Variations,1.0,1170.0,,,2007.0
Radial Velocity,1.721519,823.35468,2.630699,51.600208,2007.518987
Transit,1.95466,21.102073,1.47,599.29808,2011.236776
Transit Timing Variations,2.25,79.7835,,1104.333333,2012.5


In [142]:
df.groupby('method')['orbital_period'].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [143]:
df.groupby('method')['orbital_period'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


# Ileri Toplulastirma Islemi

In [145]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'degisken1': [10,23,33,22,11,99],
                   'degisken2': [100,253,333,262,111,969]},
                  columns = ['gruplar', 'degisken1', 'degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [146]:
#aggregate

In [147]:
df.groupby('gruplar').mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,181.0
B,17.0,182.0
C,66.0,651.0


In [148]:
df.groupby('gruplar').aggregate(['min', np.median, max ])

  df.groupby('gruplar').aggregate(['min', np.median, max ])
  df.groupby('gruplar').aggregate(['min', np.median, max ])


Unnamed: 0_level_0,degisken1,degisken1,degisken1,degisken2,degisken2,degisken2
Unnamed: 0_level_1,min,median,max,min,median,max
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,16.0,22,100,181.0,262
B,11,17.0,23,111,182.0,253
C,33,66.0,99,333,651.0,969


In [149]:
df.groupby('gruplar').aggregate({'degisken1': 'min',
                                'degisken2': 'max'})

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,262
B,11,253
C,33,969


# filter

In [151]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                   'degisken1': [10,23,33,22,11,99],
                   'degisken2': [100,253,333,262,111,969]},
                    columns = ['gruplar', 'degisken1', 'degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [152]:
df.groupby('gruplar').std()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8.485281,114.551299
B,8.485281,100.409163
C,46.669048,449.719913


In [153]:
def filter_func(x):
    return x['degisken1'].std() > 9

In [154]:
df.groupby('gruplar').filter(filter_func)

Unnamed: 0,gruplar,degisken1,degisken2
2,C,33,333
5,C,99,969


# transform

In [156]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                   'degisken1': [10,23,33,22,11,99],
                   'degisken2': [100,253,333,262,111,969]}, 
                    columns = ['gruplar', 'degisken1', 'degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [157]:
df['degisken1']*9

0     90
1    207
2    297
3    198
4     99
5    891
Name: degisken1, dtype: int64

In [158]:
df_a = df.iloc[:,1:3]

In [159]:
df_a.transform(lambda x: x-x.mean())

Unnamed: 0,degisken1,degisken2
0,-23.0,-238.0
1,-10.0,-85.0
2,0.0,-5.0
3,-11.0,-76.0
4,-22.0,-227.0
5,66.0,631.0


In [160]:
df_a.transform(lambda x: (x-x.mean() / x.std()))

Unnamed: 0,degisken1,degisken2
0,9.013055,98.951261
1,22.013055,251.951261
2,32.013055,331.951261
3,21.013055,260.951261
4,10.013055,109.951261
5,98.013055,967.951261


# apply

In [162]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                   'degisken1': [10,23,33,22,11,99],
                   'degisken2': [100,253,333,262,111,969]}, 
                    columns = ['gruplar', 'degisken1', 'degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [163]:
df_a = df.iloc[:,1:3]

In [164]:
df_a.apply(np.sum)

degisken1     198
degisken2    2028
dtype: int64

In [165]:
df_a.apply(np.mean)

degisken1     33.0
degisken2    338.0
dtype: float64

In [166]:
df.groupby('gruplar').apply(np.sum)

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


Unnamed: 0_level_0,gruplar,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,AA,32,362
B,BB,34,364
C,CC,132,1302


# Pivot Tablolar

In [168]:
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [169]:
titanic.groupby('sex')['survived'].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [170]:
titanic.groupby('sex')[['survived']].mean() #iki köşeli parantez ile dataframe yapabiliriz

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [171]:
titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean')

  titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean')


Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,class,Unnamed: 2_level_1
female,First,0.968085
female,Second,0.921053
female,Third,0.5
male,First,0.368852
male,Second,0.157407
male,Third,0.135447


In [172]:
titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean').unstack()

  titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean').unstack()


Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [173]:
#pivot ile table

In [174]:
titanic.pivot_table('survived', index = 'sex', columns = 'class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [175]:
titanic.age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [176]:
age = pd.cut(titanic['age'], [0,18,90])
age.head(10)

0    (18.0, 90.0]
1    (18.0, 90.0]
2    (18.0, 90.0]
3    (18.0, 90.0]
4    (18.0, 90.0]
5             NaN
6    (18.0, 90.0]
7     (0.0, 18.0]
8    (18.0, 90.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 90]]

In [177]:
titanic.pivot_table('survived', ['sex', age], 'class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 90]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 90]",0.375,0.071429,0.133663


# Dis Kaynaklı Veri Okumak

In [179]:
import pandas as pd

In [180]:
#csv okuma
pd.read_csv('reading_data/ornekcsv.csv', sep = ';')

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [181]:
#txt okuma
pd.read_csv('reading_data/duz_metin.txt')

Unnamed: 0,1 2
0,2 2
1,3 2
2,4 2
3,5 2
4,6 2
5,7 2
6,8 2
7,9 2
8,10 2


In [182]:
df = pd.read_excel('reading_data/ornekx.xlsx')

In [183]:
df.head()

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0


In [184]:
df.columns = ('A','B','C')

In [185]:
df

Unnamed: 0,A,B,C
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0
