# Pandas Serisi Oluşturmak

In [1]:
import pandas as pd

In [2]:
pd.Series([1,2,3,4,5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
seri = pd.Series([1,2,3,4,5])

In [4]:
type(seri)

pandas.core.series.Series

In [5]:
seri.axes

[RangeIndex(start=0, stop=5, step=1)]

In [6]:
seri.dtype

dtype('int64')

In [7]:
seri.size

5

In [8]:
seri.ndim

1

In [9]:
seri.values

array([1, 2, 3, 4, 5])

In [10]:
seri.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [11]:
seri.head(3)

0    1
1    2
2    3
dtype: int64

In [13]:
seri.tail(3)

2    3
3    4
4    5
dtype: int64

## Index İsimlendirmesi

In [15]:
pd.Series([99,22,332,94,5])

0     99
1     22
2    332
3     94
4      5
dtype: int64

In [16]:
pd.Series([99,22,332,94,5], index =[1,3,5,7,9])

1     99
3     22
5    332
7     94
9      5
dtype: int64

In [17]:
pd.Series([99,22,332,94,5],index = ["a","b","c","d","e"])

a     99
b     22
c    332
d     94
e      5
dtype: int64

In [18]:
seri = pd.Series([99,22,332,94,5],index = ["a","b","c","d","e"])

In [19]:
seri["a"]

99

In [20]:
seri["a":"c"]

a     99
b     22
c    332
dtype: int64

## Sözlük Üzerinden Liste Oluşturmak

In [22]:
 sozluk = pd.Series ({"reg":10,"log":11,"cart":12})

In [23]:
sozluk

reg     10
log     11
cart    12
dtype: int64

## İki seriyi birleştirerek seri oluşturma

In [24]:
pd.concat([seri,seri])

a     99
b     22
c    332
d     94
e      5
a     99
b     22
c    332
d     94
e      5
dtype: int64

## Eleman İşlemleri

In [26]:
import numpy as np

In [27]:
a = np.array([1,2,33,444,5])
seri = pd.Series(a)
seri

0      1
1      2
2     33
3    444
4      5
dtype: int64

In [28]:
seri[0]

1

In [30]:
seri[0:3]

0     1
1     2
2    33
dtype: int64

In [32]:
seri = pd.Series([121,200,150,99],
                 index = ["reg","loj","cart","rf"] )

In [33]:
seri

reg     121
loj     200
cart    150
rf       99
dtype: int64

In [34]:
seri.index

Index(['reg', 'loj', 'cart', 'rf'], dtype='object')

In [35]:
seri.keys

<bound method Series.keys of reg     121
loj     200
cart    150
rf       99
dtype: int64>

In [36]:
list(seri.items())

[('reg', 121), ('loj', 200), ('cart', 150), ('rf', 99)]

In [37]:
seri.values

array([121, 200, 150,  99])

## Eleman Sorgulama

In [38]:
"reg" in seri

True

In [39]:
"a" in seri

False

In [40]:
seri["reg"]

121

## Fancy ile eleman seçme

In [41]:
serii = ["reg"]
seri[serii]

reg    121
dtype: int64

In [42]:
seri[["rf","reg"]]

rf      99
reg    121
dtype: int64

In [43]:
seri["reg"] = 130

In [44]:
seri["reg"]

130

In [45]:
seri["reg":"loj"]

reg    130
loj    200
dtype: int64

# Pandas DataFrame Oluşturma 

* Yapısal veritipidir
* Excel'e benzerdir

In [46]:
import pandas as pd

In [47]:
l = [1,2,39,67,90]

In [48]:
l

[1, 2, 39, 67, 90]

In [51]:
pd.DataFrame(l, columns = ["degisken_ismi"])

Unnamed: 0,degisken_ismi
0,1
1,2
2,39
3,67
4,90


In [54]:
import numpy as np
m = np.arange(1,10).reshape((3,3))
m

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [56]:
pd.DataFrame(m, columns = ["var1","var2","var3"])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


## Df isimlendirme

In [57]:
df = pd.DataFrame(m, columns = ["var1","var2","var3"])

In [58]:
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [60]:
df.columns =("deg1","deg2","deg3")

In [61]:
df

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


**Özellikleri**

In [62]:
type(df)

pandas.core.frame.DataFrame

In [63]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['deg1', 'deg2', 'deg3'], dtype='object')]

In [64]:
df.shape

(3, 3)

In [65]:
df.ndim

2

In [66]:
df.size

9

In [67]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [68]:
type(df.values)

numpy.ndarray

In [69]:
df.head()

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [71]:
df.tail(1)

Unnamed: 0,deg1,deg2,deg3
2,7,8,9


In [72]:
a = np.array([1,2,3,4,5])

In [73]:
pd.DataFrame(a, columns = ["deg1"])

Unnamed: 0,deg1
0,1
1,2
2,3
3,4
4,5


## Eleman İşlemleri

In [80]:
import numpy as np
s1 = np.random.randint(10,size = 5)
s2 = np.random.randint(10,size = 5)
s3 = np.random.randint(10,size = 5)

In [81]:
s1

array([6, 4, 2, 5, 8])

In [82]:
s2

array([2, 6, 6, 1, 4])

In [83]:
s3

array([1, 5, 6, 7, 9])

In [86]:
sozluk = {"var1" : s1,"var2" : s2,"var3" : s3 }

In [87]:
sozluk

{'var1': array([6, 4, 2, 5, 8]),
 'var2': array([2, 6, 6, 1, 4]),
 'var3': array([1, 5, 6, 7, 9])}

In [105]:
df = pd.DataFrame(sozluk)

In [89]:
df

Unnamed: 0,var1,var2,var3
0,6,2,1
1,4,6,5
2,2,6,6
3,5,1,7
4,8,4,9


In [90]:
df[0:1]

Unnamed: 0,var1,var2,var3
0,6,2,1


In [91]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [106]:
df.index = ["a","b","c","d","e"]

In [93]:
df

Unnamed: 0,var1,var2,var3
a,6,2,1
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


 ## Silme

In [107]:
df.drop("a", axis = 0)

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [95]:
df

Unnamed: 0,var1,var2,var3
a,6,2,1
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [108]:
df.drop("a", axis = 0, inplace = True)
# axis = satırı veya sütunu belirtir

In [109]:
df

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [110]:
# Fancy ile

In [111]:
l = ["c","e"]

In [112]:
 df.drop(l,axis = 0)

Unnamed: 0,var1,var2,var3
b,4,6,5
d,5,1,7


In [113]:
# Değişkenler İçin

In [114]:
df

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [115]:
"var1" in df

True

In [116]:
l = ["var1","var4","var2"]

In [117]:
for i in l:
    print(i in df)

True
False
True


In [118]:
df

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [119]:
df["var1"]

b    4
c    2
d    5
e    8
Name: var1, dtype: int64

In [120]:
 df["var4"] = df["var1"] / df["var2"]

In [121]:
df

Unnamed: 0,var1,var2,var3,var4
b,4,6,5,0.666667
c,2,6,6,0.333333
d,5,1,7,5.0
e,8,4,9,2.0


In [122]:
#Değişken Silme

In [123]:
df.drop("var4", axis =1)

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [124]:
df

Unnamed: 0,var1,var2,var3,var4
b,4,6,5,0.666667
c,2,6,6,0.333333
d,5,1,7,5.0
e,8,4,9,2.0


In [125]:
df.drop("var4", axis =1, inplace = True)

In [126]:
df

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


In [127]:
l = ["var1","var2"]

In [128]:
df.drop(l, axis =1)

Unnamed: 0,var3
b,5
c,6
d,7
e,9


In [129]:
df

Unnamed: 0,var1,var2,var3
b,4,6,5
c,2,6,6
d,5,1,7
e,8,4,9


# _Gözlem ve Değişken Seçimi: loc & iloc_

In [130]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30,size=(10,3))
df = pd.DataFrame(m , columns = ["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,17,27,9
1,3,12,28
2,7,20,19
3,15,8,3
4,13,18,2
5,25,15,25
6,28,6,6
7,1,15,10
8,14,8,5
9,6,3,28


## loc: Tanımlandığı şekli ile seçim yapmak için kullanılır.

In [131]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,17,27,9
1,3,12,28
2,7,20,19
3,15,8,3


## iloc: Alışık olduğumuz indeksleme mantığıyla işlem yapar.

In [132]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,17,27,9
1,3,12,28
2,7,20,19


In [133]:
df.iloc[0,0]

17

In [134]:
df.iloc[:2,:3]

Unnamed: 0,var1,var2,var3
0,17,27,9
1,3,12,28


In [137]:
df.loc[0:3,"var3"]

0     9
1    28
2    19
3     3
Name: var3, dtype: int64

* Mutlak bir şekilde değişken ismi veya gözlem birimleriyle yani indekslerin isimleriyle seçme işlemlerinda **loc** tercih edilir.
* İndeksli seçim yaklaşımında(klasik tip) **iloc** tercih edilir.

In [139]:
df.iloc[0:3,1:3] 

Unnamed: 0,var2,var3
0,27,9
1,12,28
2,20,19


In [140]:
df.iloc[0:3]["var3"]

0     9
1    28
2    19
Name: var3, dtype: int64

 # Koşullu Eleman İşlemleri 

In [141]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30,size=(10,3))
df = pd.DataFrame(m , columns = ["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,17,14,28
1,10,29,7
2,19,16,27
3,28,16,6
4,10,8,8
5,27,15,2
6,15,20,24
7,14,17,2
8,1,2,1
9,16,16,14


In [142]:
df["var1"]

0    17
1    10
2    19
3    28
4    10
5    27
6    15
7    14
8     1
9    16
Name: var1, dtype: int64

In [143]:
df

Unnamed: 0,var1,var2,var3
0,17,14,28
1,10,29,7
2,19,16,27
3,28,16,6
4,10,8,8
5,27,15,2
6,15,20,24
7,14,17,2
8,1,2,1
9,16,16,14


In [144]:
df.var1

0    17
1    10
2    19
3    28
4    10
5    27
6    15
7    14
8     1
9    16
Name: var1, dtype: int64

In [147]:
df[df.var1 > 15]

0    17
2    19
3    28
5    27
9    16
Name: var1, dtype: int64

In [150]:
df[(df.var1 > 15) & (df.var3<10)]

Unnamed: 0,var1,var2,var3
3,28,16,6
5,27,15,2


In [157]:
df.loc[(df.var1 > 15) , ["var1","var2"]]
# df[(df.var1 > 15)][["var1","var2"]]

Unnamed: 0,var1,var2
0,17,14
2,19,16
3,28,16
5,27,15
9,16,16


# Birleştirme(Join) İşlemleri

In [160]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30,size=(5,3))
df1 = pd.DataFrame(m , columns = ["var1","var2","var3"])
df1

Unnamed: 0,var1,var2,var3
0,1,29,15
1,24,7,22
2,28,25,28
3,23,4,9
4,8,22,19


In [161]:
df2 = df1 + 99

In [162]:
df2

Unnamed: 0,var1,var2,var3
0,100,128,114
1,123,106,121
2,127,124,127
3,122,103,108
4,107,121,118


In [164]:
pd.concat([df1,df2])
# index bakımından hatalı

Unnamed: 0,var1,var2,var3
0,1,29,15
1,24,7,22
2,28,25,28
3,23,4,9
4,8,22,19
0,100,128,114
1,123,106,121
2,127,124,127
3,122,103,108
4,107,121,118


In [166]:
?pd.concat
#Çözüm yolu olup olmadığına bakıyoruz

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mobjs[0m[0;34m:[0m [0;34m'Iterable[NDFrame] | Mapping[Hashable, NDFrame]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin[0m[0;34m=[0m[0;34m'outer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeys[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverify_integrity[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m:[0m [0;34m'b

In [168]:
pd.concat([df1,df2],ignore_index= True)

Unnamed: 0,var1,var2,var3
0,1,29,15
1,24,7,22
2,28,25,28
3,23,4,9
4,8,22,19
5,100,128,114
6,123,106,121
7,127,124,127
8,122,103,108
9,107,121,118


In [169]:
df1.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [172]:
df2.columns = ["var1","var2","deg3"]

In [173]:
df2

Unnamed: 0,var1,var2,deg3
0,100,128,114
1,123,106,121
2,127,124,127
3,122,103,108
4,107,121,118


In [174]:
df1

Unnamed: 0,var1,var2,var3
0,1,29,15
1,24,7,22
2,28,25,28
3,23,4,9
4,8,22,19


In [175]:
pd.concat([df1,df2])

Unnamed: 0,var1,var2,var3,deg3
0,1,29,15.0,
1,24,7,22.0,
2,28,25,28.0,
3,23,4,9.0,
4,8,22,19.0,
0,100,128,,114.0
1,123,106,,121.0
2,127,124,,127.0
3,122,103,,108.0
4,107,121,,118.0


In [178]:
pd.concat([df1,df2], join = "inner", ignore_index = True)
# Kesişimlere göre birleştirir

Unnamed: 0,var1,var2
0,1,29
1,24,7
2,28,25
3,23,4
4,8,22
5,100,128
6,123,106
7,127,124
8,122,103
9,107,121


In [184]:
# pd.concat([df1, df2], join_axes = [df2.columns])

## İleri Birleştirme İşlemleri

In [4]:
import pandas as pd 

In [186]:
#birebir birlestirme

In [5]:
df1 = pd.DataFrame({'calisanlar':['Ali','Veli','Ayse','Fatma'],
                    'grup':['Muhasebe','Muhendislik','Muhendislik','İK']})
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,İK


In [17]:
df2 = pd.DataFrame({'calisanlar':['Ayse','Ali','Veli','Fatma'],
                    'ilk_giris':[2010,2009,2014,2019]})
df2

Unnamed: 0,calisanlar,ilk_giris
0,Ayse,2010
1,Ali,2009
2,Veli,2014
3,Fatma,2019


In [18]:
pd.merge(df1,df2)

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2009
1,Veli,Muhendislik,2014
2,Ayse,Muhendislik,2010
3,Fatma,İK,2019


In [14]:
pd.merge(df1,df2,on="calisanlar")

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2010
1,Veli,Muhendislik,2009
2,Ayse,Muhendislik,2014
3,Fatma,İK,2019


In [193]:
#coktan teke

In [19]:
df3 = pd.merge(df1,df2)
df3

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2009
1,Veli,Muhendislik,2014
2,Ayse,Muhendislik,2010
3,Fatma,İK,2019


In [23]:
df4 = pd.DataFrame({'grup':['Muhasebe', 'Muhendislik', 'İK'],
                    'mudur':['Caner', 'Mustafa', 'Berkcan']})
df4

Unnamed: 0,grup,mudur
0,Muhasebe,Caner
1,Muhendislik,Mustafa
2,İK,Berkcan


In [24]:
pd.merge(df3,df4)

Unnamed: 0,calisanlar,grup,ilk_giris,mudur
0,Ali,Muhasebe,2009,Caner
1,Veli,Muhendislik,2014,Mustafa
2,Ayse,Muhendislik,2010,Mustafa
3,Fatma,İK,2019,Berkcan


In [22]:
# coktan coka

In [25]:
df5 = pd.DataFrame({'grup':['Muhasebe','Muhasebe' ,
                            'Muhendislik','Muhendislik', 'İK', 'İK'],
                    'yetenekler':['matematik', 'excel', 'kodlama','linux', 'excel','yonetim']})
df5

Unnamed: 0,grup,yetenekler
0,Muhasebe,matematik
1,Muhasebe,excel
2,Muhendislik,kodlama
3,Muhendislik,linux
4,İK,excel
5,İK,yonetim


In [26]:
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,İK


In [27]:
pd.merge(df1,df5)

Unnamed: 0,calisanlar,grup,yetenekler
0,Ali,Muhasebe,matematik
1,Ali,Muhasebe,excel
2,Veli,Muhendislik,kodlama
3,Veli,Muhendislik,linux
4,Ayse,Muhendislik,kodlama
5,Ayse,Muhendislik,linux
6,Fatma,İK,excel
7,Fatma,İK,yonetim


# Toplulaştırma ve Gruplama (Aggregation & Grouping)

Basit Toplulaştırma Fonksiyonları: 
* count()
* first()
* last()
* mean()
* median()
* min()
* max()
* std()
* var()
* sum()

In [29]:
import seaborn as sns

In [30]:
?sns.load_dataset

[0;31mSignature:[0m [0msns[0m[0;34m.[0m[0mload_dataset[0m[0;34m([0m[0mname[0m[0;34m,[0m [0mcache[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mdata_home[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkws[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load an example dataset from the online repository (requires internet).

This function provides quick access to a small number of example datasets
that are useful for documenting seaborn or generating reproducible examples
for bug reports. It is not necessary for normal usage.

Note that some of the datasets have a small amount of preprocessing applied
to define a proper ordering for categorical variables.

Use :func:`get_dataset_names` to see a list of available datasets.

Parameters
----------
name : str
    Name of the dataset (``{name}.csv`` on
    https://github.com/mwaskom/seaborn-data).
cache : boolean, optional
    If True, try to load from the local cache first, and save to the cache
 

In [36]:
df = sns.load_dataset("planets")
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [33]:
df.shape

(1035, 6)

In [34]:
df.ndim

2

In [35]:
df.mean()

  df.mean()


number               1.785507
orbital_period    2002.917596
mass                 2.638161
distance           264.069282
year              2009.070531
dtype: float64

In [38]:
df["mass"].std()

3.8186166509616046

In [41]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [42]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


In [45]:
df.dropna().describe().T
# NaN 'ları çıkartır

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,498.0,1.73494,1.17572,1.0,1.0,1.0,2.0,6.0
orbital_period,498.0,835.778671,1469.128259,1.3283,38.27225,357.0,999.6,17337.5
mass,498.0,2.50932,3.636274,0.0036,0.2125,1.245,2.8675,25.0
distance,498.0,52.068213,46.596041,1.35,24.4975,39.94,59.3325,354.0
year,498.0,2007.37751,4.167284,1989.0,2005.0,2009.0,2011.0,2014.0


## Gruplama İşlemleri

In [46]:
df = pd.DataFrame({'gruplar':['A','B','C','A','B','C'],
                  'veri': [10,11,52,23,43,55]}, columns=['gruplar','veri'])
df

Unnamed: 0,gruplar,veri
0,A,10
1,B,11
2,C,52
3,A,23
4,B,43
5,C,55


In [47]:
df.groupby("gruplar")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8c98eb0640>

In [48]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,16.5
B,27.0
C,53.5


In [49]:
df.groupby("gruplar").sum()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,33
B,54
C,107


In [50]:
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [56]:
df.groupby("method")["orbital_period"].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [57]:
df.groupby("method")["orbital_period"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


## İleri Toplulaştırma İşlemleri (Aggregate,filter,transform,apply)

In [66]:
import pandas as pd 
import numpy as np

In [60]:
df = pd.DataFrame({'gruplar':['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,253,333,262,111,969]},
                  columns =['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


## Aggregate Fonksiyonu

In [63]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,181.0
B,17.0,182.0
C,66.0,651.0


In [70]:
df.groupby("gruplar").aggregate(["min",np.median,max]).T

Unnamed: 0,gruplar,A,B,C
degisken1,min,10.0,11.0,33.0
degisken1,median,16.0,17.0,66.0
degisken1,max,22.0,23.0,99.0
degisken2,min,100.0,111.0,333.0
degisken2,median,181.0,182.0,651.0
degisken2,max,262.0,253.0,969.0


In [71]:
df.groupby("gruplar").aggregate({"degisken1":"min","degisken2":max})

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,262
B,11,253
C,33,969


## Filter Fonksiyonu

In [72]:
df = pd.DataFrame({'gruplar':['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,253,333,262,111,969]},
                  columns =['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [73]:
def filter_func(x):
    return x["degisken1"].std()>9

In [75]:
df.groupby("gruplar").std()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8.485281,114.551299
B,8.485281,100.409163
C,46.669048,449.719913


In [74]:
df.groupby("gruplar").filter(filter_func)

Unnamed: 0,gruplar,degisken1,degisken2
2,C,33,333
5,C,99,969


## Transform Fonksiyonu

In [76]:
df = pd.DataFrame({'gruplar':['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,253,333,262,111,969]},
                  columns =['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [77]:
df["degisken1"]*9

0     90
1    207
2    297
3    198
4     99
5    891
Name: degisken1, dtype: int64

In [81]:
df_a = df.iloc[:,1:3]

In [83]:
df_a.transform(lambda x: (x-x.mean())/x.std())

Unnamed: 0,degisken1,degisken2
0,-0.687871,-0.738461
1,-0.299074,-0.263736
2,0.0,-0.015514
3,-0.328982,-0.235811
4,-0.657963,-0.704331
5,1.97389,1.957853


## Apply Fonksiyonu

In [96]:
df = pd.DataFrame({'gruplar':['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,253,333,262,111,969]},
                  columns =['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


* df = pd.DataFrame({
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,253,333,262,111,969]},
                  columns =['degisken1','degisken2'])
* df

In [94]:
df.apply(np.sum) 

degisken1     198
degisken2    2028
dtype: int64

In [95]:
df.apply(np.mean)

degisken1     33.0
degisken2    338.0
dtype: float64

In [97]:
df.groupby("gruplar").apply(np.sum)

Unnamed: 0_level_0,gruplar,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,AA,32,362
B,BB,34,364
C,CC,132,1302


In [98]:
df.groupby("gruplar").apply(np.mean)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,181.0
B,17.0,182.0
C,66.0,651.0


# Pivot Tablolar

* Veri setleri üzerinde bazı satır ve sütun işlemleri yaparak veri setini amaca uygun hale getirmek için kullanılan yapılardır.
* groupby işlemleriyle karıştırılabilmektedir. groupbay'ın çok boyutlu versiyonu olarak düşünülebilir.

In [99]:
import pandas as pd
import seaborn as sns

titanic = sns. load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [101]:
titanic.groupby("sex")[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [107]:
titanic.groupby(["sex","class"])[["survived"]].aggregate("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,class,Unnamed: 2_level_1
female,First,0.968085
female,Second,0.921053
female,Third,0.5
male,First,0.368852
male,Second,0.157407
male,Third,0.135447


In [108]:
titanic.groupby(["sex","class"])[["survived"]].aggregate("mean").unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## Pivot ile table

In [109]:
titanic.pivot_table("survived",index = "sex", columns = "class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [111]:
titanic.age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [113]:
age = pd.cut(titanic["age"],[0,18,90])
age.head(10)

0    (18.0, 90.0]
1    (18.0, 90.0]
2    (18.0, 90.0]
3    (18.0, 90.0]
4    (18.0, 90.0]
5             NaN
6    (18.0, 90.0]
7     (0.0, 18.0]
8    (18.0, 90.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 90]]

In [114]:
titanic.pivot_table("survived",["sex",age],"class")

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 90]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 90]",0.375,0.071429,0.133663


# Dış Kaynaklı Veri Okumak

In [115]:
import pandas as pd

In [120]:
?pd.read_csv

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfilepath_or_buffer[0m[0;34m:[0m [0;34m'FilePathOrBuffer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msep[0m[0;34m=[0m[0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdelimiter[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m=[0m[0;34m'infer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnames[0m[0;34m=[0m[0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_col[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0musecols[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msqueeze[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprefix[0m[0;34m=[0m[0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmangle_dupe_cols[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[

In [118]:
#csv okuma
pd.read_csv("reading_data/ornekcsv.csv", sep = ";")

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [119]:
#txt okuma
pd.read_csv("reading_data/duz_metin.txt")

Unnamed: 0,1 2
0,2 2
1,3 2
2,4 2
3,5 2
4,6 2
5,7 2
6,8 2
7,9 2
8,10 2


In [121]:
#excel okuma 
pd.read_excel("reading_data/ornekx.xlsx")

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [122]:
df = pd.read_excel("reading_data/ornekx.xlsx")

In [123]:
type(df)

pandas.core.frame.DataFrame

In [124]:
df.head()

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0


In [125]:
df.columns = ("A","B","C")
df

Unnamed: 0,A,B,C
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [126]:
# Github'tan alınan verilerin txt dosyasına atılması(raw şekilde) ve okunması
tips = pd.read_csv("reading_data/data.txt")

In [128]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Bonus: Problem Çözme ve Doküman Okuma Kültürü

* %95 Yapısal Problemler ve Veri Ön İşleme 
* %5 Modelleme ve benzer işler