###*7.1 Handling Missing data*


In [7]:

import pandas as pd
import numpy as np
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

#Filtering Out Missing Data


In [8]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],         [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [12]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [None]:
data[4] = np.nan
data.dropna(axis="columns", how="all")

In [16]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [24]:
df=pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [25]:

df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.014274,,-1.752013
3,-0.434773,,-0.192125
4,-0.190176,0.816432,0.181209
5,-0.797411,0.340507,0.994874
6,0.46339,0.813269,-1.234597


#Filling In Missing Data

In [26]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.313199,0.5,0.0
1,-0.743701,0.5,0.0
2,0.014274,0.5,-1.752013
3,-0.434773,0.5,-0.192125
4,-0.190176,0.816432,0.181209
5,-0.797411,0.340507,0.994874
6,0.46339,0.813269,-1.234597


In [27]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

#7.2 Data Transformation

In [28]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],                      "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [29]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [30]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [33]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [38]:
data.drop_duplicates(subset=["k1"] )  #,keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [39]:
data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


#Transforming Data Using a Function or Mapping


In [42]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon","pastrami", "corned beef", "bacon","pastrami", "honey ham", "nova lox"],"bounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,bounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [51]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [52]:
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,bounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [53]:
def get_animal(x):
     return meat_to_animal[x]

get_animal("bacon")

'pig'

In [54]:
data["food"].map(get_animal)
data

Unnamed: 0,food,bounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [57]:
data["food"].map(lambda x: meat_to_animal[x] if x in data["food"].values else np.nan)



Unnamed: 0,food,bounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


#Replacing Values


In [59]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data


0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [60]:

data.replace(-999,np.nan, inplace=True)
#data.replace([-999, -1000], np.nan, inplace=True)
#data.replace([-999, -1000], [np.nan, 0])
data

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [62]:
np.arange(12).reshape((3, 4))

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

#Renaming Axis Indexes

In [65]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),                     index=["Ohio", "Colorado", "New York"],                 columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [70]:
def transform(x):
        return x[:4].upper()

data.index.map(transform)


Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [71]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [73]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [74]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [75]:
data.rename(index={"OHIO": "INDIANA"},columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#Discretization and Binning

In [81]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32,99]
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (60, 100], (35, 60], (35, 60], (25, 35], (60, 100]]
Length: 13
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [82]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1, 3], dtype=int8)

In [83]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [84]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [85]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    2
Name: count, dtype: int64

In [86]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages,bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult', 'Senior']
Length: 13
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [89]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [90]:
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)

[(0.53, 0.74], (0.74, 0.96], (0.53, 0.74], (0.091, 0.31], (0.74, 0.96], ..., (0.53, 0.74], (0.31, 0.53], (0.53, 0.74], (0.53, 0.74], (0.74, 0.96]]
Length: 20
Categories (4, interval[float64, right]): [(0.091, 0.31] < (0.31, 0.53] < (0.53, 0.74] < (0.74, 0.96]]

In [None]:
data = np.random.binomial(n=10, p=0.5, size=20)  #Şimdi Python bu işi 20 kere yapıyor, yani: 20 defa 10 kez yazı-tura atıyor, Her defasında kaç kez yazı geldiğini hesaplıyor,Sonra bu 20 sonucu bir listeye koyuyor.
data

array([3, 5, 2, 2, 4, 6, 5, 6, 2, 7, 3, 4, 7, 5, 7, 4, 5, 3, 4, 6])

In [97]:
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 5, precision=2)
quartiles.value_counts()

(-3.4099999999999997, -0.85]    200
(-0.85, -0.27]                  200
(-0.27, 0.2]                    200
(0.2, 0.76]                     200
(0.76, 2.89]                    200
Name: count, dtype: int64

In [98]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.403, -1.285]      100
(-1.285, -0.00193]    400
(-0.00193, 1.233]     400
(1.233, 2.889]        100
Name: count, dtype: int64

#Detecting and Filtering Outliers

In [126]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.018394,0.003139,-0.022626,-0.026014
std,1.007372,0.979381,1.042563,1.020782
min,-2.545389,-2.99895,-3.509685,-3.066265
25%,-0.66046,-0.673762,-0.737143,-0.72834
50%,-0.037579,0.031065,-0.012638,-0.014867
75%,0.618151,0.687635,0.690241,0.671961
max,3.514706,3.082849,3.340182,3.540603


In [103]:
col = data[2]
col

0      0.687836
1     -0.276387
2      0.262049
3     -0.105642
4      1.554655
         ...   
995    0.622438
996    2.438909
997   -1.731981
998    0.722587
999   -1.102717
Name: 2, Length: 1000, dtype: float64

In [104]:
col[col.abs() > 3]

311    3.400963
719   -3.125060
895   -3.115676
902   -3.529216
984    3.689451
Name: 2, dtype: float64

In [105]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
91,3.165173,-0.049402,0.913125,-0.327168
132,3.22114,0.947795,-0.980991,0.298209
311,0.9636,1.000764,3.400963,0.199294
457,1.293987,3.718479,-1.3125,-1.124196
641,-3.140186,1.887786,0.641724,-0.081477
719,0.929384,-0.237802,-3.12506,0.312314
895,0.419427,0.562882,-3.115676,0.95018
902,-0.664451,0.446441,-3.529216,0.962845
959,0.755918,-3.573104,-0.140487,-0.368183
984,-0.519559,-0.482641,3.689451,0.101264


In [106]:
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.006711,0.046348,0.023649,0.009458
std,0.990648,0.987714,1.023859,0.998805
min,-3.0,-3.0,-3.0,-2.888224
25%,-0.641634,-0.602559,-0.710785,-0.681129
50%,-0.040378,0.102531,0.047637,0.017708
75%,0.634159,0.737649,0.729946,0.738425
max,3.0,3.0,3.0,2.923404


In [None]:
a=np.sign(data) # ya 1 edir yada -1 eder, 0 ise 0 eder
a

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,1.0,1.0,1.0
3,-1.0,-1.0,-1.0,1.0
4,-1.0,-1.0,1.0,1.0
...,...,...,...,...
995,-1.0,1.0,1.0,-1.0
996,-1.0,1.0,1.0,1.0
997,1.0,1.0,-1.0,-1.0
998,1.0,-1.0,1.0,1.0


In [None]:
df = pd.DataFrame(np.arange(5 * 7).reshape(3,4))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [139]:
sampler = np.random.permutation(5)
sampler

array([0, 4, 3, 2, 1])

In [140]:
df.take(sampler)
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [141]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13


#Computing Indicator/Dummy Variables

In [142]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],"data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [143]:
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [145]:
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)
df_with_dummy = df[["data1"]].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0
