# Unit 3

## Unit 3.3 Checking for missing data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [48]:
# Load the 'titanic' example dataset through seaborn into a DataFrame.
df = sns.load_dataset('titanic')

In [49]:
# Frequency of each deck label; value_counts() leaves NaN out by default.
df['deck'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [50]:
# Include NaN in the tally to see how many deck entries are missing.
df['deck'].value_counts(dropna=False)

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64

In [51]:
# Peek at the last five rows; they contain missing age and deck values.
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [52]:
# Element-wise missing-value mask for the last five rows (True = missing).
df.tail().isna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
890,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [53]:
# Inverse mask for the last five rows (True = value present).
df.tail().notna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
887,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
888,True,True,True,False,True,True,True,True,True,True,True,False,True,True,True
889,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
890,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True


In [54]:
# Summing the mask down the rows gives the number of missing values per column.
df.tail().isnull().sum(axis=0)

survived       0
pclass         0
sex            0
age            1
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           3
embark_town    0
alive          0
alone          0
dtype: int64

In [55]:
# Summing the mask across the columns gives the number of missing values per row.
df.tail().isnull().sum(axis=1)

886    1
887    0
888    2
889    0
890    1
dtype: int64

In [56]:
# Column labels of the full titanic frame, for comparison with the filtered frame built next.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [57]:
# Drop every column that has fewer than 700 non-NA values
# (thresh = minimum number of non-NA values a column needs in order to be kept).
df_thresh = df.dropna(axis='columns', thresh=700)
df_thresh.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [58]:
# Restrict the NA check to the 'age' column: drop the rows whose age is missing.
df_age = df.dropna(subset=['age'], how='any', axis=0)
df_age.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [59]:
# (rows, columns) of the frame left after dropping rows with a missing age.
df_age.shape

(714, 15)

In [60]:
# Row count only; matches the first element of df_age.shape.
len(df_age)

714

In [61]:
# Mean of the observed ages (NaN entries are skipped); used below as the fill value.
mean_age = df['age'].mean()
mean_age

29.69911764705882

In [62]:
# Fill the missing ages with the mean and assign the result back to the column.
# Calling fillna(..., inplace=True) on df['age'] is chained assignment: it acts on
# an intermediate Series and, with pandas Copy-on-Write, never updates df itself.
df['age'] = df['age'].fillna(mean_age)

In [63]:
# Check the result: the age that was missing in row 888 should now hold the mean.
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,29.699118,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### Replacing missing data with the mode

In [64]:
# Positional slice of embark_town around a row with a missing value (row 829).
df['embark_town'].iloc[825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object

In [65]:
# Tally the non-missing towns, then take the label with the highest count (the mode).
town_counts = df['embark_town'].value_counts(dropna=True)
most_freq = town_counts.idxmax()
most_freq

'Southampton'

In [66]:
# Fill the missing towns with the most frequent one and assign the result back.
# fillna(..., inplace=True) on df['embark_town'] is chained assignment: it acts on
# an intermediate Series and, with pandas Copy-on-Write, never updates df itself.
df['embark_town'] = df['embark_town'].fillna(most_freq)

In [67]:
# Same positional slice as before the fill; row 829 should now show the most frequent town.
df['embark_town'].iloc[825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object

## Unit 3.4 Checking and processing duplicate data

In [68]:
# Small toy frame for the duplicate-handling examples: rows 0 and 1 are identical
# in every column. Note that this rebinds `df`, replacing the titanic frame.
toy_columns = {
    'c1': ['a', 'a', 'b', 'a', 'b'],
    'c2': [1, 1, 1, 2, 2],
    'c3': [1, 1, 2, 2, 2],
}
df = pd.DataFrame(toy_columns)
df

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [70]:
# Boolean mask: True for a row that repeats an earlier row in every column
# (the first occurrence itself stays False).
df_dup = df.duplicated()
df_dup

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [72]:
# Drop fully duplicated rows, keeping the first occurrence; the surviving rows
# keep their original index labels.
df2 = df.drop_duplicates()
df2

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [73]:
# Judge duplicates on c2 and c3 only; c1 is ignored, so row 4 (same c2/c3 as
# row 3) is dropped in addition to row 1.
df3 = df.drop_duplicates(subset=['c2','c3'])
df3

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2


## Unit 3.5 Data Feature Engineering

In [94]:
# Load the auto-mpg data; the relative path means the CSV must be in the
# working directory. This rebinds `df` again.
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,1970,USA,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,1970,USA,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,1970,USA,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,1970,USA,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,1970,USA,ford torino


In [95]:
# Conversion factor from miles per gallon to kilometres per litre:
# kilometres in one mile divided by litres in one US gallon.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
mpg_to_kpl = KM_PER_MILE / LITERS_PER_GALLON

In [96]:
# Derive a kilometres-per-litre column by scaling mpg with the conversion factor.
df['kpl'] = df['mpg'].mul(mpg_to_kpl)
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,kpl
0,18.0,8,307.0,130,3504,12.0,1970,USA,chevrolet chevelle malibu,7.652571
1,15.0,8,350.0,165,3693,11.5,1970,USA,buick skylark 320,6.377143
2,18.0,8,318.0,150,3436,11.0,1970,USA,plymouth satellite,7.652571


In [97]:
# Keep two decimal places in the derived column.
df['kpl'] = df['kpl'].round(decimals=2)
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,kpl
0,18.0,8,307.0,130,3504,12.0,1970,USA,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165,3693,11.5,1970,USA,buick skylark 320,6.38
2,18.0,8,318.0,150,3436,11.0,1970,USA,plymouth satellite,7.65


In [98]:
# Inspect the column dtypes before converting any of them.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
year              int64
origin           object
name             object
kpl             float64
dtype: object

In [99]:
# Distinct horsepower values, in order of first appearance.
df['horsepower'].unique()

array([130, 165, 150, 140, 198, 220, 215, 225, 190, 170, 160,  95,  97,
        85,  88,  46,  87,  90, 113, 200, 210, 193, 100, 105, 175, 153,
       180, 110,  72,  86,  70,  76,  65,  69,  60,  80,  54, 208, 155,
       112,  92, 145, 137, 158, 167,  94, 107, 230,  49,  75,  91, 122,
        67,  83,  78,  52,  61,  93, 148, 129,  96,  71,  98, 115,  53,
        81,  79, 120, 152, 102, 108,  68,  58, 149,  89,  63,  48,  66,
       139, 103, 125, 133, 138, 135, 142,  77,  62, 132,  84,  64,  74,
       116,  82])

In [100]:
# Distinct origin labels; only a few values, so a good candidate for a categorical dtype.
df['origin'].unique()

array(['USA', 'Japan', 'Germany'], dtype=object)

In [101]:
# Convert origin from generic object strings to a categorical dtype.
df['origin'] = df['origin'].astype('category')

In [102]:
# Confirm the conversion: the column dtype should now be a CategoricalDtype.
df['origin'].dtype

CategoricalDtype(categories=['Germany', 'Japan', 'USA'], ordered=False)

In [103]:
# The values look the same, but the Series footer now lists the categories.
df['origin'].head()

0    USA
1    USA
2    USA
3    USA
4    USA
Name: origin, dtype: category
Categories (3, object): ['Germany', 'Japan', 'USA']

In [107]:
# Convert the categorical column back to plain strings.
df['origin'] = df['origin'].astype('str')

In [108]:
# Verify the round trip: the dtype reported in the footer is no longer category.
df['origin'].head(3)

0    USA
1    USA
2    USA
Name: origin, dtype: object

In [109]:
# Full dtype listing after converting origin to category and back.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
year              int64
origin           object
name             object
kpl             float64
dtype: object

### Convert continuous variables into categorical discrete variables

In [129]:
# Reload the CSV so this section starts from the untouched data
# (the kpl column and dtype changes made earlier are discarded).
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,1970,USA,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,1970,USA,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,1970,USA,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,1970,USA,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,1970,USA,ford torino


In [132]:
# Simulate dirty data by putting a '?' placeholder into a few horsepower rows.
# The column is integer-typed, so cast it to object first: writing a string into
# an int64 column relies on silent upcasting, which pandas has deprecated.
df['horsepower'] = df['horsepower'].astype(object)
# One .loc call selects rows and column together; the chained form
# df['horsepower'][[...]] = '?' may write to a temporary copy instead of df.
df.loc[[0, 2, 5, 12], 'horsepower'] = '?'
df['horsepower'].head(10)

0      ?
1    165
2      ?
3    150
4    140
5      ?
6    220
7    215
8    225
9    190
Name: horsepower, dtype: object

In [133]:
# The '?' placeholders turned horsepower into an object column.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
year              int64
origin           object
name             object
dtype: object

In [134]:
# Turn the '?' placeholders into NaN and assign the result back to the column.
# replace(..., inplace=True) on df['horsepower'] is chained assignment and does not
# update df under pandas Copy-on-Write. pd.to_numeric makes the conversion to a
# float column explicit instead of relying on implicit downcasting from object.
df['horsepower'] = pd.to_numeric(df['horsepower'].replace('?', np.nan))

In [135]:
# With the placeholders replaced by NaN, horsepower should be numeric (float64).
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
year              int64
origin           object
name             object
dtype: object

In [136]:
# Drop the rows whose horsepower is missing. Rebinding the result instead of using
# inplace=True keeps the cell chainable; rows are the default axis for dropna,
# so no axis argument is needed.
df = df.dropna(subset=['horsepower'])

In [137]:
# Dropping rows does not change any column dtype.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
year              int64
origin           object
name             object
dtype: object

In [138]:
# Explicit cast to float; the dtypes listing above already reports float64,
# so this only guards against the column still being object.
df['horsepower'] = df['horsepower'].astype('float')

In [139]:
# Split the horsepower range into 3 equal-width bins; np.histogram returns the
# per-bin counts and the 4 bin edges.
count, bin_dividers = np.histogram(df['horsepower'], bins=3)

In [140]:
# Show the bin counts together with the edges that pd.cut will use.
count, bin_dividers

(array([257, 100,  31]),
 array([ 46.        , 107.33333333, 168.66666667, 230.        ]))

In [141]:
# One label per bin, ordered from the lowest to the highest horsepower range.
bin_names = ['Low output', 'Normal output', 'High output']

In [143]:
# Map each horsepower value to its bin label. include_lowest=True makes the first
# interval closed on the left so the minimum value is not left unlabelled.
hp_labels = pd.cut(df['horsepower'], bins=bin_dividers, labels=bin_names, include_lowest=True)
df['hp bin'] = hp_labels

In [144]:
# Compare the numeric horsepower with the label it was assigned.
df[['horsepower','hp bin']].head()

Unnamed: 0,horsepower,hp bin
1,165.0,Normal output
3,150.0,Normal output
4,140.0,Normal output
6,220.0,High output
7,215.0,High output
