# Unit 3

## Unit 3.3 Checking for missing data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df['deck'].value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [4]:
df['deck'].value_counts(dropna=False)

deck
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: count, dtype: int64

In [5]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [6]:
df.tail().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
890,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [7]:
df.tail().notnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
887,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
888,True,True,True,False,True,True,True,True,True,True,True,False,True,True,True
889,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
890,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True


In [8]:
df.tail().isnull().sum(axis='index')

survived       0
pclass         0
sex            0
age            1
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           3
embark_town    0
alive          0
alone          0
dtype: int64

In [9]:
df.tail().isnull().sum(axis='columns')

886    1
887    0
888    2
889    0
890    1
dtype: int64

In [10]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [11]:
# borrar todas las columnas con menos de 700 non-NA values
# mirar ayuda -> Require that many non-NA values.
df_thresh = df.dropna(axis=1, thresh=700)
df_thresh.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [12]:
# limitar al subcjt 'age'
df_age = df.dropna(subset=['age'], how='any', axis='index')
df_age.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [13]:
df_age.shape

(714, 15)

In [14]:
len(df_age)

714

In [15]:
# replace missing values with mean:
mean_age = df['age'].mean(axis=0)
mean_age

29.69911764705882

In [16]:
df['age'].fillna(mean_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(mean_age, inplace=True)


In [17]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,29.699118,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### Replacing missing data with the mode

In [18]:
df['embark_town'][825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object

In [19]:
most_freq = df['embark_town'].value_counts(dropna=True).idxmax()
most_freq

'Southampton'

In [20]:
df['embark_town'].fillna(most_freq, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embark_town'].fillna(most_freq, inplace=True)


In [21]:
df['embark_town'][825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object

## Unit 3.4 Checking and processing duplicate data

In [22]:
df = pd.DataFrame({'c1':['a','a','b','a','b'],
                   'c2':[1,1,1,2,2],
                   'c3':[1,1,2,2,2]})
df

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [23]:
df_dup = df.duplicated()
df_dup

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [24]:
df2 = df.drop_duplicates()
df2

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [25]:
df3 = df.drop_duplicates(subset=['c2','c3'])
df3

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2


## Unit 3.5 Data Feature Engineering

In [39]:
#separar un string por un caracter concreto
"23-8-78".split('-')

['23', '8', '78']

In [41]:
#limpiar una cadena de texto de espacios en blanco a los lados
"   23-8-78   ".strip()

'23-8-78'

In [42]:
#Crear un dataframe de fechas
# Crear un rango de fechas aleatorias
start_date = '2022-01-01'
end_date = '2022-01-05'
num_dates = 5

random_dates = pd.date_range(start=start_date, end=end_date, periods=num_dates)

# Crear el DataFrame
df_dates = pd.DataFrame({'Fecha': random_dates})

# Imprimir el DataFrame
print(df_dates)


       Fecha
0 2022-01-01
1 2022-01-02
2 2022-01-03
3 2022-01-04
4 2022-01-05


In [53]:
def extraer_dia(x):
    return x.split('-')[0]

def extraer_mes(x):
    return x.split('-')[1]

def extraer_año(x):
    return x.split('-')[2]


In [50]:
df_dates['Fecha'] = df_dates['Fecha'].dt.strftime('%Y-%m-%d')


In [54]:

df_dates['dia'] = df_dates['Fecha'].apply(extraer_dia)
df_dates['mes'] = df_dates['Fecha'].apply(extraer_mes)
df_dates['año'] = df_dates['Fecha'].apply(extraer_año)

In [58]:
df_dates.fecha.dt.day_name()

AttributeError: 'DataFrame' object has no attribute 'fecha'

In [59]:
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [29]:
mpg_to_kpl = 1.60934 / 3.78541

In [30]:
df['kpl'] = df['mpg'] * mpg_to_kpl
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,kpl
0,18.0,8,307.0,130.0,3504,12.0,70,7.652571
1,15.0,8,350.0,165.0,3693,11.5,70,6.377143
2,18.0,8,318.0,150.0,3436,11.0,70,7.652571


In [35]:
df['kpl'] = df['kpl'].round(2)
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,kpl
0,18.0,8,307.0,130.0,3504,12.0,70,7.65
1,15.0,8,350.0,165.0,3693,11.5,70,6.38
2,18.0,8,318.0,150.0,3436,11.0,70,7.65


In [36]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model-year        int64
kpl             float64
dtype: object

In [37]:
df['horsepower'].unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [38]:
df['origin'].unique()

KeyError: 'origin'

In [101]:
df['origin'] = df['origin'].astype('category')

In [102]:
df['origin'].dtypes

CategoricalDtype(categories=['Germany', 'Japan', 'USA'], ordered=False)

In [103]:
df['origin'].head()

0    USA
1    USA
2    USA
3    USA
4    USA
Name: origin, dtype: category
Categories (3, object): ['Germany', 'Japan', 'USA']

In [107]:
df['origin'] = df['origin'].astype('str')

In [108]:
df['origin'].head(3)

0    USA
1    USA
2    USA
Name: origin, dtype: object

In [109]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
year              int64
origin           object
name             object
kpl             float64
dtype: object

### Convert continuous variables into categorical discrete variables

In [60]:
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [61]:
df['horsepower'][[0,2,5,12]] = '?'
df['horsepower'].head(10)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['horsepower'][[0,2,5,12]] = '?'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['horsepower'][[0,2,5,12]]

0        ?
1    165.0
2        ?
3    150.0
4    140.0
5        ?
6    220.0
7    215.0
8    225.0
9    190.0
Name: horsepower, dtype: object

In [62]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model-year        int64
dtype: object

In [63]:
df['horsepower'].replace('?', np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].replace('?', np.nan, inplace=True)
  df['horsepower'].replace('?', np.nan, inplace=True)


In [64]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model-year        int64
dtype: object

In [65]:
df.dropna(subset=['horsepower'], axis=0, inplace=True)

In [66]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model-year        int64
dtype: object

In [67]:
df['horsepower'] = df['horsepower'].astype('float')

In [68]:
count, bin_dividers = np.histogram(df['horsepower'], bins=3)

In [69]:
count, bin_dividers

(array([261, 100,  31]),
 array([ 46.        , 107.33333333, 168.66666667, 230.        ]))

In [70]:
bin_names = ['Low output', 'Normal output', 'High output']

In [71]:
df['hp bin'] = pd.cut(x=df['horsepower'],
                      bins=bin_dividers,
                      labels=bin_names,
                      include_lowest=True
                     )

In [79]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,hp bin
1,15.0,8,350.0,165.0,3693,11.5,70,Normal output
3,16.0,8,304.0,150.0,3433,12.0,70,Normal output
4,17.0,8,302.0,140.0,3449,10.5,70,Normal output
6,14.0,8,454.0,220.0,4354,9.0,70,High output
7,14.0,8,440.0,215.0,4312,8.5,70,High output


In [73]:
df['hp bin']

1      Normal output
3      Normal output
4      Normal output
6        High output
7        High output
           ...      
393       Low output
394       Low output
395       Low output
396       Low output
397       Low output
Name: hp bin, Length: 392, dtype: category
Categories (3, object): ['Low output' < 'Normal output' < 'High output']

In [78]:
horsepower_dummies = pd.get_dummies(df['hp bin'])
horsepower_dummies

Unnamed: 0,Low output,Normal output,High output
1,False,True,False
3,False,True,False
4,False,True,False
6,False,False,True
7,False,False,True
...,...,...,...
393,True,False,False
394,True,False,False
395,True,False,False
396,True,False,False


In [80]:
#normalización de valores
df['horsepower_norm']= df['horsepower']/abs(df['horsepower'].max())

In [81]:
df['horsepower_norm']


1      0.717391
3      0.652174
4      0.608696
6      0.956522
7      0.934783
         ...   
393    0.373913
394    0.226087
395    0.365217
396    0.343478
397    0.356522
Name: horsepower_norm, Length: 392, dtype: float64

In [83]:
min_x = df.horsepower - df.horsepower.min()
min_max = df.horsepower.max() - df.horsepower.min()
df.horsepower = min_x / min_max

print(df.horsepower.head())
print('\n')
print(df.horsepower.describe())

1   NaN
3   NaN
4   NaN
6   NaN
7   NaN
Name: horsepower, dtype: float64


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: horsepower, dtype: float64


  min_max = df.horsepower.max() - df.horsepower.min()
