In [1]:
import seaborn as sns

In [2]:
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df["deck"].head()

0    NaN
1      C
2    NaN
3      C
4    NaN
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [7]:
df["deck"].value_counts(dropna=False)

deck
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: count, dtype: int64

In [8]:
df.head().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [9]:
df.head().notnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True


In [12]:
df.head().isnull().sum(axis=0)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           3
embark_town    0
alive          0
alone          0
dtype: int64

In [13]:
# Drop columns with missing values with 500 or mor NaN values
df.dropna(axis=1, thresh=500).columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [16]:
#count of lines in the dataset
len(df)


891

In [17]:
#Deletes all rows without age data
df.dropna(subset=["age"], how="any", axis=0)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [20]:
#calculates mean of age value avoiding NaN values
mean_age = df.age.mean(axis=0, skipna=True)
mean_age

29.69911764705882

In [21]:
#Replace NAN with the mean of age values
df["age"].fillna(mean_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(mean_age, inplace=True)


In [22]:
df.age

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [33]:
#subset of rows using slice

df["embark_town"][825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object

In [24]:
#The NaN value of the embark_town column is replaced with the value that appears the most among the
#boarding cities.
most_freq = df["embark_town"].value_counts().idxmax()
most_freq

'Southampton'

In [25]:
df["embark_town"].fillna(most_freq)[825:830]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object

In [27]:
df["embark_town"].fillna(most_freq, inplace=True)
df[825:830]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
825,0,3,male,29.699118,0,0,6.95,Q,Third,man,True,,Queenstown,no,True
826,0,3,male,29.699118,0,0,56.4958,S,Third,man,True,,Southampton,no,True
827,1,2,male,1.0,0,2,37.0042,C,Second,child,False,,Cherbourg,yes,False
828,1,3,male,29.699118,0,0,7.75,Q,Third,man,True,,Queenstown,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,Southampton,yes,True


In [28]:
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [29]:
df["embark_town"].fillna(method="bfill")[825:831]

  df["embark_town"].fillna(method="bfill")[825:831]


825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829      Cherbourg
830      Cherbourg
Name: embark_town, dtype: object

In [30]:
import pandas as pd
df = pd.DataFrame({"c1": ["a","a", "b", "a", "b"], 
                   "c2": [1,1,1,2,2],
                   "c3": [1,1,2,2,2]})

df

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [31]:
df.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [32]:
df.c2.duplicated()

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool

In [33]:
df.drop_duplicates()

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [34]:
df.drop_duplicates(subset=["c2", "c3"])

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2


In [35]:
df = pd.read_csv("auto-mpg.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: 'auto-mpg.csv'

In [None]:
mpg_kpl = 1.60934 / 3.78541

In [None]:
df["kpl"] = df["mpg"] * mpg_kpl
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,kpl
0,18.0,8,307.0,130.0,3504,12.0,70,7.652571
1,15.0,8,350.0,165.0,3693,11.5,70,6.377143
2,18.0,8,318.0,150.0,3436,11.0,70,7.652571
3,16.0,8,304.0,150.0,3433,12.0,70,6.802286
4,17.0,8,302.0,140.0,3449,10.5,70,7.227428
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,11.478857
394,44.0,4,97.0,52.0,2130,24.6,82,18.706285
395,32.0,4,135.0,84.0,2295,11.6,82,13.604571
396,28.0,4,120.0,79.0,2625,18.6,82,11.904000


In [87]:
df["kpl"] = df["kpl"].round(2)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,kpl
0,18.0,8,307.0,130.0,3504,12.0,70,7.65
1,15.0,8,350.0,165.0,3693,11.5,70,6.38
2,18.0,8,318.0,150.0,3436,11.0,70,7.65
3,16.0,8,304.0,150.0,3433,12.0,70,6.80
4,17.0,8,302.0,140.0,3449,10.5,70,7.23
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,11.48
394,44.0,4,97.0,52.0,2130,24.6,82,18.71
395,32.0,4,135.0,84.0,2295,11.6,82,13.60
396,28.0,4,120.0,79.0,2625,18.6,82,11.90


In [88]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model-year        int64
kpl             float64
dtype: object

In [89]:
df["horsepower"].unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [90]:
df["model-year"].sample(3)

41     71
225    77
200    76
Name: model-year, dtype: int64

In [92]:
df["model-year"] = df["model-year"].astype("category")
df["model-year"]

0      70
1      70
2      70
3      70
4      70
       ..
393    82
394    82
395    82
396    82
397    82
Name: model-year, Length: 398, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]

In [93]:
import numpy as np
df["horsepower"] = df["horsepower"].replace("?", np.nan)

In [94]:
len(df)

398

In [98]:
df = df.dropna(subset=["horsepower"], axis=0)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year,kpl
0,18.0,8,307.0,130.0,3504,12.0,70,7.65
1,15.0,8,350.0,165.0,3693,11.5,70,6.38
2,18.0,8,318.0,150.0,3436,11.0,70,7.65
3,16.0,8,304.0,150.0,3433,12.0,70,6.80
4,17.0,8,302.0,140.0,3449,10.5,70,7.23
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,11.48
394,44.0,4,97.0,52.0,2130,24.6,82,18.71
395,32.0,4,135.0,84.0,2295,11.6,82,13.60
396,28.0,4,120.0,79.0,2625,18.6,82,11.90


In [99]:
df.horsepower.dtype

dtype('float64')

In [100]:
count, bin_dividers = np.histogram(df.horsepower, bins=3)
print(count)
print(bin_dividers)

[261 103  32]
[ 46.         107.33333333 168.66666667 230.        ]


In [101]:
bin_names = ["lower output", "Normal output", "High output"]
df["hp_bin"] = pd.cut(x=df.horsepower,
                     bins=bin_dividers,
                     labels=bin_names,
                     include_lowest=True)
df[["hp_bin", "horsepower"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["hp_bin"] = pd.cut(x=df.horsepower,


Unnamed: 0,hp_bin,horsepower
0,Normal output,130.0
1,Normal output,165.0
2,Normal output,150.0
3,Normal output,150.0
4,Normal output,140.0
...,...,...
393,lower output,86.0
394,lower output,52.0
395,lower output,84.0
396,lower output,79.0
