# Basic Dataframes Functionalities

In [1]:
import pandas as pd
%matplotlib notebook

# sample IRIS datasets
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')

#  Dataframe info

In [2]:
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
iris.tail(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


<div class="alert alert-block alert-info">
<b>Tip:</b> Interesting alternative to get a subset of a df is the function sample. Works both with the number rows and the fraction of rows to display.
</div>

In [4]:
iris.sample(frac=.1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
87,6.3,2.3,4.4,1.3,versicolor
5,5.4,3.9,1.7,0.4,setosa
105,7.6,3.0,6.6,2.1,virginica
23,5.1,3.3,1.7,0.5,setosa
71,6.1,2.8,4.0,1.3,versicolor
15,5.7,4.4,1.5,0.4,setosa
13,4.3,3.0,1.1,0.1,setosa
122,7.7,2.8,6.7,2.0,virginica
137,6.4,3.1,5.5,1.8,virginica
2,4.7,3.2,1.3,0.2,setosa


In [5]:
iris.info()
iris.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
# checks the memory usage of each columns
iris.memory_usage()

Index            128
sepal_length    1200
sepal_width     1200
petal_length    1200
petal_width     1200
species         1200
dtype: int64

In [7]:
# .count will count the number of non NA objects
iris.count()

# as a percentage
iris.count()/len(iris)

sepal_length    1.0
sepal_width     1.0
petal_length    1.0
petal_width     1.0
species         1.0
dtype: float64

In [8]:
# value_counts returns the number of rows for each unique value
iris['species'].value_counts()

virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64

# Modify Dataframe

In [9]:
# add new col
iris['petal_length_sq'] = iris.petal_length**2
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_length_sq
0,5.1,3.5,1.4,0.2,setosa,1.96
1,4.9,3.0,1.4,0.2,setosa,1.96
2,4.7,3.2,1.3,0.2,setosa,1.69
3,4.6,3.1,1.5,0.2,setosa,2.25
4,5.0,3.6,1.4,0.2,setosa,1.96


In [10]:
# drop col
iris.drop(['petal_length_sq'], axis=1, inplace=True)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# GroupBy

In [11]:
iris.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


# Visualizations

In [14]:
pd.plotting.scatter_matrix(iris, figsize=(10,5))

<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f69d7ec3e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d61a0bd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d6158ed0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d6111950>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f69d60cecd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d6085990>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d6045e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d5ffd9d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f69d6007550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d5fb9ed0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d5f2ed90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f69d5ee6a50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f69d5ea5dd0>,
        <matplotlib.axes._subplots.