## Importing pandas

#### 1.import pandas under the alias pd.

In [2]:
import pandas as pd

#### 2. Print the version of pandas that has been imported.

In [7]:
print(pd.__version__)

1.2.4


#### 3. Print out all the version information of the libraries that are required by the pandas library.

In [5]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 2cb96529396d93b46abab7bbc73a208e708c642e
python           : 3.8.8.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 17.7.0
Version          : Darwin Kernel Version 17.7.0: Thu Jun 18 21:21:34 PDT 2020; root:xnu-4570.71.82.5~1/RELEASE_X86_64
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : en_US.UTF-8
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.2.4
numpy            : 1.20.1
pytz             : 2021.1
dateutil         : 2.8.1
pip              : 21.0.1
setuptools       : 52.0.0.post20210125
Cython           : 0.29.23
pytest           : 6.2.3
hypothesis       : None
sphinx           : 4.0.1
blosc            : None
feather          : None
xlsxwriter       : 1.3.8
lxml.etree       : 4.6.3
html5lib         : 1.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.3
IPython          : 7.22.0
pandas_datareader

## DataFrame basics

In [9]:
import numpy as np

#### 4. Create a DataFrame df from this dictionary data which has the index labels.

In [10]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df=pd.DataFrame(data,index=labels)
print(df)

  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


#### 5. Display a summary of the basic information about this DataFrame and its data 

In [11]:
df=pd.DataFrame(data,index=labels)
print(df.describe())

            age     visits
count  8.000000  10.000000
mean   3.437500   1.900000
std    2.007797   0.875595
min    0.500000   1.000000
25%    2.375000   1.000000
50%    3.000000   2.000000
75%    4.625000   2.750000
max    7.000000   3.000000


#### 6.Return the first 3 rows of the DataFrame df.

In [107]:
print(df.head(3))

  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no


#### 7.Select just the 'animal' and 'age' columns from the DataFrame df.

In [15]:
df1=pd.DataFrame(data,index=labels,columns=('animal','age'))
print(df1)

  animal  age
a    cat  2.5
b    cat  3.0
c  snake  0.5
d    dog  NaN
e    dog  5.0
f    cat  2.0
g  snake  4.5
h    cat  NaN
i    dog  7.0
j    dog  3.0


#### 8. Select the data in rows [3, 4, 8] and in columns ['animal', 'age'].

In [106]:
df1=pd.DataFrame(data,index=labels,columns=('animal','age'))
print(df1.loc[['c','d','h']])


  animal  age
c  snake  0.5
d    dog  NaN
h    cat  NaN


#### 9. Select only the rows where the number of visits is greater than 3.

In [37]:
print(df[df['visits'] > 3])

Empty DataFrame
Columns: [animal, age, visits, priority]
Index: []


#### 10.Select the rows where the age is missing, i.e. it is NaN.

In [43]:
print(df[df['age'].isnull()])

  animal  age  visits priority
d    dog  NaN       3      yes
h    cat  NaN       1      yes


#### 11.Select the rows where the animal is a cat and the age is less than 3.

In [50]:
an=['cat']
print(df[(df['age'] < 3) &
          df['animal'].isin(an)])

  animal  age  visits priority
a    cat  2.5       1      yes
f    cat  2.0       3       no


#### 12. Select the rows the age is between 2 and 4 (inclusive).

In [105]:
print(df[(df['age'] >=2) &
          (df['age']<=4)])

  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
j    dog  3.0       1       no


#### 13. Change the age in row 'f' to 1.5.

In [104]:
df.loc['f', 'age'] = 11.5
print(df)

  animal   age  visits priority
a    cat   2.5       1      yes
b    cat   3.0       3      yes
c  snake   0.5       2       no
d    dog   NaN       3      yes
e    dog   5.0       2       no
f    cat  11.5       3       no
g  snake   4.5       1       no
h    cat   NaN       1      yes
i    dog   7.0       2       no
j    dog   3.0       1       no


#### 14. Calculate the sum of all visits in df (i.e. find the total number of visits).

In [103]:
df=pd.DataFrame(data,index=labels)
print(df['visits'].sum())    

19


#### 15. Calculate the mean age for each different animal in df.

In [102]:
animal_groups = df.groupby('animal')
print(animal_groups['age'].mean())

animal
cat      2.5
dog      5.0
snake    2.5
Name: age, dtype: float64


#### 16. Append a new row 'k' to df with your choice of values for each column. Then delete that row to return the original DataFrame.

In [101]:
df.loc['k'] = [ 'pig', 2,3,'no']
df=df.drop('k')
print(df)


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


#### 17. Count the number of each type of animal in df.

In [100]:
print(df['animal'].value_counts())

cat      4
dog      4
snake    2
Name: animal, dtype: int64


#### 18. Sort df first by the values in the 'age' in decending order, then by the value in the 'visits' column in ascending order (so row i should be first, and row d should be last).

In [99]:
df=pd.DataFrame(data,index=labels)
print(df.sort_values(by=['age', 'visits'], ascending=[False, True]))


  animal  age  visits priority
i    dog  7.0       2       no
e    dog  5.0       2       no
g  snake  4.5       1       no
j    dog  3.0       1       no
b    cat  3.0       3      yes
a    cat  2.5       1      yes
f    cat  2.0       3       no
c  snake  0.5       2       no
h    cat  NaN       1      yes
d    dog  NaN       3      yes


#### 19. The 'priority' column contains the values 'yes' and 'no'. Replace this column with a column of boolean values: 'yes' should be True and 'no' should be False.

In [98]:
df['priority'] = df['priority'].map({'yes': True, 'no': False})
print(df)

   animal  age  visits  priority
a     cat  2.5       1      True
b     cat  3.0       3      True
c  python  0.5       2     False
d     dog  NaN       3      True
e     dog  5.0       2     False
f     cat  2.0       3     False
g  python  4.5       1     False
h     cat  NaN       1      True
i     dog  7.0       2     False
j     dog  3.0       1     False


#### 20. In the 'animal' column, change the 'snake' entries to 'python'.

In [97]:
df['animal'] = df['animal'].replace('snake', 'python')
print(df)

   animal  age  visits priority
a     cat  2.5       1      yes
b     cat  3.0       3      yes
c  python  0.5       2       no
d     dog  NaN       3      yes
e     dog  5.0       2       no
f     cat  2.0       3       no
g  python  4.5       1       no
h     cat  NaN       1      yes
i     dog  7.0       2       no
j     dog  3.0       1       no


#### 21. For each animal type and each number of visits, find the mean age. In other words, each row is an animal, each column is a number of visits and the values are the mean ages

In [111]:
print(df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean'))

visits    1    2     3
animal                
cat     2.5  NaN  7.25
dog     3.0  6.0   NaN
snake   4.5  0.5   NaN


####  22 .You have a DataFrame df with a column 'A' of integers. For example:

df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
How do you filter out rows which contain the same integer as the row immediately above?

You should be left with a column containing the following values:

1, 2, 3, 4, 5, 6, 7

In [113]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
print(df.loc[df['A'].shift() != df['A']])

   A
0  1
1  2
3  3
4  4
5  5
8  6
9  7


#### 23. Given a DataFrame of numeric values, say

df = pd.DataFrame(np.random.random(size=(5, 3))) # a 5x3 frame of float values
how do you subtract the row mean from each element in the row?

In [115]:
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df.sub(df.mean(axis=1), axis=0))

          0         1         2
0  0.232254 -0.412497  0.180242
1 -0.151250  0.524012 -0.372762
2 -0.077422 -0.000668  0.078090
3  0.345897 -0.038030 -0.307867
4  0.106828 -0.052284 -0.054544


#### 24. Suppose you have DataFrame with 10 columns of real numbers, for example:

df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list('abcdefghij'))
Which column of numbers has the smallest sum? Return that column's label.

In [116]:
df = pd.DataFrame(np.random.random(size=(5, 10)))
print(df.sum().idxmin())

3
