In [1]:
import pandas as pd
import numpy as np

### This notebooks contains introduction to Series and DataFrame

## Series

- Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, python objects, etc.)
- The axis labels are collectively called index.

A series can be created using various inputs like −
- Array
- Dict
- Scalar value or constant

pandas.Series( data, index, dtype, copy)

#### Creating Empty Series

In [8]:
s = pd.Series()
print(s)

Series([], dtype: float64)


  s = pd.Series()


#### Creating Series using ndarray

In [15]:
data = np.array(['a', 'b', 'c', 'd'])
series = pd.Series(data)
print(series)

0    a
1    b
2    c
3    d
dtype: object


#### Customized Index

In [22]:
data = np.array([1,2,3,4])
series = pd.Series(data, index = ['apple','ball','cat','dog'])
print(series)

apple    1
ball     2
cat      3
dog      4
dtype: int32


In [26]:
series = pd.Series(data = [4,3,2,1], index = ['apple','ball','cat','dog'])
print(series)

apple    4
ball     3
cat      2
dog      1
dtype: int64


#### Create a Series from dict

In [30]:
data = {'a':1, 'b':2, 'c':3}
series = pd.Series(data)
print(series)

a    1
b    2
c    3
dtype: int64


In [32]:
data = {'a':1, 'b':2, 'c':3}
series = pd.Series(data, index = ['b', 'd', 'a', 'c'])
print(series)

b    2.0
d    NaN
a    1.0
c    3.0
dtype: float64


#### Create a Series from Scalar

In [38]:
series = pd.Series(data = 'apple', index = [1,2,3,4])
series

1    apple
2    apple
3    apple
4    apple
dtype: object

#### Accessing Data from Series with Position

In [56]:
series = pd.Series(data = ['apple', 'ball', 'cat', 'dog'])
series[:3]

0    apple
1     ball
2      cat
dtype: object

In [57]:
series[-2:]

2    cat
3    dog
dtype: object

#### Retrieve Data Using Label (Index)

In [58]:
s = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])

In [65]:
s['d']

4

In [66]:
s[:'c']

a    1
b    2
c    3
dtype: int64

In [67]:
s['c':]

c    3
d    4
e    5
dtype: int64

In [62]:
s[['a', 'c', 'e']]     # retrieving multiple elements  

a    1
c    3
e    5
dtype: int64

In [63]:
s['a':'d']

a    1
b    2
c    3
d    4
dtype: int64

## DataFrame

- A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns

Features of DataFrame:
- Potentially columns are of different types
- Size – Mutable
- Labeled axes (rows and columns)
- Can Perform Arithmetic operations on rows and columns

pandas.DataFrame(data, index, columns, dtype, copy)

A pandas DataFrame can be created using various inputs like:

- Lists
- dictionaries
- Series
- Numpy ndarrays
- Another DataFrame

#### Creating empty dataframe

In [73]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


#### Create a DataFrame from Lists

In [75]:
data = [1,2,3,4,5]
df = pd.DataFrame(data)
print(df)             # creating dataframe using single list

   0
0  1
1  2
2  3
3  4
4  5


In [78]:
# name, age
data = [['prajwal', 22], ['nikhil', 23], ['samiksh', 24]]
df = pd.DataFrame(data = data, columns = ['Name', 'Age'])
print(df)  # creating dataframe using multiple lists

      Name  Age
0  prajwal   22
1   nikhil   23
2  samiksh   24


In [94]:
# name, age
data = [['prajwal', 22], ['nikhil', 23], ['samiksh', 24]]
df = pd.DataFrame(data = data, columns = ['Name', 'Age'], dtype = int)
print(df)

      Name   Age
0  prajwal  22.2
1   nikhil  23.0
2  samiksh  24.0


In [96]:
# name, age
data = [['prajwal', 22], ['nikhil', 23], ['samiksh', 24]]
df = pd.DataFrame(data = data, index = ['Basketball', 'Bikes', 'Travel'], columns = ['Name', 'Age'])
print(df)  # creating dataframe using multiple lists

               Name  Age
Basketball  prajwal   22
Bikes        nikhil   23
Travel      samiksh   24


#### Create a DataFrame from Dict of ndarrays / Lists

In [105]:
data = {'Name': ['prajwal', 'nikhil', 'samiksh'], 'Age': [22,23,24]}
df = pd.DataFrame(data = data, columns = ['Age', 'Name'])
print(df)

   Age     Name
0   22  prajwal
1   23   nikhil
2   24  samiksh


In [106]:
data = {'Name': ['prajwal', 'nikhil', 'samiksh'], 'Age': [22,23,24]}
df = pd.DataFrame(data = data, columns = ['Age', 'Name'], index = ['rank1', 'rank2', 'rank3'])
print(df)

       Age     Name
rank1   22  prajwal
rank2   23   nikhil
rank3   24  samiksh


#### Create a DataFrame from List of Dicts

In [108]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [109]:
data = [{'Name': 'prajwal', 'Age': 22}, {'Name': 'nikhil', 'Age': 23, 'Hobby': 'Bikes'}]
df = pd.DataFrame(data)
print(df)

      Name  Age  Hobby
0  prajwal   22    NaN
1   nikhil   23  Bikes


In [183]:
data = [{'Name': 'prajwal', 'Age': 22}, {'Name': 'nikhil', 'Age': 23, 'Hobby': 'Bikes'}]
df = pd.DataFrame(data, index = ['first', 'second'])
print(df)

           Name  Age  Hobby
first   prajwal   22    NaN
second   nikhil   23  Bikes


In [119]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]

#With two column indices, values same as dictionary keys
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b1'])

#With two column indices with one index with other name
df2 = pd.DataFrame(data, index=['first', 'second'])
print(df1,'\n')
print(df2)

        a  b1
first   1 NaN
second  5 NaN 

        a   b     c
first   1   2   NaN
second  5  10  20.0


#### Create a DataFrame from Dict of Series

In [122]:
ds = {'Name': pd.Series(['prajwal', 'nikhil', 'samiksh']),
     'Age': pd.Series([22,23,24,25])}
df = pd.DataFrame(data = ds)
print(df)

      Name  Age
0  prajwal   22
1   nikhil   23
2  samiksh   24
3      NaN   25


#### DataFrame Column Selection, Addition, Deletion

#### Column Selection

In [127]:
ds = {'one': pd.Series(['apple', 'bat', 'cat', 'dog']),
     'two': pd.Series([100, 200, 300, 400, 500])}
df = pd.DataFrame(ds)
print(df)

     one  two
0  apple  100
1    bat  200
2    cat  300
3    dog  400
4    NaN  500


In [128]:
df['one']

0    apple
1      bat
2      cat
3      dog
4      NaN
Name: one, dtype: object

In [129]:
df.two

0    100
1    200
2    300
3    400
4    500
Name: two, dtype: int64

#### Column Addition

In [190]:
ds = {'one': pd.Series(['apple', 'bat', 'cat', 'dog']),
     'two': pd.Series([100, 200, 300, 400, 500])}
df = pd.DataFrame(ds)
print(df)

     one  two
0  apple  100
1    bat  200
2    cat  300
3    dog  400
4    NaN  500


In [144]:
# adding a new column to the existing dataframe
df['three'] = pd.Series([1,2,3,4])
print(df)

     one  two  three
0  apple  100    1.0
1    bat  200    2.0
2    cat  300    3.0
3    dog  400    4.0
4    NaN  500    NaN


In [145]:
# adding a new column to the dataframe using existing columns
df['four'] = df['two'] + df['three']
print(df)

     one  two  three   four
0  apple  100    1.0  101.0
1    bat  200    2.0  202.0
2    cat  300    3.0  303.0
3    dog  400    4.0  404.0
4    NaN  500    NaN    NaN


#### Column Deletion

In [146]:
print(df)

     one  two  three   four
0  apple  100    1.0  101.0
1    bat  200    2.0  202.0
2    cat  300    3.0  303.0
3    dog  400    4.0  404.0
4    NaN  500    NaN    NaN


In [148]:
# Deleting column using 'del' function
del df['four']
print(df)

     one  two  three
0  apple  100    1.0
1    bat  200    2.0
2    cat  300    3.0
3    dog  400    4.0
4    NaN  500    NaN


In [149]:
# Deleting column using 'pop' function
df.pop('three')
print(df)

     one  two
0  apple  100
1    bat  200
2    cat  300
3    dog  400
4    NaN  500


#### Row Selection, Addition, and Deletion

#### Row Selection

1. Selection by label:
Rows can be selected by passing rows to 'loc' function

In [200]:
df_loc = pd.DataFrame([['prajwal', 21], ['nikhil', 22], ['samiksh', 23]], columns = ['Name', 'Age'], 
                      index = ['basketball', 'bikes', 'travel'])
print(df_loc)

               Name  Age
basketball  prajwal   21
bikes        nikhil   22
travel      samiksh   23


In [203]:
df_loc.loc['basketball']

Name    prajwal
Age          21
Name: basketball, dtype: object

In [207]:
df_loc.loc[['bikes', 'travel']]

Unnamed: 0,Name,Age
bikes,nikhil,22
travel,samiksh,23


2. Selection by integer location: 
Rows can be selected by passing integer location to an 'iloc' function

In [191]:
ds = {'one': pd.Series(['apple', 'bat', 'cat', 'dog']),
     'two': pd.Series([100, 200, 300, 400, 500])}
df = pd.DataFrame(ds)
print(df)

     one  two
0  apple  100
1    bat  200
2    cat  300
3    dog  400
4    NaN  500


In [180]:
df.iloc[1]

one    bat
two    200
Name: 1, dtype: object

In [192]:
df.iloc[0:2]

Unnamed: 0,one,two
0,apple,100
1,bat,200


In [193]:
df.iloc[2:]

Unnamed: 0,one,two
2,cat,300
3,dog,400
4,,500


In [198]:
df.iloc[[1,3,0]]

Unnamed: 0,one,two
1,bat,200
3,dog,400
0,apple,100


#### Addition of Rows

In [215]:
df1 = pd.DataFrame([['prajwal', 21], ['nikhil', 22]], columns = ['Name', 'Age'])
print(df1)

      Name  Age
0  prajwal   21
1   nikhil   22


In [4]:
df2 = pd.DataFrame({'Name': 'samiksh'})
print(df2)

ValueError: If using all scalar values, you must pass an index

In [218]:
# adding df2 to df1 using 'append' function
df1 = df1.append(df2)
print(df1)

      Name  Age
0  prajwal   21
1   nikhil   22
0  samiksh   23


#### Deletion of Rows

In [240]:
df4 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df5 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

df4 = df4.append(df5)
print(df4)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8


Use index label to delete or drop rows from a DataFrame. If label is duplicated, then multiple rows will be dropped.

If you observe, in the above example, the labels are duplicate. Let us drop a label and will see how many rows will get dropped.

In [241]:
# deleting the rows using 'drop' function
df4 = df4.drop(0)
print(df4)

   a  b
1  3  4
1  7  8
