## Pandas Introduction for Data Science

In [2]:
import numpy as np
import pandas as pd

### Series

In [3]:
s = ([1, np.nan, ' Pandas Library '])
s1 = pd.Series(s)
s1

0                   1
1                 NaN
2     Pandas Library 
dtype: object

In [4]:
s2 = np.array([2, np.nan, 'b'])
s2 = pd.Series(s2)
s2

0      2
1    nan
2      b
dtype: object

In [5]:
list1 = ['animal', '2', 'animal']
s3 = pd.Series(list1)
s3

0    animal
1         2
2    animal
dtype: object

In [6]:
list2 = [3, 'c', 'Numpy']
s4 = pd.Series(list2)
s4

0        3
1        c
2    Numpy
dtype: object

In [7]:
s5 = pd.Series({'A': 1, '3': 'Python'})
s5

A         1
3    Python
dtype: object

In [8]:
s6 = pd.Series({'Integer': 3, 'B': 'Boys'})
s6

Integer       3
B          Boys
dtype: object

### DataFrames

In [9]:
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,2.062105,-2.159547,1.011744,0.044253
1,-0.164987,0.957709,0.255175,0.495854
2,-0.087643,0.620048,-0.237486,-0.908896
3,0.922198,-1.029264,-1.022087,1.525837
4,1.341478,-0.59993,0.421164,1.114126
5,0.270911,0.962518,-0.004818,1.207774


In [10]:
df1 = pd.DataFrame(np.random.randn(3, 3))
df1

Unnamed: 0,0,1,2
0,2.117409,-2.125425,-1.714879
1,-0.528181,1.08992,-0.848955
2,-0.101371,-0.053897,-0.025465


In [11]:
df2 = pd.DataFrame({
    'A': 1,
    'number': np.array([6] * 3, dtype='int32')
})
df2

Unnamed: 0,A,number
0,1,6
1,1,6
2,1,6


In [12]:
df3 = pd.DataFrame({
    'E': np.array([4] * 5, dtype='int32'),
    'Day': 2
})
df3

Unnamed: 0,E,Day
0,4,2
1,4,2
2,4,2
3,4,2
4,4,2


### Files

In [14]:
df4 = pd.read_csv('practice_data.csv')
df4.head(3)

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132


In [18]:
df5 = pd.read_json('practice_data.json')
df5.head(3)

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132


In [17]:
df6 = pd.read_csv('https://raw.githubusercontent.com/VinitaSilaparasetty/Coursera-Pandas-for-Beginners/master/practicedata.csv')
df6.head(3)

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132


In [19]:
df7 = pd.read_json('https://raw.githubusercontent.com/VinitaSilaparasetty/Coursera-Pandas-for-Beginners/master/practicedata.json')
df7.head(3)

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132


In [20]:
df4.to_csv('practice_data_2.csv')

In [22]:
df4.to_json('practice_data_2.json')

### Basic Cleaning

In [23]:
df6

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132
3,0.08105,0.639851,0.844037,1.463154
4,-1.013616,-0.224553,1.786915,1.041241
5,-1.013616,-0.224553,1.786915,1.041241
6,-0.488965,0.946528,0.829525,-0.529912
7,-0.488965,0.946528,0.829525,-0.529912


In [24]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
A    7 non-null float64
B    8 non-null float64
C    8 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 384.0 bytes


In [25]:
df6.describe()

Unnamed: 0,A,B,C,D
count,7.0,8.0,8.0,7.0
mean,-0.41265,0.33136,0.619777,-0.021321
std,0.533101,1.010369,0.98705,1.192849
min,-1.013616,-1.604969,-1.174912,-1.632132
25%,-0.751291,-0.224553,0.095289,-0.766418
50%,-0.488965,0.549208,0.829525,-0.529912
75%,-0.161808,0.946528,1.079757,1.041241
max,0.440231,1.71348,1.786915,1.463154


In [26]:
df6.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [27]:
df7.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [28]:
df6.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [29]:
df7.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [31]:
df6.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [33]:
df6.drop_duplicates()

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132
3,0.08105,0.639851,0.844037,1.463154
4,-1.013616,-0.224553,1.786915,1.041241
6,-0.488965,0.946528,0.829525,-0.529912


In [34]:
df7.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [35]:
df7.drop_duplicates()

Unnamed: 0,A,B,C,D
0,,-1.604969,-0.106263,-1.002924
1,-0.404667,0.458565,-1.174912,
2,0.440231,1.71348,0.162473,-1.632132
3,0.08105,0.639851,0.844037,1.463154
4,-1.013616,-0.224553,1.786915,1.041241
6,-0.488965,0.946528,0.829525,-0.529912


In [37]:
df6.isnull().sum()

A    1
B    0
C    0
D    1
dtype: int64

In [38]:
df6.dropna()

Unnamed: 0,A,B,C,D
2,0.440231,1.71348,0.162473,-1.632132
3,0.08105,0.639851,0.844037,1.463154
4,-1.013616,-0.224553,1.786915,1.041241
5,-1.013616,-0.224553,1.786915,1.041241
6,-0.488965,0.946528,0.829525,-0.529912
7,-0.488965,0.946528,0.829525,-0.529912


In [39]:
df7.isnull().sum()

A    1
B    0
C    0
D    1
dtype: int64

In [40]:
df7.dropna()

Unnamed: 0,A,B,C,D
2,0.440231,1.71348,0.162473,-1.632132
3,0.08105,0.639851,0.844037,1.463154
4,-1.013616,-0.224553,1.786915,1.041241
5,-1.013616,-0.224553,1.786915,1.041241
6,-0.488965,0.946528,0.829525,-0.529912
7,-0.488965,0.946528,0.829525,-0.529912


### Numeric Operations

In [41]:
df8 = pd.DataFrame(np.random.randn(6, 4))
df8

Unnamed: 0,0,1,2,3
0,-0.7738,1.961943,1.228608,-0.332881
1,0.541899,-0.598531,-0.013082,-0.432957
2,-0.46933,1.108538,1.172529,-0.168579
3,-0.58921,-1.943765,1.201361,-0.841672
4,0.059273,-0.558555,0.599886,-1.463366
5,0.645517,1.404961,0.669673,1.805743


In [42]:
df8.mean()

0   -0.097608
1    0.229098
2    0.809829
3   -0.238952
dtype: float64

In [43]:
df9 = pd.DataFrame(np.random.randn(3, 4))
df9

Unnamed: 0,0,1,2,3
0,-0.896471,-1.12733,-1.38572,1.182106
1,0.035699,-0.085223,0.247028,-0.50984
2,-1.2034,0.509247,0.334236,-0.412726


In [44]:
df9.mean()

0   -0.688057
1   -0.234436
2   -0.268152
3    0.086513
dtype: float64

In [45]:
df8.apply(np.cumsum)

Unnamed: 0,0,1,2,3
0,-0.7738,1.961943,1.228608,-0.332881
1,-0.2319,1.363412,1.215526,-0.765839
2,-0.70123,2.471949,2.388055,-0.934417
3,-1.29044,0.528184,3.589415,-1.77609
4,-1.231166,-0.030371,4.189302,-3.239456
5,-0.585649,1.37459,4.858974,-1.433713


In [46]:
df9.apply(np.cumsum)

Unnamed: 0,0,1,2,3
0,-0.896471,-1.12733,-1.38572,1.182106
1,-0.860772,-1.212553,-1.138692,0.672266
2,-2.064171,-0.703307,-0.804456,0.25954


In [47]:
df8.max()

0    0.645517
1    1.961943
2    1.228608
3    1.805743
dtype: float64

In [48]:
df9.max()

0    0.035699
1    0.509247
2    0.334236
3    1.182106
dtype: float64

In [49]:
df8.min()

0   -0.773800
1   -1.943765
2   -0.013082
3   -1.463366
dtype: float64

In [50]:
df9.min()

0   -1.20340
1   -1.12733
2   -1.38572
3   -0.50984
dtype: float64

### Strings Operators 1

In [51]:
s7 = np.array(['animal', 'bird', ' Pandas Library '])
s7 = pd.Series(s7)
s7

0              animal
1                bird
2     Pandas Library 
dtype: object

In [52]:
s7.str.lower()

0              animal
1                bird
2     pandas library 
dtype: object

In [53]:
s8 = np.array(['cat', 'dog', ' Python Programming '])
s8 = pd.Series(s8)
s8

0                     cat
1                     dog
2     Python Programming 
dtype: object

In [54]:
s8.str.lower()

0                     cat
1                     dog
2     python programming 
dtype: object

In [55]:
s7.str.upper()

0              ANIMAL
1                BIRD
2     PANDAS LIBRARY 
dtype: object

In [56]:
s8.str.upper()

0                     CAT
1                     DOG
2     PYTHON PROGRAMMING 
dtype: object

In [57]:
s7.str.swapcase()

0              ANIMAL
1                BIRD
2     pANDAS lIBRARY 
dtype: object

In [58]:
s8.str.swapcase()

0                     CAT
1                     DOG
2     pYTHON pROGRAMMING 
dtype: object

In [60]:
s7.str.len()

0     6
1     4
2    16
dtype: int64

In [61]:
s8.str.len()

0     3
1     3
2    20
dtype: int64

In [62]:
s7.str.split()

0             [animal]
1               [bird]
2    [Pandas, Library]
dtype: object

In [63]:
s8.str.split()

0                    [cat]
1                    [dog]
2    [Python, Programming]
dtype: object

In [64]:
s7.unique()

array(['animal', 'bird', ' Pandas Library '], dtype=object)

In [65]:
s8.unique()

array(['cat', 'dog', ' Python Programming '], dtype=object)

In [66]:
repeat_list = [2, 3, 4]
s7.str.repeat(repeat_list)

0                                         animalanimal
1                                         birdbirdbird
2     Pandas Library  Pandas Library  Pandas Librar...
dtype: object

In [67]:
repeat_list = [1, 2, 3]
s8.str.repeat(repeat_list)

0                                                  cat
1                                               dogdog
2     Python Programming  Python Programming  Pytho...
dtype: object