In [None]:
'''
What is Pandas?
Pandas is a Python library used for working with data sets.

It has functions for analyzing, cleaning, exploring, and manipulating data.

Why Use Pandas?
Pandas allows us to analyze big data and make conclusions based on statistical theories.

Pandas can clean messy data sets, and make them readable and relevant.

Relevant data is very important in data science.
'''

In [1]:
#!pip install pandas
import pandas as pd
import numpy as np
'''
What is a Series?
A Pandas Series is like a column in a table.

It is a one-dimensional array holding data of any type.
'''
s1 = pd.Series([23,24,25,12,32])
s1

0    23
1    24
2    25
3    12
4    32
dtype: int64

In [2]:
s2 = pd.Series([23,45,67,12,34],index = ['a','b','c','d','e'])
s2

a    23
b    45
c    67
d    12
e    34
dtype: int64

In [3]:
s3 = pd.Series([23,45,67,12,34],index = ['a','b','c','d','e'],dtype = 'float')
s3

a    23.0
b    45.0
c    67.0
d    12.0
e    34.0
dtype: float64

In [4]:
'''
Labels
If nothing else is specified, the values are labeled with their index number. First value has index 0, second value has index 1 etc.

This label can be used to access a specified value.'''
s3['a']

23.0

### creation of Series using dictionary

In [5]:
s4 = pd.Series({'a':65,'b':43})
s4

a    65
b    43
dtype: int64

### DataFrame
What is a DataFrame?
A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [6]:
d1 = pd.DataFrame([43,54,65,76])
d1

Unnamed: 0,0
0,43
1,54
2,65
3,76


In [7]:
d2 = pd.DataFrame([[2,3,4],[4,5,6],[1,2,3]])
d2

Unnamed: 0,0,1,2
0,2,3,4
1,4,5,6
2,1,2,3


In [8]:
d2 = pd.DataFrame(s2)  #converting series into dataframe
d2

Unnamed: 0,0
a,23
b,45
c,67
d,12
e,34


In [9]:
d3  = pd.DataFrame([[2,3,4],[4,5,6],[1,2,3]],columns =['a','b','c'])
d3

Unnamed: 0,a,b,c
0,2,3,4
1,4,5,6
2,1,2,3


In [10]:
d3  = pd.DataFrame([[2,3,4],[4,5,6],[1,2,3]],columns =['a','b','c'],index = ['x','y','z'])
d3

Unnamed: 0,a,b,c
x,2,3,4
y,4,5,6
z,1,2,3


### creating DataFrame from list of dictionaries

In [11]:
dic = [{'alex':1,'joe':2},{'ema':5,'dora':10,'alice':20}]    
pd.DataFrame(dic,index=['a','b'])

Unnamed: 0,alex,joe,ema,dora,alice
a,1.0,2.0,,,
b,,,5.0,10.0,20.0


### DataFrame operations

In [12]:
d3

Unnamed: 0,a,b,c
x,2,3,4
y,4,5,6
z,1,2,3


In [13]:
d3['a']

x    2
y    4
z    1
Name: a, dtype: int64

In [14]:
d3['d']=d3['a']*d3['b']
d3

Unnamed: 0,a,b,c,d
x,2,3,4,6
y,4,5,6,20
z,1,2,3,2


In [15]:
pop = d3.pop('c') 
d3

Unnamed: 0,a,b,d
x,2,3,6
y,4,5,20
z,1,2,2


In [16]:
d3

Unnamed: 0,a,b,d
x,2,3,6
y,4,5,20
z,1,2,2


In [17]:
del d3['d']   

In [18]:
d3

Unnamed: 0,a,b
x,2,3
y,4,5
z,1,2


In [19]:
d3.insert(1,'name',d3['a'])
d3

Unnamed: 0,a,name,b
x,2,2,3
y,4,4,5
z,1,1,2


In [23]:
import numpy as np
d4 = pd.DataFrame({'abc':np.random.randint(2,6,size = (10)),'bcd':np.random.randint(4,10,size = (10)),'cde':np.random.randint(3,10,size = (10))})
d4

Unnamed: 0,abc,bcd,cde
0,2,4,8
1,2,8,8
2,5,4,7
3,3,4,3
4,4,7,7
5,3,7,7
6,2,4,6
7,4,9,7
8,3,6,5
9,3,7,6


In [24]:
d4.head()

Unnamed: 0,abc,bcd,cde
0,2,4,8
1,2,8,8
2,5,4,7
3,3,4,3
4,4,7,7


In [25]:
d4.tail()

Unnamed: 0,abc,bcd,cde
5,3,7,7
6,2,4,6
7,4,9,7
8,3,6,5
9,3,7,6


In [26]:
'''
Info About the Data
The DataFrames object has a method called info(), that gives you more information about the data set.'''
d4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   abc     10 non-null     int32
 1   bcd     10 non-null     int32
 2   cde     10 non-null     int32
dtypes: int32(3)
memory usage: 248.0 bytes


In [27]:
d4

Unnamed: 0,abc,bcd,cde
0,2,4,8
1,2,8,8
2,5,4,7
3,3,4,3
4,4,7,7
5,3,7,7
6,2,4,6
7,4,9,7
8,3,6,5
9,3,7,6


In [28]:
d4.loc[9,'cde']

6

In [29]:
d4.loc[4:9,['abc','cde']]

Unnamed: 0,abc,cde
4,4,7
5,3,7
6,2,6
7,4,7
8,3,5
9,3,6


In [30]:
d4.loc[[3,4,7],['abc','cde']]


Unnamed: 0,abc,cde
3,3,3
4,4,7
7,4,7


In [32]:
d4

Unnamed: 0,abc,bcd,cde
0,2,4,8
1,2,8,8
2,5,4,7
3,3,4,3
4,4,7,7
5,3,7,7
6,2,4,6
7,4,9,7
8,3,6,5
9,3,7,6


In [31]:
d4.iloc[9,2]  #[row,column]

6

In [33]:
d4.iloc[2:7,[0,2]]

Unnamed: 0,abc,cde
2,5,7
3,3,3
4,4,7
5,3,7
6,2,6


In [34]:
d4.abc

0    2
1    2
2    5
3    3
4    4
5    3
6    2
7    4
8    3
9    3
Name: abc, dtype: int32

In [35]:
d4.abc.values

array([2, 2, 5, 3, 4, 3, 2, 4, 3, 3])

In [36]:
d4['sum'] = d4.abc.values + d4.cde.values + d4.bcd.values
d4

Unnamed: 0,abc,bcd,cde,sum
0,2,4,8,14
1,2,8,8,18
2,5,4,7,16
3,3,4,3,10
4,4,7,7,18
5,3,7,7,17
6,2,4,6,12
7,4,9,7,20
8,3,6,5,14
9,3,7,6,16


In [37]:
a = [['rk',102,15000],['rama',103,20000],['krishna',104,25000]]
dfl = pd.DataFrame(a,columns = ['name','id','salary'])
dfl

Unnamed: 0,name,id,salary
0,rk,102,15000
1,rama,103,20000
2,krishna,104,25000


In [39]:
y=dfl[dfl.salary>=20000]
print(y)
y[['id','salary']]

      name   id  salary
1     rama  103   20000
2  krishna  104   25000


Unnamed: 0,id,salary
1,103,20000
2,104,25000


In [44]:
dfl.append({'name' : 'sajan','id' : 105,'salary' : 30000},ignore_index=True)

Unnamed: 0,name,id,salary
0,rk,102,15000
1,rama,103,20000
2,krishna,104,25000
3,sajan,105,30000


In [45]:
dfl = dfl.append({'name' : np.nan,'id' : 105,'salary' : 30000},ignore_index = True)
dfl

Unnamed: 0,name,id,salary
0,rk,102.0,15000.0
1,rama,103.0,20000.0
2,krishna,104.0,25000.0
3,,105.0,30000.0


In [46]:
dfl.isnull()

Unnamed: 0,name,id,salary
0,False,False,False
1,False,False,False
2,False,False,False
3,True,False,False


In [47]:
dfl.isnull().sum()

name      1
id        0
salary    0
dtype: int64

In [51]:
df1=dfl.dropna()

In [49]:
dfl

Unnamed: 0,name,id,salary
0,rk,102.0,15000.0
1,rama,103.0,20000.0
2,krishna,104.0,25000.0
3,,105.0,30000.0


In [52]:
dfl.fillna(value = 'abc')

Unnamed: 0,name,id,salary
0,rk,102.0,15000.0
1,rama,103.0,20000.0
2,krishna,104.0,25000.0
3,abc,105.0,30000.0


### groupby

In [53]:
df = pd.DataFrame({'Animal' : ['Falcon', 'Falcon','Parrot', 'Parrot'],'Max Speed' : [380., 370., 24., 26.]})
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [54]:
df.groupby(['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [55]:
df_csv = pd.read_csv('data.csv')
df_csv

Unnamed: 0.1,Unnamed: 0,COUNTRY,POP,AREA,GDP,CONT,IND_DAY
0,CHN,China,1398.72,9596.96,12234.78,Asia,
1,IND,India,1351.16,3287.26,2575.67,Asia,1947-08-15
2,USA,US,329.74,9833.52,19485.39,N.America,1776-07-04
3,IDN,Indonesia,268.07,1910.93,1015.54,Asia,1945-08-17
4,BRA,Brazil,210.32,8515.77,2055.51,S.America,1822-09-07
5,PAK,Pakistan,205.71,881.91,302.14,Asia,1947-08-14
6,NGA,Nigeria,200.96,923.77,375.77,Africa,1960-10-01
7,BGD,Bangladesh,167.09,147.57,245.63,Asia,1971-03-26
8,RUS,Russia,146.79,17098.25,1530.75,,1992-06-12
9,MEX,Mexico,126.58,1964.38,1158.23,N.America,1810-09-16


In [56]:
df_csv.columns


Index(['Unnamed: 0', 'COUNTRY', 'POP', 'AREA', 'GDP', 'CONT', 'IND_DAY'], dtype='object')

In [57]:
df_csv['COUNTRY'].values

array(['China', 'India', 'US', 'Indonesia', 'Brazil', 'Pakistan',
       'Nigeria', 'Bangladesh', 'Russia', 'Mexico', 'Japan', 'Germany',
       'France', 'UK', 'Italy', 'Argentina', 'Algeria', 'Canada',
       'Australia', 'Kazakhstan'], dtype=object)

In [58]:
df_csv['COUNTRY'].isnull().sum()

0

In [59]:
import pandas as pd
import numpy as np
 
df = pd.DataFrame(np.random.randn(4,3),columns=['col1','col2','col3'])
print(df)
print('\n')
for key,value in df.iteritems():
   print(key,value)
   print('\n')

       col1      col2      col3
0 -0.720782  0.003498  2.037310
1  1.351015  2.726483  0.479649
2  0.519086 -0.565398 -1.412205
3  0.918665  0.091409  1.391342


col1 0   -0.720782
1    1.351015
2    0.519086
3    0.918665
Name: col1, dtype: float64


col2 0    0.003498
1    2.726483
2   -0.565398
3    0.091409
Name: col2, dtype: float64


col3 0    2.037310
1    0.479649
2   -1.412205
3    1.391342
Name: col3, dtype: float64




In [60]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(3,3),columns = ['col1','col2','col3'])
print(df)
print('\n')
for row in df.iterrows():
   print(row)
   print('\n')

       col1      col2      col3
0  0.642378  1.529315 -1.317822
1 -0.716971 -0.504900  0.193148
2  0.510717  2.763265  0.358640


(0, col1    0.642378
col2    1.529315
col3   -1.317822
Name: 0, dtype: float64)


(1, col1   -0.716971
col2   -0.504900
col3    0.193148
Name: 1, dtype: float64)


(2, col1    0.510717
col2    2.763265
col3    0.358640
Name: 2, dtype: float64)




In [None]:

'''
Finding Relationships
A great aspect of the Pandas module is the corr() method.

The corr() method calculates the relationship between each column in your data set.
'''

In [61]:
df.corr()

Unnamed: 0,col1,col2,col3
col1,1.0,0.89108,-0.49847
col2,0.89108,1.0,-0.050734
col3,-0.49847,-0.050734,1.0
