In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {'name':["Asad","Saad","Fahad", 'Ali'], 
        'age':[23,34,23,21], 
        'AiforEveryOne':[89,78,90,98],
        'python':[78,89,87,89],
        'git': [90,98,87,86],
        'numpy':[98,87,98,99]       }
        
data = pd.DataFrame(data)
data

Unnamed: 0,name,age,AiforEveryOne,python,git,numpy
0,Asad,23,89,78,90,98
1,Saad,34,78,89,98,87
2,Fahad,23,90,87,87,98
3,Ali,21,98,89,86,99


In [3]:
data['total'] = data['AiforEveryOne'] + data['python'] + data['git'] + data['numpy']

In [4]:
data['percent'] = data['total'] / 400 * 100

In [5]:
data

Unnamed: 0,name,age,AiforEveryOne,python,git,numpy,total,percent
0,Asad,23,89,78,90,98,355,88.75
1,Saad,34,78,89,98,87,352,88.0
2,Fahad,23,90,87,87,98,362,90.5
3,Ali,21,98,89,86,99,372,93.0


### Index Objects

pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index:

In [6]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [7]:
index = obj.index

In [8]:
index[1:]

Index(['b', 'c'], dtype='object')

In [9]:
# index[0] = 'g' # indices immutable

In [10]:
# creating an ndarray that is immutable
# coz created via Index function and index are immutable
labels = pd.Index(["a","b","c","d","e","f"])

In [11]:
labels

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [12]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year':  [2000, 2001, 2002, 2001, 2002, 2003],
        'pop':   [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [13]:
frame.index = labels

In [14]:
frame

Unnamed: 0,state,year,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


In [15]:
frame.index = [0, 1, 2, 3, 4, 5]

In [16]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [17]:
pd.DataFrame(frame, columns=['year', 'state', 'pop']) # inplace nahi hoga 

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [18]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [19]:
reindex_frame = frame.reindex(columns=['pop','year','imports', 'debt', 'state',"exports" ])

In [20]:
reindex_frame

Unnamed: 0,pop,year,imports,debt,state,exports
0,1.5,2000,,,Ohio,
1,1.7,2001,,,Ohio,
2,3.6,2002,,,Ohio,
3,2.4,2001,,,Nevada,
4,2.9,2002,,,Nevada,
5,3.2,2003,,,Nevada,


In [21]:
reindex_frame_drop = reindex_frame.drop([0, 3]) # default: axis=0
reindex_frame_drop

Unnamed: 0,pop,year,imports,debt,state,exports
1,1.7,2001,,,Ohio,
2,3.6,2002,,,Ohio,
4,2.9,2002,,,Nevada,
5,3.2,2003,,,Nevada,


In [22]:
reindex_frame

Unnamed: 0,pop,year,imports,debt,state,exports
0,1.5,2000,,,Ohio,
1,1.7,2001,,,Ohio,
2,3.6,2002,,,Ohio,
3,2.4,2001,,,Nevada,
4,2.9,2002,,,Nevada,
5,3.2,2003,,,Nevada,


In [23]:
reindex_frame_drop2 = reindex_frame.drop(['exports', 'debt'], axis=1)
reindex_frame_drop2

Unnamed: 0,pop,year,imports,state
0,1.5,2000,,Ohio
1,1.7,2001,,Ohio
2,3.6,2002,,Ohio
3,2.4,2001,,Nevada
4,2.9,2002,,Nevada
5,3.2,2003,,Nevada


In [24]:
exp = reindex_frame['exports']
exp.size - exp.isnull().sum() # all null

0

### Another Example

In [25]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame({
                      'http_status': [200,200,404,404,301],
                      'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
                                                                      index=index)
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [26]:
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
df.reindex(new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


In [27]:
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
df.reindex(new_index, fill_value=0)

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,0,0.0
Comodo Dragon,0,0.0
IE10,404,0.08
Chrome,200,0.02


In [28]:
# We can also reindex the columns.

df.reindex(columns=['http_status', 'user_agent'])

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konqueror,301,


In [29]:
# We can also reindex the columns.

df.reindex(['http_status', 'user_agent'], axis='columns')

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konqueror,301,


In [30]:
date_index = pd.date_range('1/1/2010', periods=6, freq='D')
date_index

DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
               '2010-01-05', '2010-01-06'],
              dtype='datetime64[ns]', freq='D')

In [31]:
df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},index=date_index)
df2

Unnamed: 0,prices
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0


In [32]:
date_index = pd.date_range('12/29/2009', periods=10, freq='D')
df2.reindex(date_index)

Unnamed: 0,prices
2009-12-29,
2009-12-30,
2009-12-31,
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


In [33]:
df2.reindex(date_index, method='bfill')

Unnamed: 0,prices
2009-12-29,100.0
2009-12-30,100.0
2009-12-31,100.0
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


### Indexing, Selection, and Filtering

In [34]:
data = pd.DataFrame(np.arange(40).reshape((10, 4)),
    index=['Ohio', 'Colorado', 'Washington','Nebraska','Utah', 'New York','California', 'Texas', 'Georgia', 'Alaska'],
    columns=['Jan', 'Feb', 'Mar', 'Apr'])
data

Unnamed: 0,Jan,Feb,Mar,Apr
Ohio,0,1,2,3
Colorado,4,5,6,7
Washington,8,9,10,11
Nebraska,12,13,14,15
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


In [35]:
data['Utah':'Texas']

Unnamed: 0,Jan,Feb,Mar,Apr
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31


In [36]:
# Slicing Subsets of Rows and Columns either by label index 
# or by integer indexing is not possible, we have some other sol

# data[2:6,0:2]   

We can select specific ranges of our data in both the row and column directions using either label or integer-based indexing.

<b>loc</b> is primarily label based indexing. Integers may be used but they are interpreted as a label.

<b>iloc</b> is primarily integer based indexing
To select a subset of rows and columns from our DataFrame, we can use the iloc method.

In [37]:
# use if loc (label based)

data.loc["Utah":"Texas", "Jan":'Mar']

Unnamed: 0,Jan,Feb,Mar
Utah,16,17,18
New York,20,21,22
California,24,25,26
Texas,28,29,30


In [38]:
#use if iloc (integer based)

data.iloc[2:6,0:2] 

Unnamed: 0,Jan,Feb
Washington,8,9
Nebraska,12,13
Utah,16,17
New York,20,21


In [39]:
a = pd.DataFrame({"p":[2,4,6]})
a

Unnamed: 0,p
0,2
1,4
2,6


In [40]:
a.rdiv(2)   # 2/6

Unnamed: 0,p
0,1.0
1,0.5
2,0.333333


In [41]:
data[data['Mar'] > 20]

Unnamed: 0,Jan,Feb,Mar,Apr
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


In [42]:
data[data < 5] = 0
data

Unnamed: 0,Jan,Feb,Mar,Apr
Ohio,0,0,0,0
Colorado,0,5,6,7
Washington,8,9,10,11
Nebraska,12,13,14,15
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


### Function Application and Mapping

In [43]:
frame = np.abs(
               pd.DataFrame(
                      np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon']))
frame

Unnamed: 0,b,d,e
Utah,1.801143,0.070101,1.015789
Ohio,0.323624,0.17335,1.369512
Texas,1.04372,1.362954,1.880287
Oregon,0.033027,0.552158,0.936923


In [44]:
# subtract the min value of each col from max of each col
f = lambda x: x.max() - x.min()

In [45]:
frame.apply(f,axis='rows')  # row or 0 for each row wise 

b    1.768116
d    1.292853
e    0.943364
dtype: float64

In [46]:
frame.apply(f,axis='columns')  # row or 0 for each row wise 

Utah      1.731042
Ohio      1.196162
Texas     0.836567
Oregon    0.903896
dtype: float64

### Application of Lambda Functions with Different Functions

Created a random dataset that contains information about a family of 5 people with their id, names, ages, and income per month. I will be using this dataframe to show you how to apply lambda functions using different functions on a dataframe in Python.

In [47]:
df=pd.DataFrame({
                'id':[1,2,3,4,5],
                'name':['Asad','Saad','Numi','Roman','Maria'],
                'age':[20,25,15,10,30],
                'income':[4000,7000,200,0,10000]})
df

Unnamed: 0,id,name,age,income
0,1,Asad,20,4000
1,2,Saad,25,7000
2,3,Numi,15,200
3,4,Roman,10,0
4,5,Maria,30,10000


### Application of Lambda with Apply

Let’s say we have got an error in the age variable. We recorded ages with a difference of 3 years. So, to remove this error from the Pandas dataframe, we have to add three years to every person’s age. We can do this with the <b>apply() function</b> in Pandas.

<b>apply() function</b>  in Pandas calls the lambda function and applies it to every row or column of the dataframe and returns a modified copy of the dataframe:

In [48]:
df['age'] = df.apply(lambda x: x['age']+3, axis='columns')  # on frame

In [49]:
df

Unnamed: 0,id,name,age,income
0,1,Asad,23,4000
1,2,Saad,28,7000
2,3,Numi,18,200
3,4,Roman,13,0
4,5,Maria,33,10000


In [50]:
df['age']=df['age'].apply(lambda x: x+3) #on particular series

In [51]:
df

Unnamed: 0,id,name,age,income
0,1,Asad,26,4000
1,2,Saad,31,7000
2,3,Numi,21,200
3,4,Roman,16,0
4,5,Maria,36,10000


# Application of Lambda with Filter

Now, let’s see how many of these people are above the age of 18.

We can do this using the <b>filter() function</b>. 

The <b>filter() function</b> takes a lambda function and a Pandas series and applies the lambda function on the series and filters the data.

In [52]:
a = filter(lambda x: x>8, df['age']) # returns generator

In [53]:
a

<filter at 0x7f90f8c8e760>

In [54]:
next(a)

26

In [55]:
list(filter(lambda x: x>8, df['age'])) # returns generator

[26, 31, 21, 16, 36]

### Application of Lambda with Map

You’ll be able to relate to the next statement. 🙂 It’s performance appraisal time and the income of all the employees gets increased by 20%. This means we have to increase the salary of each person by 20% in our Pandas dataframe.

We can do this using the map() function. This map() function maps the series according to input correspondence. It is very helpful when we have to substitute a series with other values.

In [56]:
df['income'] = list(map(lambda x: int(x+x*0.2), df['income']))

In [57]:
df

Unnamed: 0,id,name,age,income
0,1,Asad,26,4800
1,2,Saad,31,8400
2,3,Numi,21,240
3,4,Roman,16,0
4,5,Maria,36,12000


In [58]:
df['income2'] = df['income'].apply(lambda x: x+x*.2)

In [59]:
df

Unnamed: 0,id,name,age,income,income2
0,1,Asad,26,4800,5760.0
1,2,Saad,31,8400,10080.0
2,3,Numi,21,240,288.0
3,4,Roman,16,0,0.0
4,5,Maria,36,12000,14400.0


### Conditional Statements using Lambda Functions

Lambda functions also support conditional statements, such as if..else. This makes lambda functions very powerful.

Let’s say in the family dataframe we have to categorize people into ‘Adult’ or ‘Child’. For this, we can simply apply the lambda function to our dataframe:

In [60]:
df['category']=df['age'].apply(lambda x: 'Adult' if x>=18 else 'Child')
df

Unnamed: 0,id,name,age,income,income2,category
0,1,Asad,26,4800,5760.0,Adult
1,2,Saad,31,8400,10080.0,Adult
2,3,Numi,21,240,288.0,Adult
3,4,Roman,16,0,0.0,Child
4,5,Maria,36,12000,14400.0,Adult


### Lambda with Reduce

Now, let’s see the total income of the family. To calculate this, we can use the reduce() function in Python. It is used to apply a particular function to the list of elements in the sequence. The reduce() function is defined in the ‘functools’ module.

For using the reduce() function, we have to import the functools module first:

In [61]:
import functools

functools.reduce(lambda a,b: a+b, df['income'])

25440

### Correlation and Covariance

study link:https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/

In [62]:
# !pip install pandas_datareader

In [63]:
import pandas_datareader.data as web

In [64]:
# dictionary comprehension

all_data = {
    ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
}

In [65]:
all_data['AAPL']

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-03-09,25.395000,25.067499,25.327499,25.280001,108806800.0,23.513155
2016-03-10,25.559999,25.037500,25.352501,25.292500,134054400.0,23.524775
2016-03-11,25.570000,25.375000,25.559999,25.565001,109632800.0,23.778234
2016-03-14,25.727501,25.445000,25.477501,25.629999,100304400.0,23.838688
2016-03-15,26.295000,25.962500,25.990000,26.145000,160270800.0,24.317701
...,...,...,...,...,...,...
2021-03-03,125.709999,121.839996,124.809998,122.059998,112430400.0,122.059998
2021-03-04,123.599998,118.620003,121.750000,120.129997,177275300.0,120.129997
2021-03-05,121.940002,117.570000,120.980003,121.419998,153590400.0,121.419998
2021-03-08,121.000000,116.209999,120.930000,116.360001,153918600.0,116.360001
