# Pandas 2

In [54]:
import pandas as pd
import numpy as np

In [2]:
# Load the weather workbook into a DataFrame.
df = pd.read_excel(io='weather_data.xlsx')

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,35,7,Sunny
2,2017-01-03,28,2,Snow
3,2017-01-04,24,7,Snow
4,2017-01-05,32,4,Rain


In [4]:
# Conditional selection: keep only the rows where windspeed is greater than 2.

df.loc[df['windspeed'] > 2]

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,35,7,Sunny
3,2017-01-04,24,7,Snow
4,2017-01-05,32,4,Rain


In [5]:
# Row(s) recorded at the highest windspeed; every tied row is returned.

df.loc[df['windspeed'].eq(df['windspeed'].max())]

Unnamed: 0,day,temperature,windspeed,event
1,2017-01-02,35,7,Sunny
3,2017-01-04,24,7,Snow


In [6]:
# When no index column is chosen, pandas assigns a default RangeIndex (0, 1, 2, ...).
df.index

RangeIndex(start=0, stop=6, step=1)

In [7]:
# Use the 'day' column as the row index; inplace=True modifies df itself
# instead of returning a new frame.
df.set_index(keys='day', inplace=True)

In [8]:
# Because set_index was called with inplace=True, df itself now has 'day' as its index.
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,35,7,Sunny
2017-01-03,28,2,Snow
2017-01-04,24,7,Snow
2017-01-05,32,4,Rain
2017-01-06,31,2,Sunny


In [9]:
# Position-based slicing: first three rows, all columns.
df.iloc[:3, :]

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,35,7,Sunny
2017-01-03,28,2,Snow


In [10]:
# With 'day' as the index, a row can be retrieved by its date label.
df.loc['2017-01-04']

temperature       24
windspeed          7
event           Snow
Name: 2017-01-04 00:00:00, dtype: object

In [15]:
# skiprows=1 discards the first row of the sheet, so the row after it is
# read as the column header.
df0 = pd.read_excel(io='stock_prices.xlsx', skiprows=1)
df0.head(n=5)

Unnamed: 0,GOOGL,27.82,87,845,larry page
0,WMT,4.61,484,65,n.a.
1,MSFT,-1,85,64,bill gates
2,RIL,not available,50,1023,mukesh ambani
3,TATA,5.6,-1,n.a.,ratan tata


In [18]:
# header=1 takes the row at position 1 (the second row) as the column header;
# anything above it is ignored.
df0 = pd.read_excel(io='stock_prices.xlsx', header=1)
df0.head(n=5)

Unnamed: 0,GOOGL,27.82,87,845,larry page
0,WMT,4.61,484,65,n.a.
1,MSFT,-1,85,64,bill gates
2,RIL,not available,50,1023,mukesh ambani
3,TATA,5.6,-1,n.a.,ratan tata


In [59]:
# Treat the placeholder strings 'not available' and 'n.a.' as missing data,
# so they are loaded as NaN rather than as text.
df0 = pd.read_excel(
    io='stock_prices.xlsx',
    na_values=['not available', 'n.a.'],
)
df0.head(n=5)

Unnamed: 0,Tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845.0,larry page
1,WMT,4.61,484,65.0,
2,MSFT,-1.0,85,64.0,bill gates
3,RIL,,50,1023.0,mukesh ambani
4,TATA,5.6,-1,,ratan tata


In [61]:
# Write several DataFrames into a single workbook, one sheet per frame.

with pd.ExcelWriter(path='stocks_weather.xlsx') as workbook:
    df0.to_excel(excel_writer=workbook, sheet_name="stocks")
    df.to_excel(excel_writer=workbook, sheet_name="weather")

In [67]:
# handling missing data

# Read the 'weather' sheet back and index it by the 'day' column in one chain.
df = pd.read_excel('stocks_weather.xlsx', sheet_name='weather').set_index('day')
df


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-02,,9.0,Sunny
2017-01-03,28.0,,Snow
2017-01-04,,7.0,
2017-01-05,32.0,,Rain
2017-01-06,,,Sunny
2017-01-07,,,
2017-01-08,,,
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


In [68]:
# fillna method

# Replace every NaN, in every column, with 0 (df itself is left unchanged).
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-02,0.0,9.0,Sunny
2017-01-03,28.0,0.0,Snow
2017-01-04,0.0,7.0,0
2017-01-05,32.0,0.0,Rain
2017-01-06,0.0,0.0,Sunny
2017-01-07,0.0,0.0,0
2017-01-08,0.0,0.0,0
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy


In [72]:
# A dict passed to fillna gives each column its own replacement value.
# NOTE(review): the 'temperature ' key ends with a space; it presumably mirrors
# the column header as stored in the sheet - confirm before "fixing" it.

new_df = df.fillna(
    value={
        'event': 'no event',
        'windspeed': 0,
        'temperature ': 0,
    }
)

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-02,0.0,9.0,Sunny
2017-01-03,28.0,0.0,Snow
2017-01-04,0.0,7.0,no event
2017-01-05,32.0,0.0,Rain
2017-01-06,0.0,0.0,Sunny
2017-01-07,0.0,0.0,no event
2017-01-08,0.0,0.0,no event
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy


In [73]:
# Forward-fill: each NaN takes the most recent non-missing value above it in
# the same column (bfill() does the opposite and pulls from the next valid value).
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases.
new_df = df.ffill()
new_df


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-02,32.0,9.0,Sunny
2017-01-03,28.0,9.0,Snow
2017-01-04,28.0,7.0,Snow
2017-01-05,32.0,7.0,Rain
2017-01-06,32.0,7.0,Sunny
2017-01-07,32.0,7.0,Sunny
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy


In [79]:
# Fill each numeric gap by linear interpolation between the nearest known
# values above and below it; the text column 'event' keeps its NaN entries.
new_df = df.interpolate(method='linear')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-02,30.0,9.0,Sunny
2017-01-03,28.0,8.0,Snow
2017-01-04,30.0,7.0,
2017-01-05,32.0,7.166667,Rain
2017-01-06,32.4,7.333333,Sunny
2017-01-07,32.8,7.5,
2017-01-08,33.2,7.666667,
2017-01-09,33.6,7.833333,
2017-01-10,34.0,8.0,Cloudy


In [82]:
# dropna

# Drop every row that contains at least one NaN; columns are kept as they are.
new_df = df.dropna(axis=0, how='any')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [85]:
# Group By

# Read the 'weather' sheet and index it by the 'day' column in one chain.
df = pd.read_excel('stocks_weather.xlsx', sheet_name='weather').set_index('day')
df

Unnamed: 0_level_0,temperature,windspeed,event,City
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,32,6,Rain,Mumbai
2017-01-02,30,9,Sunny,Guwahati
2017-01-03,28,6,Snow,Kullu
2017-01-04,25,7,Rain,Mumbai
2017-01-05,32,5,Rain,Kullu
2017-01-06,25,2,Sunny,Guwahati
2017-01-07,26,3,Snow,Kullu
2017-01-08,22,6,Sunny,Guwahati
2017-01-09,26,2,Cloudy,Mumbai
2017-01-10,34,8,Cloudy,Mumbai


In [89]:
# Split the rows into one group per distinct value of the 'City' column.
g = df.groupby(by='City')

In [92]:
# Rows belonging to the 'Mumbai' group; the city name acts as the group key
# (comparable to looking at one group of a SQL GROUP BY).
g.get_group('Mumbai')

Unnamed: 0_level_0,temperature,windspeed,event,City
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,32,6,Rain,Mumbai
2017-01-04,25,7,Rain,Mumbai
2017-01-09,26,2,Cloudy,Mumbai
2017-01-10,34,8,Cloudy,Mumbai


In [93]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for city, city_df in g:
    print(city, city_df, sep='\n')

Guwahati
            temperature   windspeed  event      City
day                                                 
2017-01-02            30          9  Sunny  Guwahati
2017-01-06            25          2  Sunny  Guwahati
2017-01-08            22          6  Sunny  Guwahati
2017-01-11            40         12  Sunny  Guwahati
Kullu
            temperature   windspeed event   City
day                                             
2017-01-03            28          6  Snow  Kullu
2017-01-05            32          5  Rain  Kullu
2017-01-07            26          3  Snow  Kullu
Mumbai
            temperature   windspeed   event    City
day                                                
2017-01-01            32          6    Rain  Mumbai
2017-01-04            25          7    Rain  Mumbai
2017-01-09            26          2  Cloudy  Mumbai
2017-01-10            34          8  Cloudy  Mumbai


In [94]:
# Per-city maximum of every column (temperature, windspeed and event),
# not only temperature.

g.max()

Unnamed: 0_level_0,temperature,windspeed,event
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Guwahati,40,12,Sunny
Kullu,32,6,Snow
Mumbai,34,8,Rain


In [95]:
# Per-city average of the numeric columns.
# numeric_only=True keeps the text column 'event' out of the aggregation;
# pandas >= 2.0 no longer drops such columns silently and raises a TypeError.

g.mean(numeric_only=True)

Unnamed: 0_level_0,temperature,windspeed
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Guwahati,29.25,7.25
Kullu,28.666667,4.666667
Mumbai,29.25,5.75


In [96]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column, computed per city.

g.describe()

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,temperature,temperature,temperature,temperature,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Guwahati,4.0,29.25,7.889867,22.0,24.25,27.5,32.5,40.0,4.0,7.25,4.272002,2.0,5.0,7.5,9.75,12.0
Kullu,3.0,28.666667,3.05505,26.0,27.0,28.0,30.0,32.0,3.0,4.666667,1.527525,3.0,4.0,5.0,5.5,6.0
Mumbai,4.0,29.25,4.425306,25.0,25.75,29.0,32.5,34.0,4.0,5.75,2.629956,2.0,5.0,6.5,7.25,8.0


In [98]:
# Merging two DataFrames

# Left-hand frame: one temperature reading per city.
df1 = pd.DataFrame(
    dict(
        city=['newyork', 'chicago', 'orlando'],
        temperature=[21, 14, 35],
    )
)

df1



Unnamed: 0,city,temperature
0,newyork,21
1,chicago,14
2,orlando,35


In [99]:
# Right-hand frame: one humidity reading per city.
df2 = pd.DataFrame(
    dict(
        city=['newyork', 'chicago', 'orlando'],
        humidity=[68, 65, 75],
    )
)

df2

Unnamed: 0,city,humidity
0,newyork,68
1,chicago,65
2,orlando,75


In [102]:
# Join the two frames on their shared 'city' column.
df3 = df1.merge(df2, on="city")

In [103]:
# Merged result: temperature and humidity side by side for each city.
df3

Unnamed: 0,city,temperature,humidity
0,newyork,21,68
1,chicago,14,65
2,orlando,35,75


In [104]:
# Redefine both frames so that each contains one city the other lacks.
df1 = pd.DataFrame(
    dict(
        city=['newyork', 'chicago', 'orlando', 'san francisco'],
        temperature=[21, 14, 35, 32],
    )
)

df2 = pd.DataFrame(
    dict(
        city=['newyork', 'chicago', 'orlando', 'baltomore'],
        humidity=[68, 65, 75, 62],
    )
)


In [106]:
# Inner join (the default): only cities present in both frames are kept.
df3 = df1.merge(df2, how="inner", on="city")
df3

Unnamed: 0,city,temperature,humidity
0,newyork,21,68
1,chicago,14,65
2,orlando,35,75


In [110]:
# Outer join: keeps every city from either frame. indicator=True adds a
# '_merge' column telling whether a row came from the left, right or both.
df3 = df1.merge(df2, how="outer", on="city", indicator=True)
df3

Unnamed: 0,city,temperature,humidity,_merge
0,newyork,21.0,68.0,both
1,chicago,14.0,65.0,both
2,orlando,35.0,75.0,both
3,san francisco,32.0,,left_only
4,baltomore,,62.0,right_only


In [108]:
# Left join: every row of df1 is kept; humidity is NaN where df2 has no match.
df3 = df1.merge(df2, how="left", on="city")
df3

Unnamed: 0,city,temperature,humidity
0,newyork,21,68.0
1,chicago,14,65.0
2,orlando,35,75.0
3,san francisco,32,


In [109]:
# Right join: every row of df2 is kept; temperature is NaN where df1 has no match.
df3 = df1.merge(df2, how="right", on="city")
df3

Unnamed: 0,city,temperature,humidity
0,newyork,21.0,68
1,chicago,14.0,65
2,orlando,35.0,75
3,baltomore,,62
