# Dataframe Basics

DataFrame is used to represent tabular data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('weather_data.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


#### Representing dimension
6 rows, 4 columns

In [3]:
df.shape   

(6, 4)

#### Dataframe slicing and other basic operations

In [4]:
df[2:5]             # include 2 not 5

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain


In [5]:
df[:]          #for printing everything

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [7]:
df.columns         # print all the columns in our dataset

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [11]:
df.temperature    # df['temperature']  both are same

0    32
1    35
2    28
3    24
4    32
5    31
Name: temperature, dtype: int64

In [13]:
type(df['event'])  # col in df are of types 

pandas.core.series.Series

In [15]:
df[['event','day']]  # printing few columns of our like

Unnamed: 0,event,day
0,Rain,1/1/2017
1,Sunny,1/2/2017
2,Snow,1/3/2017
3,Snow,1/4/2017
4,Rain,1/5/2017
5,Sunny,1/6/2017


#### Applying functions on our df

In [16]:
df['temperature'].max()   #showing max of that table

35

In [17]:
df['temperature'].mean() #calculating mean of that table

30.333333333333332

In [18]:
df['temperature'].min()

24

In [19]:
df['temperature'].std()

3.8297084310253524

In [20]:
df.describe()   # showing statistics of our dataset

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


#### SQL operations with DF

In [21]:
df[df.temperature>=32]  #printing those rows having the temperature >=32

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
4,1/5/2017,32,4,Rain


In [22]:
df[df.temperature == df.temperature.max()] # printing the maximum temperature

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,35,7,Sunny


In [23]:
df['day'][df.temperature == df['temperature'].max()]  #print day only when the temperature was maximum

1    1/2/2017
Name: day, dtype: object

In [25]:
df[['day','temperature']][df.temperature == df['temperature'].max()] # print day and temperature when temperature on that day was max

Unnamed: 0,day,temperature
1,1/2/2017,35


Check on this for more operations like this with pandas
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

#### Working with Index

In [34]:
df.index

Index(['1/1/2017', '1/2/2017', '1/3/2017', '1/4/2017', '1/5/2017', '1/6/2017'], dtype='object', name='day')

In [35]:
df.set_index('day',inplace=True) # set index to day

KeyError: "None of ['day'] are in the columns"

In [36]:
df.loc['1/2/2017']

temperature       35
windspeed          7
event          Sunny
Name: 1/2/2017, dtype: object

In [37]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Rain
1/6/2017,31,2,Sunny


In [40]:
df.reset_index(inplace=True)   # reset index to it's original value

In [39]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny
