## Loading a table

In [None]:
import pandas as pd
# Load the weather table from the CSV file into a DataFrame named w.
w = pd.read_csv('weather.csv')
# Show the first rows of the table (head() returns 5 rows by default).
w.head()

In [None]:
# Show the last rows of the table (tail() returns 5 rows by default).
w.tail()

## Data types

In [None]:
# Type of the whole table returned by read_csv (a pandas DataFrame).
type(w)

In [None]:
# Type of a single column selected by name (a pandas Series).
type(w['date'])

In [None]:
# Type of a single value in the 'date' column (the entry with index label 0).
type(w['date'][0])

## Hands-On Step 1
1. Read `data.csv` into a data frame named `data`. 
2. Print its first and last rows. 
3. Explore the data types of `Serial No.` and `GRE Score` columns.
4. Use `data.describe()` to get further information about the data.

## Running Aggregate Queries

In [None]:
# Aggregate a whole column into one value: the largest 'temp' in the table.
w['temp'].max()

In [None]:
# Print summary statistics over the whole period covered by the table.
# Temperatures are shown with two decimals, the precipitation total with none.
# NOTE: the column key 'percipitation' is spelled this way throughout the
# notebook (presumably matching the CSV header), so only the printed message
# is corrected, not the key.
print('During this period:')
print('Maximum temperature was %.2f degrees.' % (w['temp'].max()))
print('Minimum temperature was %.2f degrees.' % (w['temp'].min()))
print('Average temperature was %.2f degrees.' % (w['temp'].mean()))
print('Total precipitation was %.0f mm.'      % (w['percipitation'].sum()))

## Hands-On Step 2
Find the number of applicants whose `Research` field is `1`.

## Handling Missing Data

In [None]:
# Boolean table with the same shape as w: True wherever a value is missing.
w.isna()

In [None]:
# Number of missing values in each column.
w.isna().sum()

In [None]:
# One boolean per row: True if the row has at least one missing value.
# (isnull() is an alias of isna().)
w.isnull().any(axis=1)

In [None]:
# Use the per-row mask to show only the rows that contain a missing value.
w[w.isna().any(axis=1)]

In [None]:
# Summary statistics of the table before the missing values are filled.
w.describe()

In [None]:
# Fill the gaps column by column: missing temperatures get the column mean,
# missing visibility readings get the fixed value 6.0.
w = w.fillna(value={'temp': w['temp'].mean(), 'visibility': 6.0})

In [None]:
# Count missing values again; 'temp' and 'visibility' were filled above.
w.isna().sum()

In [None]:
# Summary statistics again, to compare with the ones taken before filling.
w.describe()

## Hands-On Step 3
1. On the `data` data frame, use `isna()` and other methods to obtain information about missing values.
2. Replace all missing values with the mean value of the corresponding column.
3. Make sure no NA value is left in the data.

## Adding a Column

In [None]:
# A combined date+time value makes a better index than either column alone.
# Build a new text column 'datetime' from 'date' and 'time'; ':00:00' is
# appended to the 'time' value to supply the minutes and seconds.
w['datetime'] = (
    w['date']
    + ' '
    + w['time'].map(str)
    + ':00:00'
)

# The new column still holds plain strings; it must be converted to
# timestamps before it is useful as a time index.
print(type(w['datetime'].iloc[0]))

w.head()

Using `pd.to_datetime`, change the type of the `datetime` column to Timestamp.

In [None]:
# Parse the text column into timestamps. The layout is given explicitly
# (day-month-year, then hours:minutes:seconds).
w['datetime'] = pd.to_datetime(w['datetime'], format='%d-%m-%Y %H:%M:%S')

# The elements of the column are now timestamps instead of strings.
print(type(w['datetime'].iloc[0]))

w.head()

## Hands-On Step 4
Create a new column named `Subj`, whose values are `SOP*0.3 + LOR*0.7`.
**You will face an error while doing it!** 

Try to fix it :-)

## Changing the Index
Set the newly added column as the index of `w`.

In [None]:
# Make 'datetime' the index of w.
# set_index moves the column out of w.columns, so running this cell a second
# time would fail to find it; the membership check makes the cell safe to
# re-run.
if 'datetime' in w.columns:
    w = w.set_index(keys='datetime')
w.head(10)

In [None]:
# Try to order the rows chronologically using the raw columns:
# 'date' is the primary sort key and 'time' orders the rows within a date.
w = w.sort_values(by=['date', 'time'])
w.head(10)
# The rows (and with them the index labels) are reordered; the values
# themselves are not changed.

# Caveat: 'date' is a string, and lexicographic order of the string
# representation is not necessarily the same as calendar order.
# Therefore this ordering is not correct.

In [None]:
# The index holds the combined date and time, so sorting by the index
# gives the correct chronological order.
w = w.sort_index(ascending=True)
w.head(10)
# This ordering is correct. Correct ordering sometimes matters,
# especially when drawing curves.

## Hands-On Step 5
1. Set `Serial No.` as the index of `data`. Make sure running the cell twice does not cause error.
2. Sort the data in descending order of `University Rating`, and then of `CGPA`.

## Drawing Figures

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Simplest possible line plot of the temperature column.
plt.plot(w['temp'])
# Note that the x-axis is automatically taken from the index of the
# plotted column.

In [None]:
# Make the plot larger: figsize is (width, height) in inches.
plt.figure(figsize=(10,6))
plt.plot(w['temp'])
# Show grid lines so values are easier to read off the axes.
plt.grid()


In [None]:
# Line plot of 'az', the azimuth angle of the wind direction.
plt.figure(figsize=(10,6))
plt.plot(w['az'])
# This is not a good figure: a line plot is a poor fit for this column.

In [None]:
# Line plot of the 'visibility' column against the index.
plt.figure(figsize=(10,6))
plt.plot(w['visibility'])

In [None]:
# Line plot of the 'percipitation' column against the index.
plt.figure(figsize=(10,6))
plt.plot(w['percipitation'])
# Note that all of the figures drawn so far have a lot of problems.

## Hands-On Step 6
Using `plt.scatter`, draw a scatter plot of GRE Score against CGPA

## Groupby 

In [None]:
# To draw daily figures, the rows must first be grouped by day.
# Grouping on the 'date' column puts all rows of the same date in one group.

wg=w.groupby('date')
# The result is a GroupBy object rather than a DataFrame; aggregates are
# computed from it in the next step.
type(wg)

In [None]:
# Build one row per date: temperature maximum, minimum and mean, plus the
# total of the 'percipitation' column. Named aggregation creates all four
# columns in one call, in the order listed.
daily = wg.agg(
    max_temp=('temp', 'max'),
    min_temp=('temp', 'min'),
    avg_temp=('temp', 'mean'),
    percipitation=('percipitation', 'sum'),
)
# The group keys are date strings (day-month-year); parse them so that
# sorting the index is chronological rather than lexicographic.
daily.index = pd.to_datetime(daily.index, format='%d-%m-%Y')
daily = daily.sort_index()
daily

## Hands-On Step 7
Create a summary data frame in which you store average CGPA, average GRE Score, and the number of Research applicants, against University Rating.

## More visualizations

In [None]:
# Compare the daily extremes on one chart: maximum temperature in red,
# minimum temperature in blue.
plt.plot(daily['max_temp'], color='r')
plt.plot(daily['min_temp'], color='b')

In [None]:
# The same comparison with formatting applied: a larger canvas, grid lines,
# a title and axis labels.
plt.figure(figsize=(10, 6))
plt.plot(daily['max_temp'], color='r')  # daily maximum in red
plt.plot(daily['min_temp'], color='b')  # daily minimum in blue
plt.grid()
plt.title('Min and Max daily temperature in Tehran')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')

In [None]:
# Further plots can be added to the same chart: here the daily precipitation
# totals are drawn as bars on top of the two temperature curves.
# Every series is given a label and a legend is shown; without it the reader
# cannot tell what the bars represent, since the y-axis label only mentions
# temperature.
plt.figure(figsize=(10,6))
plt.plot(daily['max_temp'],'r',label='Max temperature (°C)')
plt.plot(daily['min_temp'],'b',label='Min temperature (°C)')
plt.grid()
plt.title('Min and Max daily temperature in Tehran')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.bar(daily.index,daily['percipitation'],label='Precipitation (mm)')
plt.legend()

## Hands-On Step 8
1. Draw a plot of Average GRE Score against University Rating.
2. Add a plot of Research applicants per University Rating to the plot.
3. Make the plots more comprehensible.