In [1]:
import pandas as pd
from sqlalchemy import create_engine
import pymysql
# Register PyMySQL under the module name `MySQLdb` (see the PyMySQL docs), presumably so that
# SQLAlchemy's MySQL engine can use it as its driver.
# NOTE(review): `create_engine` is not used in the cells visible here — confirm it is needed.
pymysql.install_as_MySQLdb()

In [2]:
# Build the Austin daily-weather table for a fixed list of dates.
# Pipeline: load the station CSV -> keep daily-summary rows -> tag the city ->
# select and rename the columns of interest -> trim timestamps to 'YYYY-MM-DD' -> filter by date.
# The whole cell is idempotent: nothing is mutated in place, so re-running it gives the same frame.

# Raw column name -> short name used in this notebook.
# Renaming by name (rather than assigning a positional list to `.columns`) means a change in
# the selection order can never silently attach the wrong label to a column.
austin_columns = {
    'STATION': 'station',
    'DATE': 'date',
    'city': 'city',
    'DailyAverageDryBulbTemperature': 'avg_temp',
    'DailyAverageRelativeHumidity': 'avg_rel_humidity',
    'DailyAverageStationPressure': 'avg_pressure',
    'DailyAverageWindSpeed': 'avg_wind',
    'DailyMaximumDryBulbTemperature': 'max_temp',
    'DailyMinimumDryBulbTemperature': 'min_temp',
    'DailyPeakWindDirection': 'peak_wind_dir',
    'DailyPrecipitation': 'precip',
}

# Dates ('YYYY-MM-DD') whose daily summary should be kept
austin_dates = ['2012-11-18', '2013-11-17', '2014-11-02', '2015-10-25', '2016-10-23', '2017-10-22', '2018-10-21']

# Create dataframe from csv
# NOTE(review): the recorded output of this cell ends with what looks like the tail of a pandas
# warning (possibly DtypeWarning for mixed-type columns). If so, consider passing `dtype=` or
# `low_memory=False` to read_csv — confirm the effect on column dtypes before changing it.
austin_raw = pd.read_csv('weather_csvs/austin_weather.csv')

austin_weather = (
    austin_raw
    # Keep only the rows with code type 'SOD' (indicating daily weather data) in 'REPORT_TYPE'
    .loc[austin_raw['REPORT_TYPE'].str.strip() == 'SOD']
    # Drop all columns that have all NaN values in the remaining rows
    .dropna(axis=1, how='all')
    # Add column that clarifies the city name (returns a new frame instead of writing into a slice)
    .assign(city='Austin, TX')
    # Keep the specified columns, in the order listed in austin_columns, then rename them
    .loc[:, list(austin_columns)]
    .rename(columns=austin_columns)
    # Keep only the date part ('YYYY-MM-DD') of the timestamp; vectorised, no per-row loop
    .assign(date=lambda frame: frame['date'].str[:10])
    # Keep only rows whose date is in the list austin_dates
    .loc[lambda frame: frame['date'].isin(austin_dates)]
    .reset_index(drop=True)
)

# Show dataframe
austin_weather

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,station,date,city,avg_temp,avg_rel_humidity,avg_pressure,avg_wind,max_temp,min_temp,peak_wind_dir,precip
0,72254013904,2012-11-18,"Austin, TX",55.0,65.0,29.7,4.6,74.0,36.0,160.0,0.0
1,72254013904,2013-11-17,"Austin, TX",75.0,78.0,29.33,5.0,89.0,61.0,200.0,0.0
2,72254013904,2014-11-02,"Austin, TX",57.0,64.0,29.68,8.6,76.0,38.0,150.0,0.0
3,72254013904,2015-10-25,"Austin, TX",62.0,88.0,29.47,16.7,65.0,58.0,350.0,0.57
4,72254013904,2016-10-23,"Austin, TX",66.0,68.0,29.62,5.6,84.0,47.0,180.0,0.01
5,72254013904,2017-10-22,"Austin, TX",64.0,81.0,29.57,8.2,78.0,50.0,330.0,0.7
6,72254013904,2018-10-21,"Austin, TX",65.0,61.0,29.7,9.6,74.0,55.0,20.0,0.0


In [3]:
# Create dataframe from csv
indy_weather = pd.read_csv('weather_csvs/indy_weather.csv')

# Keep only the rows with code type 'SOD' (indicating daily weather data) in the 'REPORT_TYPE' column
indy_weather = indy_weather.loc[indy_weather['REPORT_TYPE'].str.strip() == 'SOD']

# Drop all columns that have all NaN values
indy_weather.dropna(axis=1, how='all', inplace=True)

# Add column that clarifies the city name
indy_weather['city'] = 'Indianapolis, IN'

# From the remaining columns, keep the ones specified
indy_weather = indy_weather[['STATION', 'DATE', 'city', 'DailyAverageDryBulbTemperature', 'DailyAverageRelativeHumidity', 
                          'DailyAverageStationPressure', 'DailyAverageWindSpeed', 'DailyMaximumDryBulbTemperature', 
                          'DailyMinimumDryBulbTemperature', 'DailyPeakWindDirection', 'DailyPrecipitation']]

# Rename columns, reset index
indy_weather.columns = ['station', 'date', 'city', 'avg_temp', 'avg_rel_humidity', 'avg_pressure', 'avg_wind', 'max_temp',
                         'min_temp', 'peak_wind_dir', 'precip']

indy_weather.reset_index(drop=True, inplace=True)

# Clean the 'date' column by keeping only the date ('YYYY-MM-DD') and dropping extra info (time entered)
indy_weather['date'] = [indy_weather['date'][x][:10] for x in range(len(indy_weather['date']))]

# Instantiate list of dates
indy_dates = ['2000-09-24', '2001-09-30', '2002-09-29', '2003-09-28', '2004-06-20', '2005-06-19', 
              '2006-07-02', '2007-06-17']

# Keep only rows whose date is in the list austin_dates and reset the index again
indy_weather = indy_weather.loc[indy_weather['date'].isin(indy_dates)]

indy_weather.reset_index(drop=True, inplace=True)

# Show dataframe
indy_weather

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,station,date,city,avg_temp,avg_rel_humidity,avg_pressure,avg_wind,max_temp,min_temp,peak_wind_dir,precip
0,72438093819,2000-09-24,"Indianapolis, IN",59,,29.08,10.9,63,55,330,0.23
1,72438093819,2001-09-30,"Indianapolis, IN",59,,29.34,6.8,72,46,360,0.00s
2,72438093819,2002-09-29,"Indianapolis, IN",69,,29.24,6.8,80,57,190,0.00s
3,72438093819,2003-09-28,"Indianapolis, IN",50,,29.05,8.9,56,44,320,0.07
4,72438093819,2004-06-20,"Indianapolis, IN",63,,29.21,6.0,75,51,250,0.00s
5,72438093819,2005-06-19,"Indianapolis, IN",67,,29.26,8.1,77,57,40,0.00
6,72438093819,2006-07-02,"Indianapolis, IN",80,63.0,29.21,10.0,90,70,270,0.00
7,72438093819,2007-06-17,"Indianapolis, IN",82,49.0,29.15,7.8,93,71,260,T


In [8]:
# Save dataframe to a csv file
# indy_weather.to_csv('weather_csvs/weather_data.csv', index=False)

In [9]:
# Append the other dataframe to the weather_data csv
# with open('weather_csvs/weather_data.csv', 'a') as file:
#     austin_weather.to_csv(file, header=False, index=False)

In [10]:
# Load the combined (Indianapolis + Austin) weather table from the weather_data csv.
# NOTE(review): the cells that would write and append to this file are commented out in this
# notebook, so the file must already exist on disk for this cell to run — confirm.
weather_data_csv = 'weather_csvs/weather_data.csv'
combined_weather = pd.read_csv(weather_data_csv)

# Show dataframe
combined_weather

Unnamed: 0,station,date,city,avg_temp,avg_rel_humidity,avg_pressure,avg_wind,max_temp,min_temp,peak_wind_dir,precip
0,72438093819,9/24/2000,"Indianapolis, IN",59,,29.08,10.9,63,55,330,0.23
1,72438093819,9/30/2001,"Indianapolis, IN",59,,29.34,6.8,72,46,360,0.00s
2,72438093819,9/29/2002,"Indianapolis, IN",69,,29.24,6.8,80,57,190,0.00s
3,72438093819,9/28/2003,"Indianapolis, IN",50,,29.05,8.9,56,44,320,0.07
4,72438093819,6/20/2004,"Indianapolis, IN",63,,29.21,6.0,75,51,250,0.00s
5,72438093819,6/19/2005,"Indianapolis, IN",67,,29.26,8.1,77,57,40,0
6,72438093819,7/2/2006,"Indianapolis, IN",80,63.0,29.21,10.0,90,70,270,0
7,72438093819,6/17/2007,"Indianapolis, IN",82,49.0,29.15,7.8,93,71,260,T
8,72254013904,11/18/2012,"Austin, TX",55,65.0,29.7,4.6,74,36,160,0
9,72254013904,11/17/2013,"Austin, TX",75,78.0,29.33,5.0,89,61,200,0
