In [1]:
# Import libraries

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

# Colab-specific: mount Google Drive at /content/drive.
# NOTE(review): the relative './drive/My Drive/...' paths used for the data
# files presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

# Display settings: no limit on the number of columns shown or on the width
# of a column's text. The row limit is intentionally left at its default here.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Import datasets
# All four CSV files sit in the same folder and each has a 'Date' column that
# is parsed to datetime on load, so the folder is named once and the files are
# read in a single pass (same order as before: train, test, spray, weather).
DATA_DIR = './drive/My Drive/dsi14_p4/input'

train, test, spray, weather = (
    pd.read_csv(f'{DATA_DIR}/{dataset_name}.csv', parse_dates=['Date'])
    for dataset_name in ('train', 'test', 'spray', 'weather')
)

In [0]:
# Define a function to understand the data better

def eda(df, df_name):
    """Print a short structural summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    df_name : str
        Label for the frame; printed capitalised as the summary heading.

    Returns
    -------
    None
        Everything is written to stdout: the shape, the number of rows that
        contain at least one missing value, the number of rows that have an
        exact duplicate elsewhere in the frame (every member of a duplicate
        group is counted), and the column dtypes.
    """
    n_rows, n_cols = df.shape

    # Count rows, not cells: summing isnull() over the whole frame would count
    # a row with two missing fields twice, which contradicts the printed label.
    rows_with_missing = int(df.isnull().any(axis=1).sum())

    # keep=False flags every row of a duplicate group, not just the repeats.
    duplicated_rows = int(df.duplicated(keep=False).sum())

    print(df_name.capitalize())
    print()
    print(f"Rows: {n_rows} \t Columns: {n_cols}")
    print()

    print(f"Number of Missing rows: {rows_with_missing}")
    print()

    print(f"Number of Duplicate rows: {duplicated_rows}")
    print()

    print(df.dtypes)
    print("_________________________________________\n")

In [0]:
# Pair every loaded frame with the label used as its summary heading.
data = list(zip(
    (train, spray, weather, test),
    ('train', 'spray', 'weather', 'test'),
))

In [5]:
# Summary
# A plain loop rather than a list comprehension: eda() only prints and returns
# None, so a comprehension would build a throwaway list and make the cell echo
# [None, None, None, None] after the summaries.
for frame, frame_name in data:
    eda(frame, frame_name)

Train

Rows: 10506 	 Columns: 12

Number of Missing rows: 0

Number of Duplicate rows: 1062

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
dtype: object
_________________________________________

Spray

Rows: 14835 	 Columns: 4

Number of Missing rows: 584

Number of Duplicate rows: 543

Date         datetime64[ns]
Time                 object
Latitude            float64
Longitude           float64
dtype: object
_________________________________________

Weather

Rows: 2944 	 Columns: 22

Number of Missing rows: 0

Number of Duplicate rows: 0

Stati

[None, None, None, None]

In [0]:
# Remove pandas' display limit on both columns and rows for the cells below.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [7]:
# Weather Information
# Latest date, earliest date, a blank line, the frame shape, another blank
# line, then the per-column non-null counts and dtypes.
weather_dates = weather['Date']
print(weather_dates.max(), weather_dates.min(), '', weather.shape, '', sep='\n')
weather.info()

2014-10-31 00:00:00
2007-05-01 00:00:00

(2944, 22)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Station      2944 non-null   int64         
 1   Date         2944 non-null   datetime64[ns]
 2   Tmax         2944 non-null   int64         
 3   Tmin         2944 non-null   int64         
 4   Tavg         2944 non-null   object        
 5   Depart       2944 non-null   object        
 6   DewPoint     2944 non-null   int64         
 7   WetBulb      2944 non-null   object        
 8   Heat         2944 non-null   object        
 9   Cool         2944 non-null   object        
 10  Sunrise      2944 non-null   object        
 11  Sunset       2944 non-null   object        
 12  CodeSum      2944 non-null   object        
 13  Depth        2944 non-null   object        
 14  Water1       2944 non-null   object        
 15  Sn

In [0]:
# Drop Columns
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError for
# columns that are already gone.
weather = weather.drop(columns=['Water1', 'Depth', 'SnowFall'], errors='ignore')

In [0]:
# Replace 'T's and 'M's with 0s, then change the columns from object to float.
#
# Each column is assigned back through weather[column] rather than mutated via
# weather.Column.replace(..., inplace=True): the latter operates on a Series
# obtained by attribute access, which is a chained in-place update that pandas
# warns about and that leaves the frame unchanged under Copy-on-Write.
#
# NOTE(review): zero-filling a missing StnPressure / SeaLevel / WetBulb reading
# pulls that day's cross-station average towards 0 once the stations are
# averaged; confirm that 0 (rather than NaN) is really the intended fill value.
ZERO_FILL_MARKERS = {
    'PrecipTotal': ['  T', 'M'],
    'WetBulb': ['M'],
    'Heat': ['M'],
    'Cool': ['M'],
    'StnPressure': ['M'],
    'SeaLevel': ['M'],
    'AvgSpeed': ['M'],
}

for column, markers in ZERO_FILL_MARKERS.items():
    weather[column] = weather[column].replace(markers, 0).astype(float)

In [0]:
# Tavg contains 'M' placeholders, so rebuild it for every row as the midpoint
# of the daily maximum and minimum: (Tmax + Tmin) / 2.
weather['Tavg'] = weather['Tmax'].add(weather['Tmin']).div(2)

In [11]:
# Preview the first rows of the per-station weather frame.
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,66.5,14,51,56.0,0.0,2.0,0448,1849,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68.0,M,51,57.0,0.0,3.0,-,-,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,50.5,-3,42,47.0,14.0,0.0,0447,1850,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,51.5,M,42,47.0,13.0,0.0,-,-,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56.0,2,40,48.0,9.0,0.0,0446,1851,,0.0,29.39,30.12,11.7,7,11.9


In [0]:
# Average the readings of the weather stations for each Date (one row per day).
# numeric_only=True restricts the mean to numeric columns: the frame still has
# object columns (Depart, Sunrise, Sunset, CodeSum), which recent pandas
# versions refuse to average with a TypeError and which older versions dropped
# silently. Station is only an identifier, so its average is discarded.
weather_stations_combined = (
    weather.groupby('Date')
    .mean(numeric_only=True)
    .drop(columns=['Station'])
)

In [0]:
# Defining a function to convert 24h time to float number (for example, 0445 will be converted to 4.75).

def sun_time_converter(sun_time):
    """Turn an HHMM clock reading into fractional hours.

    The last two digits of ``sun_time`` are read as minutes and everything
    above them as hours, so 445 becomes 4.75.
    """
    # Split into the hour part (quotient) and the minute part (remainder).
    whole_hours, leftover_minutes = divmod(sun_time, 100)

    # Minutes become the fractional part of the hour.
    return whole_hours + leftover_minutes / 60

In [0]:
# Sunrise and sunset are taken from station 1 only (the station 2 rows shown
# in the preview hold '-' in these columns) and converted to dtype float.
# The series are indexed by Date so the assignment below is aligned by date.
station_one = weather.loc[weather['Station'] == 1].set_index('Date')
sunrise_times = station_one['Sunrise'].astype(float)
sunset_times = station_one['Sunset'].astype(float)

# Convert the 24h HHMM values to fractional hours and attach them to the
# per-date frame. Assigning the Series directly matches rows on the Date index
# instead of relying on both frames having the same length and row order, as a
# positional list(...) assignment would.
# NOTE: this expects weather_stations_combined to still be indexed by Date.
weather_stations_combined['Sunrise'] = sunrise_times.map(sun_time_converter)
weather_stations_combined['Sunset'] = sunset_times.map(sun_time_converter)

In [15]:
# Preview the first rows of the per-date combined weather frame.
weather_stations_combined.head()

Unnamed: 0_level_0,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Sunrise,Sunset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007-05-01,83.5,51.0,67.25,51.0,56.5,0.0,2.5,0.0,29.14,29.82,2.2,26.0,9.4,4.8,18.816667
2007-05-02,59.5,42.5,51.0,42.0,47.0,13.5,0.0,0.0,29.41,30.085,13.15,3.0,13.4,4.783333,18.833333
2007-05-03,66.5,47.0,56.75,40.0,49.0,8.0,0.0,0.0,29.425,30.12,12.3,6.5,12.55,4.766667,18.85
2007-05-04,72.0,50.0,61.0,41.5,50.0,3.5,0.0,0.0,29.335,30.045,10.25,7.5,10.6,4.733333,18.866667
2007-05-05,66.0,53.5,59.75,38.5,49.5,5.0,0.0,0.0,29.43,30.095,11.45,7.0,11.75,4.716667,18.883333


In [0]:
# weather = pd.read_csv('../datasets/weather.csv', parse_dates =['Date'])
# weather = pd.read_csv('./drive/My Drive/dsi14_p4/input/weather.csv', parse_dates=['Date'])

In [17]:
# Re-inspect the first rows of the per-station weather frame.
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,66.5,14,51,56.0,0.0,2.0,0448,1849,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68.0,M,51,57.0,0.0,3.0,-,-,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,50.5,-3,42,47.0,14.0,0.0,0447,1850,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,51.5,M,42,47.0,13.0,0.0,-,-,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56.0,2,40,48.0,9.0,0.0,0446,1851,,0.0,29.39,30.12,11.7,7,11.9


In [0]:
# Some CodeSum entries run two weather codes together as one token; put a
# space between them so each code becomes its own token. The replacements are
# applied one after another in the order listed.
fused_codes = {
    "BCFG": "BC FG",
    "MIFG": "MI FG",
    "TSRA": "TS RA",
    "VCFG": "VC FG",
    "VCTS": "VC TS",
}
for fused, spaced in fused_codes.items():
    weather['CodeSum'] = weather['CodeSum'].str.replace(fused, spaced)

In [0]:
def clean_codesum(string):
    """Merge several code strings into one string of distinct codes.

    Parameters
    ----------
    string : iterable of str
        Strings of whitespace-separated codes (for example the CodeSum values
        of every row that shares a date).

    Returns
    -------
    str
        Every distinct code found in the inputs, joined by single spaces in
        sorted order; an empty string when the inputs contain no codes.
    """
    tokens = " ".join(string).split()
    # Sort the distinct codes: iterating a set of strings directly yields an
    # order that can change from one interpreter run to the next, which would
    # make the returned string non-reproducible.
    return " ".join(sorted(set(tokens)))

In [0]:
# Collapse the CodeSum strings of all rows sharing a Date into one string per
# date, then keep the strings as a plain list (one entry per date).
codesum_by_date = weather.groupby('Date')['CodeSum']
code_sum_combined = codesum_by_date.agg(clean_codesum)
code_sum_list = list(code_sum_combined)

In [21]:
# Turn the per-date code strings into one count column per code. The token
# pattern accepts '+' inside a token so that a code such as 'FG+' gets its own
# column instead of being merged with 'FG'.
cvec = CountVectorizer(analyzer = 'word',token_pattern=r'[\w\+]+')
code_sum_cvec = cvec.fit_transform(code_sum_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(cvec, 'get_feature_names_out'):
    code_sum_columns = cvec.get_feature_names_out()
else:
    code_sum_columns = cvec.get_feature_names()

code_sum_df = pd.DataFrame(code_sum_cvec.toarray(),
                           columns=code_sum_columns)
code_sum_df.head()

Unnamed: 0,bc,br,dz,fg,fg+,fu,gr,hz,mi,ra,sn,sq,ts,vc
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# Shape check for the weather-code count frame.
code_sum_df.shape

(1472, 14)

In [23]:
# Move Date out of the index into a regular column. The guard makes the cell
# safe to re-run: calling reset_index() a second time would otherwise add a
# stray 'index' column to the frame.
if weather_stations_combined.index.name == 'Date':
    weather_stations_combined = weather_stations_combined.reset_index()
weather_stations_combined.shape

(1472, 16)

In [24]:
# Place the per-date weather measurements and the weather-code counts side by
# side; the code frame's index is renumbered from 0 before the two are joined.
weather_parts = [
    weather_stations_combined,
    code_sum_df.reset_index(drop=True),
]
cleaned_weather = pd.concat(weather_parts, axis='columns')
cleaned_weather.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Sunrise,Sunset,bc,br,dz,fg,fg+,fu,gr,hz,mi,ra,sn,sq,ts,vc
0,2007-05-01,83.5,51.0,67.25,51.0,56.5,0.0,2.5,0.0,29.14,29.82,2.2,26.0,9.4,4.8,18.816667,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2007-05-02,59.5,42.5,51.0,42.0,47.0,13.5,0.0,0.0,29.41,30.085,13.15,3.0,13.4,4.783333,18.833333,0,1,0,0,0,0,0,1,0,0,0,0,0,0
2,2007-05-03,66.5,47.0,56.75,40.0,49.0,8.0,0.0,0.0,29.425,30.12,12.3,6.5,12.55,4.766667,18.85,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,2007-05-04,72.0,50.0,61.0,41.5,50.0,3.5,0.0,0.0,29.335,30.045,10.25,7.5,10.6,4.733333,18.866667,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,2007-05-05,66.0,53.5,59.75,38.5,49.5,5.0,0.0,0.0,29.43,30.095,11.45,7.0,11.75,4.716667,18.883333,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Shape check for the final combined weather frame.
cleaned_weather.shape

(1472, 30)

In [0]:
# Persist the cleaned weather frame next to the input files, without writing
# the row index as a column.
cleaned_weather_path = './drive/My Drive/dsi14_p4/input/cleaned_weather.csv'
cleaned_weather.to_csv(cleaned_weather_path, index=False)