In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import libraries
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw weather observations from the Kaggle dataset.
data = pd.read_csv('/kaggle/input/weather-dataset/weatherHistory.csv')
# Display (rows, columns) of the raw frame.
data.shape

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Work on a copy so the raw `data` frame is left untouched.
df = data.copy()
# Inspect the column dtypes before any conversion.
df.dtypes

In [None]:
# Parse 'Formatted Date' into datetimes. utc=True yields a timezone-aware
# (UTC) column, so later cells can use it as a datetime index for resampling.
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], utc=True)

In [None]:
# Re-check the dtypes after converting 'Formatted Date'.
df.dtypes

**Before moving forward, let's check for missing values**

In [None]:
# Count missing values per column.
df.isnull().sum()

## EDA

**CATEGORICAL FEATURES**

In [None]:
# Gather the names of all object-dtype columns, in column order.
categorical_features = []
for column_name in df.columns:
    if df[column_name].dtype == 'O':
        categorical_features.append(column_name)
print(f"len of categorical features {len(categorical_features)}")

In [None]:
# Preview the object-dtype columns collected in `categorical_features`.
df[categorical_features].head()

In [None]:
# Drop the 'Daily Summary' column; none of the later cells use it.
# Rebinding `df` with errors='ignore' (rather than inplace=True) keeps this
# cell idempotent: re-running it does not raise KeyError once the column is
# already gone.
df = df.drop(columns='Daily Summary', errors='ignore')

In [None]:
# Print a small right-aligned table: distinct-value count and null count
# for each of the two remaining categorical columns.
def _row(first, second, third):
    # Right-justify without truncation, matching "%14s %15s %15s".
    return " ".join([str(first).rjust(14), str(second).rjust(15), str(third).rjust(15)])

print(_row("column", "Distinct", "Null"))
for col in ('Summary', 'Precip Type'):
    uniq = df[col].nunique()
    na = df[col].isnull().sum()
    print(_row(col, uniq, na))

In [None]:
# Mean temperature per precipitation type. Uses the working frame `df`, like
# every other analysis cell, rather than the raw `data` frame; both columns
# are unchanged in `df`, so the result is the same.
df.groupby('Precip Type')['Temperature (C)'].mean()

In [None]:
# Count of observations per 'Summary' value, split by precipitation type.
plt.figure(figsize=(15, 5))
sns.countplot(data=df, x='Summary', hue='Precip Type')
plt.xticks(rotation='vertical')  # category names are long, so turn them upright
plt.legend(loc='upper right')
plt.show()

**Numerical Features**

In [None]:
# Collect every non-object column. This also picks up 'Formatted Date', which
# is a datetime column after the pd.to_datetime conversion.
numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O']
# Report the count with the correct label (this list holds the numerical features).
print(f"len of numerical features {len(numerical_features)}")

In [None]:
# Preview the first three rows of the non-object columns.
df[numerical_features].head(3)

**Since we have been given hourly data, we need to resample it to monthly frequency. Resampling is a convenient method for frequency conversion; the object must have a datetime-like index.**

### After Resampling

In [None]:
# Monthly means ('MS' = month-start bins) of apparent temperature and humidity,
# indexed by the parsed timestamp.
monthly_columns = ['Apparent Temperature (C)', 'Humidity']
df1 = (
    df.set_index('Formatted Date')[monthly_columns]
    .resample('MS')
    .mean()
)
df1

**Here, `MS` means month start. We display the monthly average apparent temperature and humidity, computed with `mean()`.**

In [None]:
# Monthly apparent temperature and humidity drawn on one shared axis.
plt.figure(figsize=(11, 5))
temperature_series = df1['Apparent Temperature (C)']
humidity_series = df1['Humidity']
plt.plot(temperature_series, color="green", label="Temperature")
plt.plot(humidity_series, color="blue", linewidth=3, linestyle="dashed", label="Humidity")
plt.title("Variation Of Apparent Temperature Vs Humidity ")
plt.legend(loc="best")
plt.show()

**Observation**
- From the above graph, humidity appears almost constant over these years, and apparent temperature also repeats the same pattern, with its peaks lying at nearly the same level.

**The problem statement asks us to check the variation of humidity over all 10 years (2006-2016), so let's check the graph for some months separately.**

**Analysis of Variation of Temperature w.r.t Humidity from 2006 to 2016 in different months.**

In [None]:
# Jan 2006 to Jan 2016

In [None]:
# Keep only the January rows of the monthly-resampled frame.
is_january = df1.index.month == 1
jan = df1[is_january]
jan.head(2)

In [None]:
# January-only series, 2006 through 2016: apparent temperature and humidity.
jan_window = jan.loc['2006-01-01':'2016-01-01']

plt.figure(figsize=(10, 4))
for column, legend_label, line_color in (
    ('Apparent Temperature (C)', "Apparent Temperature", "green"),
    ('Humidity', "Humidity", "blue"),
):
    plt.plot(jan_window[column], marker='o', linestyle='-',
             label=legend_label, color=line_color)
plt.title("App Temp Vs Humidity variation in JAN 2006 TO JAN 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# February rows of the monthly frame, then the 2006-2016 window to plot.
feb = df1[df1.index.month == 2]
feb_window = feb.loc['2006-02-01':'2016-02-01']

plt.figure(figsize=(10, 4))
for column, legend_label, line_color in (
    ('Apparent Temperature (C)', "Apparent Temperature", "green"),
    ('Humidity', "Humidity", "blue"),
):
    plt.plot(feb_window[column], marker='o', linestyle='-',
             label=legend_label, color=line_color)
plt.title("App Temp Vs Humidity variation in FEB 2006 TO FEB 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# March rows of the monthly frame, then the 2006-2016 window to plot.
march = df1[df1.index.month == 3]
march_window = march.loc['2006-03-01':'2016-03-01']

plt.figure(figsize=(10, 4))
for column, legend_label, line_color in (
    ('Apparent Temperature (C)', "Apparent Temperature", "green"),
    ('Humidity', "Humidity", "blue"),
):
    plt.plot(march_window[column], marker='o', linestyle='-',
             label=legend_label, color=line_color)
plt.title("App Temp Vs Humidity variation in MAR 2006 TO MAR 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# April rows of the monthly frame, then the 2006-2016 window to plot.
april = df1[df1.index.month == 4]
april_window = april.loc['2006-04-01':'2016-04-01']

plt.figure(figsize=(10, 4))
for column, legend_label, line_color in (
    ('Apparent Temperature (C)', "Apparent Temperature", "green"),
    ('Humidity', "Humidity", "blue"),
):
    plt.plot(april_window[column], marker='o', linestyle='-',
             label=legend_label, color=line_color)
plt.title("App Temp Vs Humidity variation in APRIL 2006 TO APRIL 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# May rows of the monthly frame, then the 2006-2016 window to plot.
may = df1[df1.index.month == 5]
may_window = may.loc['2006-05-01':'2016-05-01']

plt.figure(figsize=(10, 4))
for column, legend_label, line_color in (
    ('Apparent Temperature (C)', "Apparent Temperature", "green"),
    ('Humidity', "Humidity", "blue"),
):
    plt.plot(may_window[column], marker='o', linestyle='-',
             label=legend_label, color=line_color)
plt.title("App Temp Vs Humidity variation in MAY 2006 TO MAY 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# June rows of the monthly-resampled frame.
june = df1[df1.index.month == 6]

plt.figure(figsize=(10,4))
# Plot from `june` itself. Slicing the `march` frame with June dates would
# draw March values under a June title.
plt.plot(june.loc['2006-06-01':'2016-06-01','Apparent Temperature (C)'],
        marker='o',linestyle='-',label="Apparent Temperature",color="green")
plt.plot(june.loc['2006-06-01':'2016-06-01','Humidity'],
        marker='o',linestyle='-',label="Humidity",color="blue")
plt.title("App Temp Vs Humidity variation in June 2006 TO June 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

In [None]:
# July rows of the monthly-resampled frame (month number 7).
july = df1[df1.index.month == 7]

plt.figure(figsize=(10,4))
# Plot from `july` itself. Slicing the `march` frame with July dates would
# draw March values under a July title.
plt.plot(july.loc['2006-07-01':'2016-07-01','Apparent Temperature (C)'],
        marker='o',linestyle='-',label="Apparent Temperature",color="green")
plt.plot(july.loc['2006-07-01':'2016-07-01','Humidity'],
        marker='o',linestyle='-',label="Humidity",color="blue")
plt.title("App Temp Vs Humidity variation in July 2006 TO July 2016")
plt.legend(loc="best")
plt.xticks(rotation=45)
plt.show()

**OBSERVATIONS**
- Relative to humidity, apparent temperature varies a lot from year to year and from month to month: sometimes it peaks and sometimes it falls.
- Humidity stays nearly constant, while apparent temperature varies by month each year.

**Let's check Wind speed w.r.t Humidity**

In [None]:
# Monthly means ('MS' = month-start bins) of temperature, wind speed and
# humidity, indexed by the parsed timestamp.
wind_columns = ['Temperature (C)', 'Wind Speed (km/h)', 'Humidity']
df2 = (
    df.set_index('Formatted Date')[wind_columns]
    .resample('MS')
    .mean()
)
df2

In [None]:
# January rows of the monthly wind/humidity frame.
janw = df2[df2.index.month == 1]

plt.figure(figsize=(10,4))
plt.plot(janw.loc['2006-01-01':'2016-01-01','Wind Speed (km/h)'],
        marker='o',linestyle='-',color="orange", label="Wind Speed (km/h)")
# The purple line is humidity, so label it "Humidity"; reusing the wind-speed
# label would make the two legend entries indistinguishable.
plt.plot(janw.loc['2006-01-01':'2016-01-01','Humidity'],
        marker='o',linestyle='-',color="purple", label="Humidity")
plt.title("Variation of wind speed with humidity in JAN 2006 TO JAN 2016")
plt.legend(loc="best")
plt.show()

## THANK YOU!.