In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# NOTE(review): with no category given this silences every warning for the rest
# of the session, including any deprecation warnings later cells may raise.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the weather history CSV from the Kaggle input directory and preview the first rows.
filepath = '../input/weather-dataset/weatherHistory.csv'
data = pd.read_csv(filepath)
data.head()

In [None]:
# (rows, columns) of the frame as loaded
data.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the summary.
data.info()

In [None]:
# Distinct values of the 'Loud Cover' column.
# NOTE(review): 'Loud Cover' looks like a misspelling of 'Cloud Cover'; the key
# has to match the CSV header exactly, so confirm against the file before renaming.
data['Loud Cover'].unique()

In [None]:
# Remove the 'Loud Cover' column before the analysis.
# Rebinding `data` (instead of `inplace=True`) together with errors='ignore'
# keeps the cell idempotent: re-running it once the column is already gone is
# a no-op rather than a KeyError.
data = data.drop(columns=['Loud Cover'], errors='ignore')
data.head()

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Share of missing cells in the whole frame, as a percentage.
# `DataFrame.size` (rows * columns) replaces `np.product(data.shape)`:
# `np.product` is a deprecated alias that no longer exists in NumPy 2.0.
miss_percent = (data.isna().sum().sum() / data.size) * 100
miss_percent

In [None]:
# Distinct values of 'Precip Type' (NaN shows up here if the column has gaps)
data['Precip Type'].unique()

In [None]:
# Fill missing 'Precip Type' values with the next valid observation (backward fill).
# The result is assigned back to the column instead of calling
# `fillna(method='bfill', inplace=True)` on `data['Precip Type']`: the `method`
# argument is deprecated in pandas 2.x, and an in-place call on a column
# selection is not guaranteed to write through to `data`.
# A backward fill leaves trailing NaNs (no later valid value) untouched.
data['Precip Type'] = data['Precip Type'].bfill()
data.head()

In [None]:
# Re-check the per-column missing counts after the fill
data.isna().sum()

In [None]:
# Inspect the 'Formatted Date' column before it is converted
data['Formatted Date'].head()

In [None]:
# Parse 'Formatted Date' into timezone-aware datetimes normalised to UTC
data['Formatted Date'] = pd.to_datetime(data['Formatted Date'], utc= True)
data['Formatted Date'].head()

In [None]:
# Use the parsed timestamps as the index; later cells resample on it.
# Rebinding `data` behind a guard (instead of `inplace=True`) keeps the cell
# idempotent: once 'Formatted Date' has moved into the index, re-running is a
# no-op rather than a KeyError.
if 'Formatted Date' in data.columns:
    data = data.set_index('Formatted Date')
data.head()

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Use seaborn's white-grid theme for every figure that follows
sns.set_style('whitegrid')

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame
data.describe()

In [None]:
# Plot the columns at positions 2-8 of `data`, one figure per column,
# each drawn in the colour assigned to its position.
color = {
    2: 'blue',
    3: 'green',
    4: 'black',
    5: 'red',
    6: 'violet',
    7: 'brown',
    8: 'orange',
}
for index, line_color in color.items():
    column_values = data.iloc[:, index]
    plt.figure(figsize=(12, 5))
    plt.xlabel('Year', fontsize=12)
    plt.title('{}'.format(data.columns[index].upper()), fontsize=15)
    sns.lineplot(data=column_values, color=line_color, marker='o')
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# Non-numeric columns are filtered out explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops them silently and raises when the frame
# holds text columns (e.g. 'Summary', 'Precip Type').
data.select_dtypes(include='number').corr()

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Non-numeric columns are filtered out first: since pandas 2.0
# `DataFrame.corr()` raises on text columns instead of dropping them silently.
plt.figure(figsize=(10, 8))
sns.heatmap(data=data.select_dtypes(include='number').corr(),
            annot=True, cmap='Blues')
plt.show()

In [None]:
# Subset of columns used by the temperature/humidity scatter plots
indices = ['Temperature (C)', 'Humidity', 'Precip Type', 'Summary']
ex_data = data[indices]
ex_data

In [None]:
# Temperature against humidity, coloured by weather summary
plt.figure(figsize=(12, 12))
plt.title('temperature vs humidity')
sns.scatterplot(
    x=ex_data['Temperature (C)'],
    y=ex_data['Humidity'],
    hue=ex_data['Summary'],
    s=75,
    alpha=0.4,
)
plt.show()

In [None]:
# Temperature against humidity, coloured by precipitation type;
# the summary statistics of the subset are printed alongside the figure.
print(ex_data.describe())
plt.figure(figsize=(12, 6))
plt.title('temperature vs humidity')
sns.scatterplot(
    x=ex_data['Temperature (C)'],
    y=ex_data['Humidity'],
    hue=ex_data['Precip Type'],
    s=50,
    alpha=0.4,
)
plt.show()

In [None]:
# Year-wise scatter plot helper
def year_plot(year):
    """Scatter-plot one calendar year of the module-level `ex_data`.

    Rows whose index year equals `year` are selected. The first column is
    plotted on x, the second on y, and the third drives the hue. After the
    figure is shown, the subset's `describe()` table and a dashed separator
    line are printed.

    Parameters
    ----------
    year : int
        Value compared against `ex_data.index.year`.
    """
    in_year = ex_data.index.year == year
    yearly = ex_data[in_year]

    plt.figure(figsize=(8, 4))
    plt.title('Temperature vs Humidity for {}'.format(year))
    sns.scatterplot(
        x=yearly.iloc[:, 0],
        y=yearly.iloc[:, 1],
        hue=yearly.iloc[:, 2],
        s=50,
        alpha=0.3,
    )
    plt.show()

    print(yearly.describe())
    print('-' * 70)

In [None]:
# One scatter plot per calendar year, 2006 through 2016 inclusive
for year in range(2006,2017):
    year_plot(year)

**YEAR-WISE ANALYSIS**

In [None]:
# Resample selected columns to yearly means

# Positional column picks: they refer to the column order `data` has at this
# point in the notebook, so they shift if columns are added or dropped earlier.
# Note that `indices` previously held a list of column names; it is re-bound
# here to a list of integer positions.
indices = [2,4,6,7,8]
year_data = data.iloc[:,indices].resample('Y').mean()
year_data.head()

In [None]:
# Summary statistics of the yearly means
year_data.describe()

In [None]:
# Distribution of each yearly-mean column: density histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot(..., kde=True, stat='density', kde_kws=dict(cut=3))` is the
# replacement that mirrors its default output.
for _, yearly_values in year_data.items():
    sns.histplot(yearly_values, kde=True, stat='density', kde_kws=dict(cut=3))
    plt.show()

In [None]:
# Pairwise correlation between the yearly-mean columns
year_data.corr()

In [None]:
# Annotated heatmap of the yearly-mean correlation matrix
yearly_corr = year_data.corr()
plt.title('Heatmap', fontsize=15)
sns.heatmap(data=yearly_corr, annot=True, cmap='Reds')
plt.show()

In [None]:
# Pair plot of the yearly means; kind='reg' adds a regression fit to each
# off-diagonal panel.
sns.pairplot(year_data, kind='reg')
plt.show()

In [None]:
# Min-max scale every yearly column: (x - min) / (max - min).
# A column whose max equals its min divides by zero here.

normal_data = (year_data - year_data.min())/ (year_data.max() - year_data.min())
normal_data.head()

In [None]:
# Line chart of the min-max normalised yearly means
plt.figure(figsize=(12,6))
plt.xlabel('YEAR')
# The chart name belongs in the title (as in the box / violin / strip plot
# cells of this notebook), not in the y-axis label where it used to be set.
plt.title('LINE PLOT', fontsize=30)
sns.lineplot(data = normal_data, marker = 's')
plt.show()

In [None]:
# Box plot: one box per normalised yearly-mean column
plt.figure(figsize = (10,4))
plt.title('BOX PLOT', fontsize=30)
sns.boxplot(data = normal_data)
plt.show()

In [None]:
# Violin plot: one violin per normalised yearly-mean column
plt.figure(figsize = (10,4))
plt.title('VIOLIN PLOT', fontsize=30)
sns.violinplot(data=normal_data)
plt.show()

In [None]:
# Strip plot of the normalised yearly means; large, semi-transparent, jittered
# markers so overlapping points remain distinguishable.
plt.figure(figsize = (10,4))
plt.title('STRIP PLOT', fontsize=30)
sns.stripplot(data=normal_data, jitter= True, s =18, alpha = 0.3)
plt.show()

In [None]:
# Heatmap of the normalised yearly means: one row per resampled period
# (labelled with its year), one column per variable.
plt.figure(figsize=(6,8))
plt.title('HEATMAP', fontsize=25)
sns.heatmap(normal_data, annot=True, cmap='Blues',
            yticklabels=normal_data.index.year)
plt.show()

**Apparent Temperature vs Humidity**

In [None]:
# Show a single row as a reminder of the column order used by the positional
# lookups in the following cells
data.head(1)

In [None]:
# Apparent temperature against humidity, coloured by the column at position 1
# (precipitation type, per the original cell comment).
# The title now names the variable that is actually plotted: position 3 is the
# apparent-temperature column, which the later monthly cells also label
# 'Apparent Temperature'; the old title said only 'Temperature'.
plt.figure(figsize=(12,12))
plt.title('Apparent Temperature vs Humidity')
sns.scatterplot(x=data.iloc[:,3],
                y=data.iloc[:,4],
                hue=data.iloc[:,1], s=75, alpha = 0.3)
plt.show()

In [None]:
# Resample the columns at positions 3 and 4 (apparent temperature and humidity,
# going by how later cells use them) to monthly means
monthly_data = data.iloc[:,3:5].resample('M').mean()
monthly_data.head()

In [None]:
# Summary statistics of the monthly means
monthly_data.describe()

In [None]:
# Line plot with the first monthly column (apparent temperature) on x and the
# second (humidity) on y
plt.figure(figsize=(14,5))
plt.title('Apparent Temperature vs Humidity')
sns.lineplot(x=monthly_data.iloc[:,0],
             y=monthly_data.iloc[:,1],
             color='green')
plt.show()

In [None]:
# Regression plot: monthly humidity (y) fitted against monthly apparent
# temperature (x)
plt.figure(figsize=(10,5))
plt.title('Apparent Temperature vs Humidity')
sns.regplot(x=monthly_data.iloc[:,0],
            y=monthly_data.iloc[:,1])
plt.show()

In [None]:
# Distribution of the monthly means of apparent temperature and humidity:
# density histogram with a KDE overlay, one figure per variable.
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot(..., kde=True, stat='density', kde_kws=dict(cut=3))` is the
# replacement that mirrors its default output.
for column_name in ('Apparent Temperature (C)', 'Humidity'):
    sns.histplot(monthly_data[column_name], kde=True, stat='density',
                 kde_kws=dict(cut=3))
    plt.show()

In [None]:
# Both monthly series drawn against the date index on one set of axes
plt.figure(figsize=(10,5))
plt.xlabel('YEAR')
plt.title('Variation of Apparent Temperature and Humidity')
sns.lineplot(data=monthly_data)
plt.show()

In [None]:
# Print the correlation matrix of the monthly means, then show the matching
# scatter-style pair plot
print(monthly_data.corr())
sns.pairplot(monthly_data, kind = 'scatter')
plt.show()

In [None]:
# Inputs for the month-wise plots of Apparent Temperature & Humidity:
# the first and second columns of `monthly_data` as separate Series.
TEMP_DATA = monthly_data.iloc[:,0]
HUM_DATA = monthly_data.iloc[:,1]
def label_color(month):
    """Return the ``(month name, plot colour)`` pair for a month number.

    Values 1 through 11 map to January through November. Every other value
    (12 included) falls through to ``('December', 'pink')``.

    Parameters
    ----------
    month : int
        Month number; compared with ``==`` against 1..11 in order.

    Returns
    -------
    tuple of (str, str)
        Month name and the colour name used for that month's line.
    """
    january_to_november = (
        ('January', 'blue'),
        ('February', 'green'),
        ('March', 'orange'),
        ('April', 'yellow'),
        ('May', 'red'),
        ('June', 'violet'),
        ('July', 'purple'),
        ('August', 'black'),
        ('September', 'brown'),
        ('October', 'darkblue'),
        ('November', 'grey'),
    )
    for number, name_and_color in enumerate(january_to_november, start=1):
        if month == number:
            return name_and_color
    # Anything that is not 1..11 is treated as December.
    return 'December', 'pink'

def plot_month(month, data):
    """Draw one month's values from `data` as a labelled line on the current axes.

    Parameters
    ----------
    month : int
        Month number; passed to `label_color` for the legend label and line
        colour, and matched against `data.index.month` to select the rows.
    data : pandas object whose index exposes `.month`
        Values to filter and plot. (The parameter shadows the notebook-level
        `data` frame inside this function.)
    """
    month_name, line_color = label_color(month)
    is_month = data.index.month == month
    sns.lineplot(
        data=data[is_month],
        label=month_name,
        color=line_color,
        marker='o',
    )
    
def sns_plot(title, data):
    """Overlay the twelve per-month lines of `data` in a single figure.

    Parameters
    ----------
    title : str
        Figure title.
    data : pandas object whose index exposes `.month`
        Forwarded unchanged to `plot_month` for each month number 1..12.
    """
    plt.figure(figsize=(14, 8))
    plt.title(title)
    plt.xlabel('YEAR')
    for month_number in range(1, 13):
        plot_month(month_number, data)
    plt.show()

In [None]:
# Month-wise lines of apparent temperature: one line per calendar month,
# drawn across every year in the data
title = 'Plot of Apparent Temperature - Month-wise'
sns_plot(title, TEMP_DATA)

In [None]:
# Month-wise lines of humidity: one line per calendar month,
# drawn across every year in the data
title = 'Plot of Humidity - Month-wise'
sns_plot(title, HUM_DATA)

In [None]:
# Per-month view of Apparent Temperature & Humidity (raw monthly means)
def sns_month_plot(month):
    """Plot and summarise one calendar month of the module-level `monthly_data`.

    Shows a line plot of the rows whose index month equals `month`, prints
    their `describe()` table, shows a regression pair plot, and prints a
    dashed separator.

    NOTE(review): a later cell defines `sns_month_plot` again for
    `normed_data`; once that cell has run, the name refers to that version.

    Parameters
    ----------
    month : int
        Month number; also used to look up the month name via `label_color`.
    """
    month_name, _ = label_color(month)
    month_rows = monthly_data[monthly_data.index.month == month]

    plt.figure(figsize=(10, 5))
    plt.title('Apparent Temperature & Humidity - {}'.format(month_name))
    plt.xlabel('YEAR')
    sns.lineplot(data=month_rows, marker='o')
    plt.show()

    print(month_rows.describe())

    sns.pairplot(month_rows, kind='reg')
    plt.show()
    print('-' * 80)

In [None]:
# Run the per-month plots for January (1) through December (12)
for month in range(1,13):
    sns_month_plot(month)

**ANALYSIS ON NORMALIZED DATA**

In [None]:
# Min-max scale both monthly columns: (x - min) / (max - min).
# `temp` is only another name for `monthly_data` (no copy is made).
temp = monthly_data
normed_data = (temp - temp.min()) / (temp.max() - temp.min())
normed_data.head()

In [None]:
# Line chart of the normalised monthly series against the date index
plt.figure(figsize=(12,6))
plt.title('Apparent Temperature & Humidity - Normalized')
plt.xlabel('YEAR', fontsize=12)
sns.lineplot(data=normed_data)
plt.show()

In [None]:
# Pair plot of the normalised monthly columns with a regression fit per panel
sns.pairplot(normed_data, kind='reg')
plt.show()

In [None]:
# Violin plot: one violin per normalised monthly column
plt.figure(figsize=(14,5))
sns.violinplot(data=normed_data)
plt.show()

In [None]:
# Box plot: one box per normalised monthly column
sns.boxplot(data=normed_data)
plt.show()

In [None]:
def sns_month_plot(month):
    """Plot one calendar month of the module-level `normed_data`.

    Shows, in order: a line plot of the rows whose index month equals `month`,
    a regression pair plot, and a titled box plot; then prints a dashed
    separator.

    NOTE(review): this re-uses the name of the earlier `sns_month_plot`
    (which works on `monthly_data`) and replaces it from this cell onwards.

    Parameters
    ----------
    month : int
        Month number; also used to look up the month name via `label_color`.
    """
    month_name, _ = label_color(month)
    month_rows = normed_data[normed_data.index.month == month]

    plt.figure(figsize=(10, 5))
    plt.title('Apparent Temperature & Humidity - {}'.format(month_name))
    plt.xlabel('YEAR')
    sns.lineplot(data=month_rows, marker='o')
    plt.show()

    sns.pairplot(month_rows, kind='reg')
    plt.show()

    plt.title('Boxplot - {}'.format(month_name))
    sns.boxplot(data=month_rows)
    plt.show()
    print('-' * 80)

In [None]:
# Run the per-month plots on the normalised data, January (1) through December (12)
for month in range(1,13):
    sns_month_plot(month)