In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier

In [None]:
# Location of the Seattle daily weather CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv'

# Read the records, asking pandas to parse the DATE column as dates, and preview two rows.
df = pd.read_csv(DATA_PATH, parse_dates=['DATE'])
df.head(n=2)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts) to spot missing data.
df.info()

In [None]:
# Keep a categorical copy of the RAIN flag; it is used as the hue of the
# rainy-vs-dry countplot and is dropped again before modelling.
df['rain_cat'] = df['RAIN'].astype('category')

## EDA

In [None]:
# Correlate only the numeric/boolean columns. The frame also holds DATE
# (datetime) and 'rain_cat' (categorical); recent pandas versions no longer
# drop such columns silently, so calling df.corr() on the whole frame raises.
numeric_cols = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr(), cmap='coolwarm');

In [None]:
# Summarise the daily maximum temperature (TMAX) and plot it over time.
# The extremes and the dates on which they occurred are derived from the data,
# so the printed summary, the reference lines and the annotations always agree.
tmax_mean = df.TMAX.mean()
tmax_high = df.TMAX.max()
tmax_low = df.TMAX.min()
# idxmax / idxmin return the row label of the first occurrence of each extreme.
tmax_high_date = df.loc[df.TMAX.idxmax(), 'DATE']
tmax_low_date = df.loc[df.TMAX.idxmin(), 'DATE']

print('Average maximum temperature is: %.1f F' % tmax_mean)
print('Maximum temp measured is: %.1f F' % tmax_high)
print('Minimum temp measured is: %.1f F' % tmax_low)

fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=df, x='DATE', y='TMAX', color='lightblue', ax=ax)
ax.set_ylabel('Temperature - degrees Fahrenheit', fontfamily='monospace', fontsize='large')
ax.set_xlabel('Years', fontfamily='monospace', fontsize='large')

# Dashed line marks the mean, solid lines mark the recorded extremes.
ax.axhline(y=tmax_mean, color='salmon', linestyle='--')
ax.axhline(y=tmax_high, color='salmon', linestyle='-')
ax.axhline(y=tmax_low, color='salmon', linestyle='-')

# Each annotation sits one degree above its reference line; the x positions are
# fixed dates chosen only for legibility.
txt_max = 'Maximum temp measured on: ' + tmax_high_date.strftime('%d-%m-%Y')
ax.text(pd.Timestamp('2000-01-01'),
        tmax_high + 1,
        txt_max,
        c='red',
        fontfamily='monospace')
txt_min = 'Minimum temp measured on: ' + tmax_low_date.strftime('%d-%m-%Y')
ax.text(pd.Timestamp('1952-01-01'),
        tmax_low + 1,
        txt_min,
        c='blue',
        fontfamily='monospace');

In [None]:
# Summarise the daily minimum temperature (TMIN) and plot it over time.
# The extremes and the dates on which they occurred are derived from the data
# rather than typed in by hand, so the annotations cannot drift from the values.
tmin_mean = df.TMIN.mean()
tmin_high = df.TMIN.max()
tmin_low = df.TMIN.min()
# idxmax / idxmin return the row label of the first occurrence of each extreme.
tmin_high_date = df.loc[df.TMIN.idxmax(), 'DATE']
tmin_low_date = df.loc[df.TMIN.idxmin(), 'DATE']

print('Average minimum temperature is: %.1f F' % tmin_mean)
print('Maximum temp measured is: %.1f F' % tmin_high)
print('Minimum temp measured is: %.1f F' % tmin_low)

fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=df, x='DATE', y='TMIN', color='lightblue', ax=ax)
ax.set_ylabel('Temperature - degrees Fahrenheit', fontfamily='monospace', fontsize='large')
ax.set_xlabel('Years', fontfamily='monospace', fontsize='large')

# Dashed line marks the mean, solid lines mark the recorded extremes.
ax.axhline(y=tmin_mean, color='salmon', linestyle='--')
ax.axhline(y=tmin_high, color='salmon', linestyle='-')
ax.axhline(y=tmin_low, color='salmon', linestyle='-')

# Each annotation sits one degree above its reference line; the x positions are
# fixed dates chosen only for legibility.
txt_max = 'Maximum temp measured on: ' + tmin_high_date.strftime('%d-%m-%Y')
ax.text(pd.Timestamp('2000-01-01'),
        tmin_high + 1,
        txt_max,
        c='red',
        fontfamily='monospace')
txt_min = 'Minimum temp measured on: ' + tmin_low_date.strftime('%d-%m-%Y')
ax.text(pd.Timestamp('1952-01-01'),
        tmin_low + 1,
        txt_min,
        c='blue',
        fontfamily='monospace');

In [None]:
# Split DATE into its calendar components; they are used as plot axes below
# and remain in the frame as model features.
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['DATE'].dt, part)

In [None]:
# Bar chart of precipitation (PRCP) grouped by calendar month.
label_font = dict(fontsize=14, fontfamily='monospace')
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=df['month'], y=df['PRCP'], ax=ax)
ax.set_xlabel('Month', **label_font)
ax.set_ylabel('Precipitation in inches', **label_font)
ax.set_title('Average amount of rain per month', fontsize=18, fontfamily='monospace');

In [None]:
# Bar chart of precipitation (PRCP) grouped by year.
label_font = dict(fontsize=14, fontfamily='monospace')
fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=df['year'], y=df['PRCP'], ax=ax)
ax.set_xlabel('Year', **label_font)
ax.set_ylabel('Precipitation in inches', **label_font)
ax.set_title('Average precipitation through the years', fontsize=20, fontfamily='monospace')
# Tilt the tick labels so the many year labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=75);

In [None]:
# Yearly trend of the daily minimum and maximum temperatures on one axes.
# The theme is applied before the figure is created so the axes pick it up.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
# Label each series so the legend seaborn generates identifies the two lines.
sns.lineplot(x=df.year, y=df.TMIN, label='Daily minimum (TMIN)', ax=ax)
sns.lineplot(x=df.year, y=df.TMAX, label='Daily maximum (TMAX)', ax=ax)
ax.set_xlabel('Years', fontsize=14, fontfamily='monospace')
ax.set_ylabel('Temperature - degrees Fahrenheit', fontsize=14, fontfamily='monospace')
ax.set_title('Maximum and Minimum Temp through the years', fontsize=20, fontfamily='monospace');

In [None]:
# Count of rainy vs dry days per year.
# The legend text is attached to the data (renamed categories) instead of being
# passed positionally to plt.legend(): a labels-only legend call pairs the
# labels with the first artists on the axes, which are individual bars rather
# than the hue levels, so colours and labels can end up mismatched.
day_type = (
    df['rain_cat']
    .cat.rename_categories({False: 'Dry day', True: 'Rainy day'})
    .rename('Day type')  # the Series name becomes the legend title
)
fig, ax = plt.subplots(figsize=(18, 8))
sns.countplot(x=df['year'], hue=day_type, ax=ax)
ax.set_xlabel('Year', fontsize=14, fontfamily='monospace')
ax.set_ylabel('Counts', fontsize=14, fontfamily='monospace')
ax.set_title('Amount of Rainy vs Dry days per year', fontsize=20, fontfamily='monospace')
# Tilt the tick labels so the many year labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=75);

## Model

In [None]:
# Remove every row that has a missing value in any column. This mutates df in
# place, so all later cells operate on the filtered frame.
df.dropna(inplace=True)
# Cell output: remaining missing values per column, as a check on the drop.
df.isna().sum()

**Converting the RAIN column to 1 and 0**

In [None]:
# RAIN holds True/False flags; one integer cast maps them to 1/0. This avoids
# writing ints into the column through masked assignments before the cast,
# and is safe to re-run because 1/0 cast to themselves.
df['RAIN'] = df['RAIN'].astype('int')

**X & y and train test split**

In [None]:
# Features are every column left after removing the target (RAIN), the raw
# DATE and 'rain_cat', which is a categorical copy of the target.
# NOTE(review): PRCP (precipitation amount) stays in X and very likely
# determines RAIN directly, i.e. probable target leakage. Confirm, and if it is
# dropped also update the manual prediction at the end of the notebook, which
# passes a PRCP value.
X = df.drop(columns=['RAIN', 'DATE', 'rain_cat'])
y = df['RAIN']

# Stratify on the target so the rainy/dry ratio is the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Train a CatBoost classifier with logging silenced, then score it on the
# held-out split and print the per-class report.
cat = CatBoostClassifier(silent=True)
cat.fit(X_train, y=y_train)
y_preds = cat.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

**Predict whether it rained on a real day: 8.10.2019 (a rainy day)**

In [None]:
# Describe the day by feature name instead of a bare positional list, then
# select the columns in the order the model was trained on. A feature missing
# from the sample raises a KeyError instead of silently shifting values.
# NOTE(review): the sample includes that day's PRCP, which presumably already
# tells whether it rained. Confirm this is intended.
observation = pd.DataFrame(
    [{'PRCP': 0.10, 'TMAX': 50, 'TMIN': 41, 'year': 2019, 'month': 10, 'day': 8}]
)
# predict() on a one-row frame returns a length-1 array; unwrap the single label.
y_pred = cat.predict(observation[list(X.columns)])[0]
y_pred