# Task for Today  

***

## Seattle Rain Prediction  

Given *data about weather in Seattle*, let's try to predict how much it will **rain** on a given day.  
  
We will use a variety of regression models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score

In [None]:
# Load the Seattle weather CSV (the filename indicates coverage of 1948-2017) into a DataFrame.
data = pd.read_csv('../input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv')

In [None]:
# Bare expression: in a notebook cell this renders the loaded DataFrame as output.
data

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
data.info()

# Preprocessing

In [None]:
# Count missing values per column to decide how to handle them in preprocessing.
data.isna().sum()

In [None]:
# Show the raw, still-unprocessed DataFrame once more for reference.
data

In [None]:
def preprocess_inputs(df):
    """Prepare the weather data for regression on the ``PRCP`` column.

    The input frame is copied, so the caller's data is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``DATE``, ``PRCP`` and ``RAIN`` columns.
        Every column other than ``DATE`` must be numeric (or castable) so it
        can be standardized.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature matrices (70% / 30% split). They keep the row
        labels produced by the split so they stay aligned with the targets.
    y_train, y_test : pandas.Series
        The matching ``PRCP`` targets.
    """
    df = df.copy()

    # Drop missing rows
    df = df.dropna(axis=0).reset_index(drop=True)

    # Convert RAIN column to numeric. The builtin `int` is used because the
    # `np.int` alias was removed in NumPy 1.24 and raises AttributeError there.
    df['RAIN'] = df['RAIN'].astype(int)

    # Extract date features with the vectorized .dt accessor
    df['DATE'] = pd.to_datetime(df['DATE'])

    df['YEAR'] = df['DATE'].dt.year
    df['MONTH'] = df['DATE'].dt.month
    df['DAY'] = df['DATE'].dt.day

    df = df.drop('DATE', axis=1)

    # Split df into X and y
    y = df['PRCP']
    X = df.drop('PRCP', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X: fit on the training split only so that no test-set statistics
    # leak into the transformation.
    scaler = StandardScaler()
    scaler.fit(X_train)

    # scaler.transform returns a bare array; re-attach the split's index so
    # that X and y still share row labels. Without it, label-aligned
    # operations such as pd.concat([X_train, y_train], axis=1) pair each
    # feature row with the wrong target.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test splits used for the PRCP regression models.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the standardized training features.
X_train

In [None]:
# Inspect the training targets (PRCP values).
y_train

# Predicting the PRCP column

In [None]:
# Candidate regressors, all with default hyperparameters. The labels are
# left-padded to a common width so the printed report lines up in a column.
models = dict([
    ("                     Linear Regression", LinearRegression()),
    ("      L2 Regularized Linear Regression", Ridge()),
    ("                   K-Nearest Neighbors", KNeighborsRegressor()),
    ("                         Decision Tree", DecisionTreeRegressor()),
    ("Support Vector Machine (Linear Kernel)", LinearSVR()),
    ("   Support Vector Machine (RBF Kernel)", SVR()),
    ("                        Neural Network", MLPRegressor()),
    ("                         Random Forest", RandomForestRegressor()),
    ("                     Gradient Boosting", GradientBoostingRegressor()),
    ("                               XGBoost", XGBRegressor()),
    ("                              LightGBM", LGBMRegressor()),
    # verbose=0 silences CatBoost's per-iteration training log
    ("                              CatBoost", CatBoostRegressor(verbose=0)),
])

# Fit every model on the same training split, reporting progress as we go.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"{label} trained.")

In [None]:
# Score each fitted model on the held-out test split and print it as R^2.
for label, estimator in models.items():
    r_squared = estimator.score(X_test, y_test)
    print(f"{label} R^2: {r_squared:.5f}")

# Examining Correlations

In [None]:
# Combine training features and target into one frame for the correlation
# matrix. pd.concat with axis=1 aligns rows by index label, not by position,
# so both pieces get a fresh positional index first. Otherwise features and
# targets whose labels differ would be paired with the wrong rows (and padded
# with NaN), distorting every correlation involving PRCP.
train_frame = pd.concat(
    [X_train.reset_index(drop=True), y_train.reset_index(drop=True)],
    axis=1,
)
corr = train_frame.corr()

# Annotated heatmap; vmin=-1.0 anchors the colour scale at the lowest possible correlation.
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, vmin=-1.0, cmap='mako')
plt.show()

# Predicting the RAIN column

In [None]:
def preprocess_inputs_clf(df):
    """Prepare the weather data for classifying the ``RAIN`` column.

    The input frame is copied, so the caller's data is never modified.
    Features are left unscaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``DATE``, ``TMAX``, ``TMIN`` and
        ``RAIN`` columns.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames (70% / 30% split) holding whatever columns remain
        after ``DATE``, ``TMAX``, ``TMIN`` and ``RAIN`` are removed.
    y_train, y_test : pandas.Series
        The matching integer ``RAIN`` labels.
    """
    df = df.copy()

    # Drop missing rows
    df = df.dropna(axis=0).reset_index(drop=True)

    # Convert RAIN column to numeric. The builtin `int` is used because the
    # `np.int` alias was removed in NumPy 1.24 and raises AttributeError there.
    df['RAIN'] = df['RAIN'].astype(int)

    # Drop the date and temperature features (intended to leave only PRCP as a predictor)
    df = df.drop(['DATE', 'TMAX', 'TMIN'], axis=1)

    # Split df into X and y
    y = df['RAIN']
    X = df.drop('RAIN', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    return X_train, X_test, y_train, y_test

In [None]:
# Rebuild the splits for the RAIN classification task; this rebinds the same four names used for regression.
X_train, X_test, y_train, y_test = preprocess_inputs_clf(data)

In [None]:
# Inspect the (unscaled) training features for the classifier.
X_train

In [None]:
# Inspect the training labels (RAIN as integers).
y_train

In [None]:
# Train a support vector classifier with default settings (fit returns the
# estimator itself), then report its accuracy on the held-out test split.
clf = SVC().fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Classifying using a simple function

In [None]:
# Inspect the test features that the rule-based baseline below is applied to.
X_test

In [None]:
# Rule-based baseline: predict rain (1) wherever the feature value is positive,
# otherwise 0. Assumes X_test holds a single column (PRCP), so squeezing the
# result yields a 1-D label vector.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24 and
# raises AttributeError there.
y_pred = X_test > 0
y_pred = np.squeeze(np.array(y_pred, dtype=int))

print("Test Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/H1i7d3XMzrY