# Rain Prediction using Logistic Regression with Scikit Learn 


The data is imported from [Kaggle](https://kaggle.com/datasets).





### Downloading the Data


In [None]:
!pip install opendatasets --upgrade --quiet

In [None]:
import opendatasets as od

In [None]:
od.version()

In [None]:
dataset_url = 'https://www.kaggle.com/jsphyg/weather-dataset-rattle-package'

In [None]:
od.download(dataset_url)

In [None]:
import os

In [None]:
data_dir = './weather-dataset-rattle-package'

In [None]:
os.listdir(data_dir)

In [None]:
train_csv = data_dir + '/weatherAUS.csv'


#### Loading the data from `weatherAUS.csv` using Pandas.

In [None]:
!pip install pandas --quiet

In [None]:
import pandas as pd

In [None]:
raw_df = pd.read_csv(train_csv)

In [None]:
raw_df

The dataset contains over 145,000 rows and 23 columns. The dataset contains date, numeric and categorical columns. The objective is to create a model to predict the value in the column `RainTomorrow`.



In [None]:
raw_df.info()

In [None]:
# Discard rows where either 'RainToday' or 'RainTomorrow' is missing.
# inplace=True mutates raw_df directly instead of returning a new frame.
raw_df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)

In [None]:
raw_df.info()

In [None]:
!pip install plotly matplotlib seaborn --quiet

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Global styling applied to the seaborn/matplotlib figures drawn below.
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
# 8-digit hex colour whose alpha byte is 00 — presumably a transparent figure background.
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
px.histogram(raw_df, x='Location', title='Location vs. Rainy Days', color='RainToday')

In [None]:
px.histogram(raw_df, 
             x='Temp3pm', 
             title='Temperature at 3 pm vs. Rain Tomorrow', 
             color='RainTomorrow')

In [None]:
px.histogram(raw_df, 
             x='RainTomorrow', 
             color='RainToday', 
             title='Rain Tomorrow vs. Rain Today')

In [None]:
px.scatter(raw_df.sample(2000), 
           title='Min Temp. vs Max Temp.',
           x='MinTemp', 
           y='MaxTemp', 
           color='RainToday')

In [None]:
# Both axes are continuous, so draw a scatter plot of a random 2,000-row
# sample (a strip plot is meant for a categorical axis and jitters points).
px.scatter(raw_df.sample(2000),
           title='Temp (3 pm) vs. Humidity (3 pm)',
           x='Temp3pm',
           y='Humidity3pm',
           color='RainTomorrow')

In [None]:
raw_df.columns

In [None]:
raw_df

In [None]:
!pip install scikit-learn --upgrade --quiet

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of all rows as the test set (fixed seed for reproducibility) ...
train_val_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
# ... then take 25% of the remaining 80% (20% of the total) for validation,
# which leaves 60% of the rows for training.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

However, while working with dates, it's often a better idea to separate the training, validation and test sets with time, so that the model is trained on data from the past and evaluated on data from the future.

For the current dataset, we can use the Date column in the dataset to create another column for year. We'll pick the last two years for the test set, and one year before it for the validation set.

In [None]:
plt.title('No. of Rows per Year')
sns.countplot(x=pd.to_datetime(raw_df.Date).dt.year);

In [None]:
# Chronological split: train on years before 2015, validate on 2015,
# test on everything after 2015.
year = pd.to_datetime(raw_df['Date']).dt.year

is_train = year < 2015
is_val = year == 2015
is_test = year > 2015

train_df = raw_df.loc[is_train]
val_df = raw_df.loc[is_val]
test_df = raw_df.loc[is_test]

In [None]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

While not a perfect 60-20-20 split, we have ensured that the validation and test sets both contain data for all 12 months of the year.

In [None]:
train_df

In [None]:
val_df

In [None]:
test_df

In [None]:
# The label to predict, and the feature columns: every column except the
# first and the last one of the frame (presumably 'Date' and the target).
target_col = 'RainTomorrow'
input_cols = train_df.columns[1:-1].tolist()

In [None]:
print(input_cols)

In [None]:
target_col

### Creating inputs and targets for the training, validation and test sets for further processing and model training

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [None]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [None]:
train_inputs

In [None]:
train_targets

### Numerical and categorical columns

In [None]:
!pip install numpy --quiet

In [None]:
import numpy as np

In [None]:
# Partition the feature columns by dtype. Every numeric column is kept so
# that none is silently left out of imputation, scaling and model training.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
print(numeric_cols)
print(categorical_cols)

In [None]:
train_inputs[numeric_cols].describe()

In [None]:
train_inputs[categorical_cols].nunique()

### Imputing Missing Numeric Data


In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy = 'mean')

In [None]:
raw_df[numeric_cols].isna().sum()

In [None]:
train_inputs[numeric_cols].isna().sum()

In [None]:
# Learn the column means from the training split only, so that statistics
# of the validation/test rows do not leak into the preprocessing.
imputer.fit(train_inputs[numeric_cols])

In [None]:
list(imputer.statistics_)

### The missing values in the training, test and validation sets can now be filled in 

In [None]:
# Fill the missing numeric values of each split with the statistics held
# by the already-fitted imputer (the same values are used for all splits).
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols].isna().sum()

### Scaling Numeric Features


In [None]:
raw_df[numeric_cols].describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
?MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split only, to avoid leaking
# validation/test information. Values in the other splits may therefore
# fall slightly outside the [0, 1] range after scaling.
scaler.fit(train_inputs[numeric_cols])

In [None]:
print('Minimum:')
list(scaler.data_min_)

In [None]:
print('Maximum:')
list(scaler.data_max_)

In [None]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols].describe()

### Encoding Categorical Data


In [None]:
raw_df[categorical_cols].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
?OneHotEncoder

In [None]:
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and
# later releases removed the old name; try the current keyword first.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only understands the original keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Learn the set of categories of each categorical column from the full dataset.
# NOTE(review): this fit also sees validation/test rows — confirm that is intended.
encoder.fit(raw_df[categorical_cols])

In [None]:
encoder.categories_

In [None]:
# Names of the generated one-hot columns. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`; fall back to the old
# method only on versions that do not have the new one.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
else:
    encoded_cols = list(encoder.get_feature_names(categorical_cols))
print(encoded_cols)

In [None]:
# Add the one-hot columns to each split. The original categorical columns
# are not dropped here; only the new `encoded_cols` columns are assigned.
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
test_inputs

In [None]:
print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)

In [None]:
!pip install pyarrow --quiet

In [None]:
train_inputs.to_parquet('train_inputs.parquet')
val_inputs.to_parquet('val_inputs.parquet')
test_inputs.to_parquet('test_inputs.parquet')

In [None]:
%%time
pd.DataFrame(train_targets).to_parquet('train_targets.parquet')
pd.DataFrame(val_targets).to_parquet('val_targets.parquet')
pd.DataFrame(test_targets).to_parquet('test_targets.parquet')

In [None]:
%%time

# Reload the preprocessed feature frames from the parquet files.
train_inputs = pd.read_parquet('train_inputs.parquet')
val_inputs = pd.read_parquet('val_inputs.parquet')
test_inputs = pd.read_parquet('test_inputs.parquet')

# The target files are read back as DataFrames, so the target column is
# selected to recover the values for each split.
train_targets = pd.read_parquet('train_targets.parquet')[target_col]
val_targets = pd.read_parquet('val_targets.parquet')[target_col]
test_targets = pd.read_parquet('test_targets.parquet')[target_col]

In [None]:
print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)

In [None]:
val_inputs

In [None]:
val_targets

### Training a Logistic Regression Model



In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
?LogisticRegression

In [None]:
model = LogisticRegression(solver='liblinear')

In [None]:
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)

Let's check the weights and biases of the trained model.

In [None]:
print(numeric_cols + encoded_cols)

In [None]:
print(model.coef_.tolist())

In [None]:
print(model.intercept_)

### Predictions and Evaluating the Model



In [None]:
# Feature matrices fed to the model: the numeric columns followed by the
# one-hot encoded columns, selected identically for every split.
feature_cols = numeric_cols + encoded_cols
X_train, X_val, X_test = (
    split[feature_cols] for split in (train_inputs, val_inputs, test_inputs)
)

In [None]:
train_preds = model.predict(X_train)

In [None]:
train_preds

In [None]:
train_targets

In [None]:
train_probs = model.predict_proba(X_train)
train_probs

In [None]:
model.classes_

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(train_targets, train_preds)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(train_targets, train_preds, normalize='true')

In [None]:
def predict_and_plot(inputs, targets, name=''):
    """Predict with the global ``model``, report accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix passed to ``model.predict``.
        targets: True labels corresponding to the rows of ``inputs``.
        name: Text placed in front of "Confusion Matrix" in the plot title.

    Returns:
        The predictions returned by ``model.predict(inputs)``.
    """
    predictions = model.predict(inputs)

    score = accuracy_score(targets, predictions)
    print("Accuracy: {:.2f}%".format(score * 100))

    # normalize='true' divides each row by the number of samples of that
    # true class, so the cells are rates rather than raw counts.
    matrix = confusion_matrix(targets, predictions, normalize='true')

    plt.figure()
    sns.heatmap(matrix, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Training')

In [None]:
# Evaluate on the validation split; the third argument is the plot-title prefix.
val_preds = predict_and_plot(X_val, val_targets, 'Validation')

In [None]:
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
def random_guess(inputs):
    """Baseline predictor: draw one random "No"/"Yes" label per row of ``inputs``.

    Only ``len(inputs)`` is used; the contents of ``inputs`` are ignored.
    """
    labels = ["No", "Yes"]
    return np.random.choice(labels, size=len(inputs))

In [None]:
def all_no(inputs):
    """Baseline predictor: return "No" for every row of ``inputs``.

    Only ``len(inputs)`` is used; the contents of ``inputs`` are ignored.
    """
    n_rows = len(inputs)
    return np.full(shape=n_rows, fill_value="No")

In [None]:
accuracy_score(test_targets, random_guess(X_test))

In [None]:
accuracy_score(test_targets, all_no(X_test))

### Making Predictions 



In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
new_input_df = pd.DataFrame([new_input])

In [None]:
new_input_df

In [None]:
# Run the new row through the same fitted preprocessing objects, in the same
# order used for the dataset splits: impute, then scale, then one-hot encode.
new_input_df[numeric_cols] = imputer.transform(new_input_df[numeric_cols])
new_input_df[numeric_cols] = scaler.transform(new_input_df[numeric_cols])
new_input_df[encoded_cols] = encoder.transform(new_input_df[categorical_cols])

In [None]:
X_new_input = new_input_df[numeric_cols + encoded_cols]
X_new_input

In [None]:
prediction = model.predict(X_new_input)[0]

In [None]:
prediction

In [None]:
prob = model.predict_proba(X_new_input)[0]

In [None]:
prob

In [None]:
def predict_input(single_input):
    """Predict the target for a single raw observation.

    Args:
        single_input: Mapping of column name to raw value for one row; it is
            wrapped in a one-row DataFrame before preprocessing.

    Returns:
        A ``(label, probability)`` tuple: the predicted class and the model's
        probability for that class.
    """
    frame = pd.DataFrame([single_input])

    # Preprocess with the fitted global objects: impute, scale, one-hot encode.
    frame[numeric_cols] = imputer.transform(frame[numeric_cols])
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])
    frame[encoded_cols] = encoder.transform(frame[categorical_cols])

    features = frame[numeric_cols + encoded_cols]
    label = model.predict(features)[0]

    # predict_proba columns follow model.classes_, so look up the column
    # that belongs to the predicted label.
    class_index = list(model.classes_).index(label)
    probability = model.predict_proba(features)[0][class_index]
    return label, probability

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
predict_input(new_input)

In [None]:
raw_df.Location.unique()

### Saving and Loading Trained Models


In [None]:
import joblib

In [None]:
# Bundle the trained model with the fitted preprocessing objects and the
# column lists they were built with, so everything is saved and loaded together.
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

In [None]:
joblib.dump(aussie_rain, 'aussie_rain.joblib')

In [None]:
aussie_rain2 = joblib.load('aussie_rain.joblib')

In [None]:
test_preds2 = aussie_rain2['model'].predict(X_test)
accuracy_score(test_targets, test_preds2)